From f7bbaba2b4907fc00e9d1f1a54a28acdf209fb38 Mon Sep 17 00:00:00 2001 From: AwesomeAdam54321 Date: Sat, 9 Mar 2024 11:17:42 +0800 Subject: [PATCH] foundation-module: Chapter 4: Nowebify. --- .../Chapter_4/{C Strings.w => C_Strings.nw} | 49 +-- .../Chapter_4/{Characters.w => Characters.nw} | 30 +- .../Chapter_4/{JSON.w => JSON.nw} | 281 +++++++++--------- ...Pattern Matching.w => Pattern_Matching.nw} | 182 ++++++------ .../{Preprocessor.w => Preprocessor.nw} | 239 +++++++-------- ... Manipulation.w => String_Manipulation.nw} | 123 ++++---- .../Chapter_4/{Text Files.w => Text_Files.nw} | 106 +++---- ...ies and Avinues.w => Tries_and_Avinues.nw} | 110 +++---- .../{Wide Strings.w => Wide_Strings.nw} | 10 +- 9 files changed, 570 insertions(+), 560 deletions(-) rename foundation-module/Chapter_4/{C Strings.w => C_Strings.nw} (81%) rename foundation-module/Chapter_4/{Characters.w => Characters.nw} (91%) rename foundation-module/Chapter_4/{JSON.w => JSON.nw} (88%) rename foundation-module/Chapter_4/{Pattern Matching.w => Pattern_Matching.nw} (75%) rename foundation-module/Chapter_4/{Preprocessor.w => Preprocessor.nw} (82%) rename foundation-module/Chapter_4/{String Manipulation.w => String_Manipulation.nw} (91%) rename foundation-module/Chapter_4/{Text Files.w => Text_Files.nw} (76%) rename foundation-module/Chapter_4/{Tries and Avinues.w => Tries_and_Avinues.nw} (75%) rename foundation-module/Chapter_4/{Wide Strings.w => Wide_Strings.nw} (77%) diff --git a/foundation-module/Chapter_4/C Strings.w b/foundation-module/Chapter_4/C_Strings.nw similarity index 81% rename from foundation-module/Chapter_4/C Strings.w rename to foundation-module/Chapter_4/C_Strings.nw index 350a6ac..6db0d8f 100644 --- a/foundation-module/Chapter_4/C Strings.w +++ b/foundation-module/Chapter_4/C_Strings.nw @@ -2,8 +2,8 @@ A minimal library for handling C-style strings. -@ Programs using Foundation store text in |text_stream| structures almost all -of the time, but old-style, null-terminated |char *| array strings are +@ Programs using Foundation store text in [[text_stream]] structures almost all +of the time, but old-style, null-terminated [[char *]] array strings are still occasionally needed. We need to handle C strings long enough to contain any plausible filename, and @@ -11,17 +11,18 @@ any run of a dozen or so lines of code; but we have no real need to handle strings of unlimited length, nor to be parsimonious with memory. The following defines a type for a string long enough for our purposes. -It should be at least as long as the constant sometimes called |PATH_MAX|, +It should be at least as long as the constant sometimes called [[PATH_MAX]], the maximum length of a pathname, which is 1024 on Mac OS X. -@d MAX_STRING_LENGTH 8*1024 +<<*>>= +#define MAX_STRING_LENGTH 8*1024 -= +<<*>>= typedef char string[MAX_STRING_LENGTH+1]; @ Occasionally we need access to the real, unbounded strlen: -= +<<*>>= int CStrings::strlen_unbounded(const char *p) { return (int) strlen(p); } @@ -31,16 +32,16 @@ an attempt to continue execution after a string overflow might conceivably result in a malformatted shell command being passed to the operating system, which we cannot risk. -= +<<*>>= int CStrings::check_len(int n) { - if ((n > MAX_STRING_LENGTH) || (n < 0)) Errors::fatal("String overflow\n"); + if ((n > MAX_STRING_LENGTH) [[]] (n < 0)) Errors::fatal("String overflow\n"); return n; } @ The following is then protected from reading out of range if given a non-terminated string, though this should never actually happen. 
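The same guard, reduced to a free-standing sketch in plain C (the names here are invented, and DEMO_MAX stands in for MAX_STRING_LENGTH; the real version below additionally reports a fatal error on overflow):

    #include <stdio.h>

    #define DEMO_MAX 8  /* stand-in for MAX_STRING_LENGTH */

    /* Length of str, but never reading more than DEMO_MAX+1 bytes. */
    int demo_bounded_len(const char *str) {
        for (int i = 0; i <= DEMO_MAX; i++)
            if (str[i] == 0) return i;
        return DEMO_MAX; /* not terminated within bounds: clamp to the maximum */
    }

    int main(void) {
        char ok[] = "fish";
        char long_text[12] = "xxxxxxxxxxx"; /* terminator lies beyond DEMO_MAX */
        printf("%d %d\n", demo_bounded_len(ok), demo_bounded_len(long_text));
        return 0; /* prints "4 8" */
    }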
-= +<<*>>= int CStrings::len(char *str) { for (int i=0; i<=MAX_STRING_LENGTH; i++) if (str[i] == 0) return i; @@ -48,10 +49,10 @@ int CStrings::len(char *str) { return MAX_STRING_LENGTH; } -@ We then have a replacement for |strcpy|, identical except that it's +@ We then have a replacement for [[strcpy]], identical except that it's bounds-checked: -= +<<*>>= void CStrings::copy(char *to, char *from) { CStrings::check_len(CStrings::len(from)); int i; @@ -59,9 +60,9 @@ void CStrings::copy(char *to, char *from) { to[i] = 0; } -@ String comparisons will be done with the following, not |strcmp| directly: +@ String comparisons will be done with the following, not [[strcmp]] directly: -= +<<*>>= int CStrings::eq(char *A, char *B) { return (CStrings::cmp(A, B) == 0)?TRUE:FALSE; } @@ -72,20 +73,20 @@ int CStrings::ne(char *A, char *B) { @ On the rare occasions when we need to sort alphabetically we'll also call: -= +<<*>>= int CStrings::cmp(char *A, char *B) { - if ((A == NULL) || (A[0] == 0)) { - if ((B == NULL) || (B[0] == 0)) return 0; + if ((A == NULL) [[]] (A[0] == 0)) { + if ((B == NULL) [[]] (B[0] == 0)) return 0; return -1; } - if ((B == NULL) || (B[0] == 0)) return 1; + if ((B == NULL) [[]] (B[0] == 0)) return 1; return strcmp(A, B); } @ And the following is needed to deal with extension filenames on platforms whose locale is encoded as UTF-8. -= +<<*>>= void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) { int i, j; for (i=0, j=0; p[i]; i++) { @@ -100,33 +101,33 @@ void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) { dest[j] = 0; } -@ I dislike to use |strncpy| because, and for some reason this surprises +@ I dislike to use [[strncpy]] because, and for some reason this surprises me every time, it truncates but fails to write a null termination character if the string to be copied is larger than the buffer to write to: the result is therefore not a well-formed string and we have to fix matters by hand. This I think makes for opaque code. So: -= +<<*>>= void CStrings::truncated_strcpy(char *to, char *from, int max) { int i; for (i=0; ((from[i]) && (i>= typedef struct string_storage_area { char *storage_at; int capacity; CLASS_DEFINITION } string_storage_area; -@ = +<<*>>= char *CStrings::park_string(char *from) { string_storage_area *ssa = CREATE(string_storage_area); ssa->capacity = (int) CStrings::strlen_unbounded(from) + 1; @@ -137,7 +138,7 @@ char *CStrings::park_string(char *from) { @ And here we free any SSAs needed in the course of the run. -= +<<*>>= void CStrings::free_ssas(void) { string_storage_area *ssa; LOOP_OVER(ssa, string_storage_area) diff --git a/foundation-module/Chapter_4/Characters.w b/foundation-module/Chapter_4/Characters.nw similarity index 91% rename from foundation-module/Chapter_4/Characters.w rename to foundation-module/Chapter_4/Characters.nw index de8c65e..32a7677 100644 --- a/foundation-module/Chapter_4/Characters.w +++ b/foundation-module/Chapter_4/Characters.nw @@ -2,9 +2,9 @@ Individual characters. -@h Character classes. 
+@ \section{Character classes.} -= +<<*>>= wchar_t Characters::tolower(wchar_t c) { return (wchar_t) tolower((int) c); } @@ -31,19 +31,19 @@ int Characters::iscntrl(wchar_t c) { return ((i >= 0) && (i < 32)); } int Characters::vowel(wchar_t c) { - if ((c == 'a') || (c == 'e') || (c == 'i') || (c == 'o') || (c == 'u')) return TRUE; + if ((c == 'a') [[| (c == 'e') || (c == 'i') || (c == 'o') |]] (c == 'u')) return TRUE; return FALSE; } @ White space classes: -= +<<*>>= int Characters::is_space_or_tab(int c) { - if ((c == ' ') || (c == '\t')) return TRUE; + if ((c == ' ') [[]] (c == '\t')) return TRUE; return FALSE; } int Characters::is_whitespace(int c) { - if ((c == ' ') || (c == '\t') || (c == '\n')) return TRUE; + if ((c == ' ') [[| (c == '\t') |]] (c == '\n')) return TRUE; return FALSE; } @@ -51,18 +51,18 @@ int Characters::is_whitespace(int c) { sense of the Treaty of Babel rules on leading and trailing spaces in iFiction records. -= +<<*>>= int Characters::is_babel_whitespace(int c) { - if ((c == ' ') || (c == '\t') || (c == '\x0a') - || (c == '\x0d') || (c == NEWLINE_IN_STRING)) return TRUE; + if ((c == ' ') [[| (c == '\t') |]] (c == '\x0a') + [[| (c == '\x0d') |]] (c == NEWLINE_IN_STRING)) return TRUE; return FALSE; } -@h Unicode composition. +@ \section{Unicode composition.} A routine which converts the Unicode combining accents with letters, sufficient correctly to handle all characters in the ZSCII set. -= +<<*>>= int Characters::combine_accent(int accent, int letter) { switch(accent) { case 0x0300: /* Unicode combining grave */ @@ -112,11 +112,11 @@ int Characters::combine_accent(int accent, int letter) { return '?'; } -@h Accent stripping. +@ \section{Accent stripping.} It's occasionally useful to simplify text used as a filename by removing the more obvious accents from it. -= +<<*>>= int Characters::make_filename_safe(int charcode) { charcode = Characters::remove_accent(charcode); if (charcode >= 128) charcode = '-'; @@ -131,7 +131,7 @@ wchar_t Characters::make_wchar_t_filename_safe(wchar_t charcode) { @ The following strips the accent, if present, from an ISO Latin-1 character: -= +<<*>>= int Characters::remove_accent(int charcode) { switch (charcode) { case 0xC0: case 0xC1: case 0xC2: case 0xC3: @@ -165,7 +165,7 @@ wchar_t Characters::remove_wchar_t_accent(wchar_t charcode) { @ This will do until we properly use Unicode character classes some day: -= +<<*>>= int Characters::isalphabetic(int letter) { return Characters::isalpha((wchar_t) Characters::remove_accent(letter)); } diff --git a/foundation-module/Chapter_4/JSON.w b/foundation-module/Chapter_4/JSON.nw similarity index 88% rename from foundation-module/Chapter_4/JSON.w rename to foundation-module/Chapter_4/JSON.nw index 189e7ea..f7d4b32 100755 --- a/foundation-module/Chapter_4/JSON.w +++ b/foundation-module/Chapter_4/JSON.nw @@ -2,7 +2,7 @@ To read, validate and write JSON data interchange material. -@h Introduction. +@ \section{Introduction.} JSON (Douglas Crockford, c. 2000) stands for "JavaScript Object Notation", but is now a //standardised data interchange format -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf// used in many contexts. It's especially suitable for passing small amounts of data @@ -13,17 +13,17 @@ or preference files. This section provides encoding and decoding facilities. 
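As a rough sketch of the intended round trip, from the caller's point of view (a hypothetical helper: error handling is abbreviated, and passing NULL for the file position is assumed to be acceptable when there is no file context):

    void example_round_trip(OUTPUT_STREAM, text_stream *source) {
        JSON_value *val = JSON::decode(source, NULL); /* NULL: no file position available */
        if (val->JSON_type == ERROR_JSONTYPE) {
            WRITE("JSON error: %S\n", val->if_error);
            return;
        }
        JSON::encode(OUT, val); /* write the same value back out as JSON text */
    }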
It is intended to comply with //ECMA-404 -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//, except that (i) it disallows repetition the same key in the same object, and (ii) -text can only be used in the Basic Multilingual Plane of Unicode points |0x0000| -to |0xffff|. +text can only be used in the Basic Multilingual Plane of Unicode points [[0x0000]] +to [[0xffff]]. There are no size maxima or limitations. Still, this code was written at typing speed, and no effort has gone into reducing memory usage or running time in the face of large (or malicious) JSON content. Error reporting is also limited in fulsomeness. -See the |foundation-test| test case |json| for many exercises of the code below; +See the [[foundation-test]] test case [[json]] for many exercises of the code below; do not change this section without checking that it continues to pass. -@h Data model. +@ \section{Data model.} JSON has a simple data model which we need to replicate in memory. Each value will be a pointer to a (permanently held in memory) //JSON_value// object. This is in effect a union, in that its type is always one of the following, @@ -31,19 +31,20 @@ and then only certain elements are meaningful depending on type. These are exactly the JSON types except that numbers are split between integer and floating-point versions (the conflation of the two is where the Javascript -origins of JSON show through), and that the type |ERROR_JSONTYPE| represents +origins of JSON show through), and that the type [[ERROR_JSONTYPE]] represents invalid data resulting from attempting to decode erroneous JSON. -@e NUMBER_JSONTYPE from 1 -@e DOUBLE_JSONTYPE -@e STRING_JSONTYPE -@e BOOLEAN_JSONTYPE -@e ARRAY_JSONTYPE -@e OBJECT_JSONTYPE -@e NULL_JSONTYPE -@e ERROR_JSONTYPE +<<*>>= +enum NUMBER_JSONTYPE from 1 +enum DOUBLE_JSONTYPE +enum STRING_JSONTYPE +enum BOOLEAN_JSONTYPE +enum ARRAY_JSONTYPE +enum OBJECT_JSONTYPE +enum NULL_JSONTYPE +enum ERROR_JSONTYPE -= +<<*>>= void JSON::write_type(OUTPUT_STREAM, int t) { switch (t) { case NUMBER_JSONTYPE: WRITE("number"); break; @@ -60,23 +61,23 @@ void JSON::write_type(OUTPUT_STREAM, int t) { @ -= +<<*>>= typedef struct JSON_value { int JSON_type; int if_integer; double if_double; struct text_stream *if_string; int if_boolean; - struct linked_list *if_list; /* of |JSON_value| */ - struct dictionary *dictionary_if_object; /* to |JSON_value| */ - struct linked_list *list_if_object; /* of |text_stream| */ + struct linked_list *if_list; /* of [[JSON_value]] */ + struct dictionary *dictionary_if_object; /* to [[JSON_value]] */ + struct linked_list *list_if_object; /* of [[text_stream]] */ struct text_stream *if_error; CLASS_DEFINITION } JSON_value; @ Now some constructor functions to create data of each JSON type: -= +<<*>>= JSON_value *JSON::new_null(void) { JSON_value *value = CREATE(JSON_value); value->JSON_type = NULL_JSONTYPE; @@ -123,7 +124,7 @@ JSON_value *JSON::new_string(text_stream *S) { @ JSON arrays -- lists, in effect -- should be created in an empty state, and then have entries added sequentially: -= +<<*>>= JSON_value *JSON::new_array(void) { JSON_value *value = JSON::new_null(); value->JSON_type = ARRAY_JSONTYPE; @@ -144,7 +145,7 @@ JSON_value *JSON::add_to_array(JSON_value *array, JSON_value *new_entry) { @ Similarly, JSON objects -- dictionaries of key-value pairs, in effect -- should be created in an empty state, and then have key-value pairs added as needed: -= +<<*>>= JSON_value *JSON::new_object(void) { JSON_value *value = 
JSON::new_null(); value->JSON_type = OBJECT_JSONTYPE; @@ -166,10 +167,10 @@ JSON_value *JSON::add_to_object(JSON_value *obj, text_stream *key, JSON_value *v return obj; } -@ The following looks up a key in an object, returning |NULL| if and only if +@ The following looks up a key in an object, returning [[NULL]] if and only if it is not present: -= +<<*>>= JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) { if (obj == NULL) internal_error("no object"); if (obj->JSON_type == ERROR_JSONTYPE) return NULL; @@ -182,7 +183,7 @@ JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) { @ One last constructor creates an invalid JSON value resulting from incorrect JSON input: -= +<<*>>= JSON_value *JSON::error(text_stream *msg) { JSON_value *value = JSON::new_null(); value->JSON_type = ERROR_JSONTYPE; @@ -193,7 +194,7 @@ JSON_value *JSON::error(text_stream *msg) { @ This is a very limited form of comparison, since it cannot test equality of arrays or objects. -= +<<*>>= int JSON::eq(JSON_value *val1, JSON_value *val2) { if ((val1 == NULL) && (val2)) return FALSE; if ((val1) && (val2 == NULL)) return FALSE; @@ -208,16 +209,16 @@ int JSON::eq(JSON_value *val1, JSON_value *val2) { return FALSE; } -@h Decoding JSON. +@ \section{Decoding JSON.} We do no actual file-handling in this section, but the following decoder can be pointed to the contents of UTF-8 text file as needed. -The decoder returns a non-|NULL| pointer in all cases. If the text contains +The decoder returns a non-[[NULL]] pointer in all cases. If the text contains any malformed JSON anywhere inside it, this pointer will be to a value of type -|ERROR_JSONTYPE|. Such a value should be thrown away as soon as the error +[[ERROR_JSONTYPE]]. Such a value should be thrown away as soon as the error message is made use of. -= +<<*>>= JSON_value *JSON::decode(text_stream *T, text_file_position *tfp) { return JSON::decode_range(T, 0, Str::len(T), tfp); } @@ -248,19 +249,19 @@ JSON_value *JSON::decode_error_q(text_stream *err, text_file_position *tfp, return value; } -@ This decodes the text in the character position range |[from, to)| as a +@ This decodes the text in the character position range [[[from, to)]] as a JSON value. -The possibilities here are |[ ... ]| for an array, |{ ... }| for an object, -|"..."| for a string, a token beginning with a digit or a minus sign for a -number (note that |+| and |.| are not allowed to open a number according to -the JSON standard), and the special cases |true|, |false| and |null|. +The possibilities here are [[[ ... ]]] for an array, [[{ ... }]] for an object, +[["..."]] for a string, a token beginning with a digit or a minus sign for a +number (note that [[+]] and [[.]] are not allowed to open a number according to +the JSON standard), and the special cases [[true]], [[false]] and [[null]]. -= +<<*>>= JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_position *tfp) { int first_nws = -1, last_nws = -1; wchar_t first_c = 0, last_c = 0; - @; + <>; switch (first_c) { case '[': if (last_c != ']') return JSON::decode_error(I"mismatched '[' ... 
']'", tfp); @@ -274,7 +275,7 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit if (last_c != '"') return JSON::decode_error(I"mismatched quotation marks", tfp); return JSON::decode_string(T, first_nws+1, last_nws, tfp); } - if ((Characters::isdigit(first_c)) || (first_c == '-')) + if ((Characters::isdigit(first_c)) [[]] (first_c == '-')) return JSON::decode_number(T, first_nws, last_nws+1, tfp); if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3)) return JSON::new_boolean(TRUE); @@ -285,7 +286,7 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit return JSON::decode_error(I"unknown JSON value", tfp); } -@ = +<>= for (int i=from; i>= JSON_value *JSON::decode_array(JSON_value *array, text_stream *T, int from, int to, text_file_position *tfp) { int content = FALSE; @@ -340,7 +341,7 @@ JSON_value *JSON::decode_array_entry(JSON_value *array, text_stream *T, int from @ And similarly for objects. -= +<<*>>= JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int to, text_file_position *tfp) { int content = FALSE; @@ -370,13 +371,13 @@ JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int t } @ Note that we allow key names to include all kinds of unconscionable garbage, -as ECMA requires. |\u0003\"\t\t\t| is a valid JSON key name; so is the empty string. +as ECMA requires. [[\u0003\"\t\t\t]] is a valid JSON key name; so is the empty string. We are however slightly stricter than ECMA in that we disallow duplicate keys in the same object. ECMA says this is a "semantic consideration that may be defined by JSON processors". We are hereby defining it. -= +<<*>>= JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from, int to, text_file_position *tfp) { while (Characters::is_whitespace(Str::get_at(T, from))) from++; @@ -399,7 +400,7 @@ JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from, } if (ended == FALSE) return JSON::decode_error_q(I"key does not end with quotation mark", tfp, T, saved_from, saved_to); while (Characters::is_whitespace(Str::get_at(T, from))) from++; - if ((from >= to) || (Str::get_at(T, from) != ':')) + if ((from >= to) [[]] (Str::get_at(T, from) != ':')) return JSON::decode_error_q(I"key is not followed by ':'", tfp, T, saved_from, saved_to); from++; if (JSON::look_up_object(obj, key)) return JSON::decode_error_q(I"duplicate key", tfp, T, saved_from, saved_to); @@ -412,9 +413,9 @@ JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from, @ Numbers are annoying to decode since they can be given either in a restricted floating-point syntax, or in decimal. ECMA is slippery on the question of exactly what floating-point numbers can be represented, but it's common to consider -them as being |double|, so we'll follow suit. +them as being [[double]], so we'll follow suit. 
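In plain C terms, the split being described amounts to something like the following stand-alone sketch (a hypothetical helper: the real decoder below scans and validates the token by hand rather than leaning on the C library, and JSON itself forbids a leading plus sign):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Classify a numeric token: 1 = integer, 2 = double, 0 = malformed. */
    int classify_number(const char *tok, long *i_out, double *d_out) {
        char *end;
        if (strpbrk(tok, ".eE") == NULL) { /* no fraction or exponent: integer */
            *i_out = strtol(tok, &end, 10);
            return (*end == 0) ? 1 : 0;
        }
        *d_out = strtod(tok, &end); /* otherwise read it as a double */
        return (*end == 0) ? 2 : 0;
    }

    int main(void) {
        long i; double d;
        printf("%d\n", classify_number("-17", &i, &d));   /* 1 */
        printf("%d\n", classify_number("2.5e3", &i, &d)); /* 2 */
        printf("%d\n", classify_number("12..0", &i, &d)); /* 0 */
        return 0;
    }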
-= +<<*>>= JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_position *tfp) { while (Characters::is_whitespace(Str::get_at(T, from))) from++; while ((to > from) && (Characters::is_whitespace(Str::get_at(T, to-1)))) to--; @@ -426,8 +427,8 @@ JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_posi for (int i=at; i>= JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_position *tfp) { TEMPORARY_TEXT(string) for (int i=from; i; break; + case 'u': <>; break; default: return JSON::decode_error(I"bad '\\' escape in string", tfp); } PUT_TO(string, c); @@ -484,9 +485,9 @@ JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_posi @ We don't quite fully implement ECMA here: the following is fine for code points in the Basic Multilingual Plane, but we don't handle the curious UTF-16 surrogate pair -rule for code points between |0x10000| and |0x10fff|. +rule for code points between [[0x10000]] and [[0x10fff]]. -@ = +<>= if (i+4 >= to) return JSON::decode_error(I"incomplete '\\u' escape", tfp); int hex = 0; for (int j=0; j<4; j++) { @@ -501,9 +502,9 @@ rule for code points between |0x10000| and |0x10fff|. c = (wchar_t) hex; i += 4; -@h Encoding JSON. +@ \section{Encoding JSON.} -= +<<*>>= void JSON::encode(OUTPUT_STREAM, JSON_value *J) { if (J == NULL) internal_error("no JSON value supplied"); switch (J->JSON_type) { @@ -563,7 +564,7 @@ void JSON::encode(OUTPUT_STREAM, JSON_value *J) { @ Note that we elect not to escape the slash character, or any Unicode code points above 32. -= +<<*>>= void JSON::encode_string(OUTPUT_STREAM, text_stream *T) { LOOP_THROUGH_TEXT(pos, T) { wchar_t c = Str::get(pos); @@ -582,7 +583,7 @@ void JSON::encode_string(OUTPUT_STREAM, text_stream *T) { } } -@h Requirements. +@ \section{Requirements.} Of course, the trouble with JSON is that it's a soup of undifferentiated data. Just because you're expecting a pair of numbers, there's no reason to suppose that's what you've been given. @@ -591,9 +592,9 @@ A //JSON_requirement// is a sort of JSON schema: a specification for the structu of a //JSON_value//. At the top level, it's a list of one or more equally good alternative specifications. Note that the empty list is not allowed. -= +<<*>>= typedef struct JSON_requirement { - struct linked_list *alternatives; /* of |JSON_single_requirement| */ + struct linked_list *alternatives; /* of [[JSON_single_requirement]] */ CLASS_DEFINITION } JSON_requirement; @@ -615,7 +616,7 @@ JSON_requirement *JSON::add_alternative(JSON_requirement *so_far, if Javascript actually had types. It can communicate something like "a number" or "a list of strings"; but it can also say "the value has to be exactly this". 
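For example, a requirement meaning "an array of any number of doubles" can be assembled from the constructors defined below (a sketch using only functions from this section; textually it corresponds to the form [ double* ] described later):

    JSON_requirement *doubles_req =
        JSON::single_choice(
            JSON::require_array_of(
                JSON::single_choice(JSON::require_type(DOUBLE_JSONTYPE))));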
-= +<<*>>= typedef struct JSON_single_requirement { struct JSON_requirement *this_requirement; struct JSON_value *this_value; @@ -623,10 +624,10 @@ typedef struct JSON_single_requirement { CLASS_DEFINITION } JSON_single_requirement; -@ Exactly one of |this_requirement|, |this_value| and |this_type| should be -non-|NULL|, so we have one constructor function for each case: +@ Exactly one of [[this_requirement]], [[this_value]] and [[this_type]] should be +non-[[NULL]], so we have one constructor function for each case: -= +<<*>>= JSON_single_requirement *JSON::require_requirement(JSON_requirement *req) { JSON_single_requirement *sing = CREATE(JSON_single_requirement); sing->this_requirement = req; @@ -653,15 +654,15 @@ JSON_single_requirement *JSON::require_type(int t) { @ JSON types, in our model, look very like //JSON_value//s. -= +<<*>>= typedef struct JSON_type { int JSON_type; - struct linked_list *if_list; /* of |JSON_requirement| */ + struct linked_list *if_list; /* of [[JSON_requirement]] */ struct JSON_requirement *all_if_list; - struct dictionary *dictionary_if_object; /* to |JSON_pair_requirement| */ - struct linked_list *list_if_object; /* of |text_stream| */ + struct dictionary *dictionary_if_object; /* to [[JSON_pair_requirement]] */ + struct linked_list *list_if_object; /* of [[text_stream]] */ struct text_stream *if_error; CLASS_DEFINITION @@ -698,9 +699,9 @@ JSON_type *JSON::new_type_requirement(int t) { } @ A convenience for "the value must be an array of any number of entries, each -of which meets the requirement |E_req|": +of which meets the requirement [[E_req]]": -= +<<*>>= JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) { JSON_single_requirement *req = JSON::require_type(ARRAY_JSONTYPE); req->this_type->all_if_list = E_req; @@ -708,22 +709,22 @@ JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) { } @ If an array wants to be a tuple with a fixed number of entries, each with -its own requirement, then instead call |JSON::require_type(ARRAY_JSONTYPE)| and +its own requirement, then instead call [[JSON::require_type(ARRAY_JSONTYPE)]] and then make a number of calls to the following in sequence: -= +<<*>>= void JSON::require_entry(JSON_single_requirement *array_sr, JSON_requirement *entry_sr) { if (array_sr == NULL) internal_error("no array"); - if ((array_sr->this_type == NULL) || + if ((array_sr->this_type == NULL) [[]] (array_sr->this_type->JSON_type != ARRAY_JSONTYPE)) internal_error("not an array"); if (entry_sr == NULL) internal_error("no new entry"); ADD_TO_LINKED_LIST(entry_sr, JSON_requirement, array_sr->this_type->if_list); } -@ Similarly, create an object requirement with |JSON::require_type(OBJECT_JSONTYPE)| +@ Similarly, create an object requirement with [[JSON::require_type(OBJECT_JSONTYPE)]] and then either require or allow key-value pairs with: -= +<<*>>= void JSON::require_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_requirement *req) { JSON::require_pair_inner(obj_sr, key, req, FALSE); } @@ -735,7 +736,7 @@ void JSON::allow_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_re void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key, JSON_requirement *req, int opt) { if (obj_sr == NULL) internal_error("no object"); - if ((obj_sr->this_type == NULL) || + if ((obj_sr->this_type == NULL) [[]] (obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object"); if (req == NULL) internal_error("no val req"); key = Str::duplicate(key); @@ -747,13 
+748,13 @@ void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key, if (de) de->value = pr; } -@ This then extracts the requirement on a given key, or returns |NULL| is if +@ This then extracts the requirement on a given key, or returns [[NULL]] is if is not permitted: -= +<<*>>= JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_stream *key) { if (obj_sr == NULL) internal_error("no object"); - if ((obj_sr->this_type == NULL) || + if ((obj_sr->this_type == NULL) [[]] (obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object"); dict_entry *de = Dictionaries::find(obj_sr->this_type->dictionary_if_object, key); if (de == NULL) return NULL; @@ -763,25 +764,25 @@ JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_ @ This is used when parsing textual requirements, to indicate a syntax error; but it is not valid as a requirement itself. -= +<<*>>= JSON_single_requirement *JSON::error_sr(text_stream *msg) { JSON_single_requirement *req = JSON::require_type(ERROR_JSONTYPE); req->this_type->if_error = Str::duplicate(msg); return req; } -@h Validation. +@ \section{Validation.} To "validate" a JSON value is to determine that it meets some //JSON_requirement//. -The following returns |TRUE| if the value meets the requirement in full; -if not, |FALSE|, and then if |errs| is not null, a list of error messages is -appended to the linked list |errs|. +The following returns [[TRUE]] if the value meets the requirement in full; +if not, [[FALSE]], and then if [[errs]] is not null, a list of error messages is +appended to the linked list [[errs]]. The stack here is used to give better error messages by locating where the -problem was: e.g. |"object.coordinates[1]"| is the result of the stack -holding |"object" > ".cooordinates" > "[1]"|. +problem was: e.g. [["object.coordinates[1]"]] is the result of the stack +holding [["object" > ".cooordinates" > "[1]"]]. -= +<<*>>= int JSON::validate(JSON_value *val, JSON_requirement *req, linked_list *errs) { lifo_stack *location = NEW_LIFO_STACK(text_stream); if ((val) && (val->JSON_type == ARRAY_JSONTYPE)) { @@ -815,7 +816,7 @@ value must match one of the single requirements in the list. (We can stop as soon as it has met one.) If it meets none of them, we produce error messages for the reason it fails just the first. -= +<<*>>= int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs, lifo_stack *location) { if (val == NULL) internal_error("no value"); @@ -834,7 +835,7 @@ int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs, @ Bad data always fails, and otherwise we split into the three cases. 
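Stepping back, typical use of the validator looks like this (a sketch, reusing the doubles_req built in the earlier aside; passing NULL suppresses the error list, as noted above):

    JSON_value *val = JSON::decode(I"[ 1.5, 2.5 ]", NULL);
    int ok = JSON::validate(val, doubles_req, NULL); /* TRUE: an array of doubles */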
-= +<<*>>= int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, linked_list *errs, lifo_stack *location) { if (val->JSON_type == ERROR_JSONTYPE) { @@ -842,16 +843,16 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, I"erroneous JSON value from parsing bad text", location); return FALSE; } - if (req->this_requirement) @; - if (req->this_value) @; - if (req->this_type) @; + if (req->this_requirement) <>; + if (req->this_value) <>; + if (req->this_type) <>; internal_error("bad single requirement"); } -@ = +<>= return JSON::validate_r(val, req->this_requirement, errs, location); -@ = +<>= if (JSON::eq(val, req->this_value) == FALSE) { TEMPORARY_TEXT(msg) WRITE_TO(msg, "value "); @@ -863,16 +864,16 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, } return TRUE; -@ = - @; +<>= + <>; int outcome = TRUE; if (val->JSON_type == ARRAY_JSONTYPE) - @; + <>; if (val->JSON_type == OBJECT_JSONTYPE) - @; + <>; return outcome; -@ = +<>= if (val->JSON_type != req->this_type->JSON_type) { if (errs) { TEMPORARY_TEXT(msg) @@ -886,7 +887,7 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, return FALSE; } -@ = +<>= int count = 0; JSON_value *E; LOOP_OVER_LINKED_LIST(E, JSON_value, val->if_list) { @@ -912,19 +913,19 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, count++; } -@ = +<>= text_stream *key; LOOP_OVER_LINKED_LIST(key, text_stream, val->list_if_object) - @; + <>; LOOP_OVER_LINKED_LIST(key, text_stream, req->this_type->list_if_object) { JSON_pair_requirement *pr = Dictionaries::read_value(req->this_type->dictionary_if_object, key); if (pr == NULL) internal_error("broken JSON object requirement"); if (pr->optional == FALSE) - @; + <>; } -@ = +<>= JSON_value *E = Dictionaries::read_value(val->dictionary_if_object, key); if (E == NULL) internal_error("broken JSON object dictionary"); JSON_pair_requirement *pr = JSON::look_up_pair(req, key); @@ -943,7 +944,7 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, POP_LIFO_STACK(text_stream, location); DISCARD_TEXT(at) -@ = +<>= JSON_value *E = JSON::look_up_object(val, key); if (E == NULL) { TEMPORARY_TEXT(msg) @@ -953,21 +954,20 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, outcome = FALSE; } -@h Decoding JSON requirements. +@ \section{Decoding JSON requirements.} It's convenient to be able to read and write these requirements to textual form, exactly as we do with JSON itself, and here goes. This is an example of the syntax we parse. It's JSON except that -(a) the type names |number|, |double|, |string|, |boolean| and |null| are +(a) the type names [[number]], [[double]], [[string]], [[boolean]] and [[null]] are used in place of their respective values; -(b) a question mark |?| before the name of a key means that it is optional; -(c) if an array has one entry followed by an asterisk |*|, it means +(b) a question mark [[?]] before the name of a key means that it is optional; +(c) if an array has one entry followed by an asterisk [[*]], it means "any number of entries, each of which must match this"; -(d) || refers to a requirement recorded in the |known_names| dictionary. +(d) [[]] refers to a requirement recorded in the [[known_names]] dictionary. 
For example: -= (text) { "coordinates": [ double, double, string ], ?"jurisdiction": string, @@ -976,28 +976,27 @@ For example: "entry": string }* ] } -= This function is essentially the same as //JSON::decode//, but returning a requirement rather than a value. -Note that |known_names| can be |NULL| to have it not recognise any such names; +Note that [[known_names]] can be [[NULL]] to have it not recognise any such names; there's no need to create an empty dictionary if this feature is unwanted. -= +<<*>>= JSON_requirement *JSON::decode_req(text_stream *T, dictionary *known_names) { return JSON::decode_req_range(T, 0, Str::len(T), known_names); } -@ This decodes the text in the character position range |[from, to)| as a +@ This decodes the text in the character position range [[[from, to)]] as a JSON requirement. -= +<<*>>= JSON_requirement *JSON::decode_req_range(text_stream *T, int from, int to, dictionary *known_names) { int first_nws = -1, last_nws = -1; wchar_t first_c = 0, last_c = 0; - @; + <>; if (first_c == '(') { if (last_c != ')') return JSON::single_choice(JSON::error_sr(I"mismatched '(' ... ')'")); @@ -1032,17 +1031,17 @@ JSON_requirement *JSON::decode_req_alternative(JSON_requirement *req, text_strea return JSON::add_alternative(req, sing); } -@ Note that the keyword |null| is ambiguous in the grammar for JSON requirements: -does it mean "the value |null|", or does it mean "any value of the type |null|"? -This makes no difference because the type |null| admits only the value |null|, but +@ Note that the keyword [[null]] is ambiguous in the grammar for JSON requirements: +does it mean "the value [[null]]", or does it mean "any value of the type [[null]]"? +This makes no difference because the type [[null]] admits only the value [[null]], but for what it's worth, we opt for the value. -= +<<*>>= JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int to, dictionary *known_names) { int first_nws = -1, last_nws = -1; wchar_t first_c = 0, last_c = 0; - @; + <>; if (first_nws < 0) return JSON::error_sr(I"whitespace where requirement expected"); switch (first_c) { case '[': @@ -1073,7 +1072,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t int require_value = FALSE; - if ((first_c == '"') || (first_c == '-') || (Characters::isdigit(first_c))) + if ((first_c == '"') [[| (first_c == '-') |]] (Characters::isdigit(first_c))) require_value = TRUE; if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3)) require_value = TRUE; @@ -1110,7 +1109,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t return JSON::error_sr(msg); } -@ = +<>= for (int i=from; i>= JSON_single_requirement *JSON::decode_req_array(JSON_single_requirement *array_sr, text_stream *T, int from, int to, dictionary *known_names) { int content = FALSE; @@ -1167,7 +1166,7 @@ JSON_single_requirement *JSON::decode_req_array_entry(JSON_single_requirement *a @ And similarly for objects. 
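(For orientation: from the caller's side, parsing a textual requirement is a single call, with NULL meaning that no named requirements are recognised. A sketch:

    JSON_requirement *pair_req = JSON::decode_req(I"[ double, double ]", NULL);

This accepts exactly a two-entry array of doubles.)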
-= +<<*>>= JSON_single_requirement *JSON::decode_req_object(JSON_single_requirement *obj, text_stream *T, int from, int to, dictionary *known_names) { int content = FALSE; @@ -1218,7 +1217,7 @@ JSON_single_requirement *JSON::decode_req_object_entry(JSON_single_requirement * } if (ended == FALSE) return JSON::error_sr(I"key does not end with quotation mark"); while (Characters::is_whitespace(Str::get_at(T, from))) from++; - if ((from >= to) || (Str::get_at(T, from) != ':')) + if ((from >= to) [[]] (Str::get_at(T, from) != ':')) return JSON::error_sr(I"key is not followed by ':'"); from++; if (JSON::look_up_pair(obj, key)) return JSON::error_sr(I"duplicate key"); @@ -1229,21 +1228,21 @@ JSON_single_requirement *JSON::decode_req_object_entry(JSON_single_requirement * return obj; } -@h Encoding JSON requirements. +@ \section{Encoding JSON requirements.} This is now simple, with one caveat. It's possible to set up requirement trees so that they are not well-founded. For example: -= (text as InC) + JSON_single_requirement *set = JSON::require_type(ARRAY_JSONTYPE); set->all_if_list = JSON::single_choice(set); -= -This is not useless: it matches, say, |[]|, |[ [] ]| and |[ [], [ [] ] ]| + +This is not useless: it matches, say, [[[]]], [[[ [] ]]] and [[[ [], [ [] ] ]]] and other constructions giving amusement to set theorists. But it would cause the following to hang. Note that requirements read in from files (see below) are always well-founded, and so do not have this issue. -= +<<*>>= void JSON::encode_req(OUTPUT_STREAM, JSON_requirement *req) { JSON::encode_req_r(OUT, req); } @@ -1311,32 +1310,32 @@ void JSON::encode_type(OUTPUT_STREAM, JSON_type *type) { } } -@h Reading requirements files. +@ \section{Reading requirements files.} This convenient function reads in a set of requirements from a text file. Each -requirement should begin | ::=|, and then continues until the next such +requirement should begin [[ ::=]], and then continues until the next such header, or the end of the file. So for example: -= (text) + ! My scheme for JSON files describing geographical locations - ::= ( "alpha" | "beta" | null ) + ::= ( "alpha" [[ "beta" ]] null ) ::= { "category": , "latitude": double, "longitude": double, } -= + is a valid file declaring two requirements. Forward references are not allowed -- e.g., can refer to but not vice versa -- and therefore the requirements read in will always be well-founded. Comments are -lines beginning with |!|; other than comments, only white space is permitted +lines beginning with [[!]]; other than comments, only white space is permitted before the first requirement begins. Note that the function //JSON::read_requirements_file// returns a dictionary of the requirements it has read, by name (but without their angle-brackets): -here, it would have two keys, |optional-letter| and |position|. +here, it would have two keys, [[optional-letter]] and [[position]]. 
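Putting the halves together, a caller might proceed along these lines (a sketch only: the argument order of JSON::read_requirements_file is assumed here, since its definition lies outside this extract, and F and json_text are hypothetical):

    dictionary *reqs = JSON::read_requirements_file(NULL, F); /* F: filename of the requirements file */
    JSON_requirement *position_req = Dictionaries::read_value(reqs, I"position");
    JSON_value *val = JSON::decode(json_text, NULL); /* json_text: a text_stream holding JSON */
    if (JSON::validate(val, position_req, NULL) == FALSE) {
        /* reject the data */
    }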
-= +<<*>>= typedef struct JSON_rrf_state { struct text_stream *name; struct text_stream *defn; @@ -1379,7 +1378,7 @@ void JSON::read_requirements_file_helper(text_stream *text, text_file_position * @ This is called when the end of a definition is reached, either because another is about to start, or because the end of the file has come: -= +<<*>>= void JSON::process_req_defn(JSON_rrf_state *state) { if (Str::len(state->name) > 0) { JSON_requirement *req = diff --git a/foundation-module/Chapter_4/Pattern Matching.w b/foundation-module/Chapter_4/Pattern_Matching.nw similarity index 75% rename from foundation-module/Chapter_4/Pattern Matching.w rename to foundation-module/Chapter_4/Pattern_Matching.nw index cd97b66..5be2540 100755 --- a/foundation-module/Chapter_4/Pattern Matching.w +++ b/foundation-module/Chapter_4/Pattern_Matching.nw @@ -2,36 +2,36 @@ To provide a limited regular-expression parser. -@h Character types. +@ \section{Character types.} We will define white space as spaces and tabs only, since the various kinds of line terminator will always be stripped out before this is applied. -= +<<*>>= int Regexp::white_space(int c) { - if ((c == ' ') || (c == '\t')) return TRUE; + if ((c == ' ') [[]] (c == '\t')) return TRUE; return FALSE; } -@ The presence of |:| here is perhaps a bit surprising, since it's illegal in +@ The presence of [[:]] here is perhaps a bit surprising, since it's illegal in C and has other meanings in other languages, but it's legal in C-for-Inform identifiers. -= +<<*>>= int Regexp::identifier_char(int c) { - if ((c == '_') || (c == ':') || - ((c >= 'A') && (c <= 'Z')) || - ((c >= 'a') && (c <= 'z')) || + if ((c == '_') [[| (c == ':') |]] + ((c >= 'A') && (c <= 'Z')) [[]] + ((c >= 'a') && (c <= 'z')) [[]] ((c >= '0') && (c <= '9'))) return TRUE; return FALSE; } -@h Simple parsing. +@ \section{Simple parsing.} The following finds the earliest minimal-length substring of a string, -delimited by two pairs of characters: for example, |<<| and |>>|. This could -easily be done as a regular expression using |Regexp::match|, but the routine +delimited by two pairs of characters: for example, [[<<]] and [[>>]]. This could +easily be done as a regular expression using [[Regexp::match]], but the routine here is much quicker. -= +<<*>>= int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2, wchar_t off1, wchar_t off2, int *len) { for (int i = 0; i < Str::len(text); i++) @@ -47,7 +47,7 @@ int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2, @ Still more simply: -= +<<*>>= int Regexp::find_open_brace(text_stream *text) { for (int i=0; i < Str::len(text); i++) if (Str::get_at(text, i) == '{') @@ -56,9 +56,9 @@ int Regexp::find_open_brace(text_stream *text) { } @ Note that we count the empty string as being white space. Again, this is -equivalent to |Regexp::match(p, " *")|, but much faster. +equivalent to [[Regexp::match(p, " *")]], but much faster. -= +<<*>>= int Regexp::string_is_white_space(text_stream *text) { LOOP_THROUGH_TEXT(P, text) if (Regexp::white_space(Str::get(P)) == FALSE) @@ -66,46 +66,47 @@ int Regexp::string_is_white_space(text_stream *text) { return TRUE; } -@h A Worse PCRE. -I originally wanted to call the function in this section |a_better_sscanf|, then -thought perhaps |a_worse_PCRE| would be more true. (PCRE is Philip Hazel's superb +@ \section{A Worse PCRE.} +I originally wanted to call the function in this section [[a_better_sscanf]], then +thought perhaps [[a_worse_PCRE]] would be more true. 
(PCRE is Philip Hazel's superb C implementation of regular-expression parsing, but I didn't need its full strength, and I didn't want to complicate the build process by linking to it.) This is a very minimal regular expression parser, simply for convenience of parsing short texts against particularly simple patterns. Here is an example of use: -= (text as code) + match_results mr = Regexp::create_mr(); if (Regexp::match(&mr, text, L"fish (%d+) ([a-zA-Z_][a-zA-Z0-9_]*) *") { PRINT("Fish number: %S\n", mr.exp[0]); PRINT("Fish name: %S\n", mr.exp[1]); } Regexp::dispose_of(&mr); -= -Note the |L| at the front of the regex itself: this is a wide string. -This tries to match the given |text| to see if it consists of the word fish, +Note the [[L]] at the front of the regex itself: this is a wide string. + +This tries to match the given [[text]] to see if it consists of the word fish, then any amount of whitespace, then a string of digits which are copied into -|mr->exp[0]|, then whitespace again, and then an alphanumeric identifier to be -copied into |mr->exp[1]|, and finally optional whitespace. (If no match is +[[mr->exp[0]]], then whitespace again, and then an alphanumeric identifier to be +copied into [[mr->exp[1]]], and finally optional whitespace. (If no match is made, the contents of the found strings are undefined.) Note that this differs from, for example, Perl's regular expression matcher in several ways. The regular expression syntax is slightly different and in general simpler. A match has to be made from start to end, so it's as if there -were an implicit |^| at the front and |$| at the back (in Perl terms). The +were an implicit [[^]] at the front and [[$]] at the back (in Perl terms). The full match text is therefore always the entire text put in, so there's no -need to record this. In Perl, matching against |m/(.*) plus (.*)/| would +need to record this. In Perl, matching against [[m/(.*) plus (.*)/]] would set three subexpressions: number 0 would be the whole text matched, number 1 would be the first bracketed part, number 2 the second. Here, though, the -corresponding regex would be written |L"(%c*) plus (%c*)"|, and the bracketed +corresponding regex would be written [[L"(%c*) plus (%c*)"]], and the bracketed terms would be subexpressions 0 and 1. -@d MAX_BRACKETED_SUBEXPRESSIONS 5 /* this many bracketed subexpressions can be extracted */ +<<*>>= +#define MAX_BRACKETED_SUBEXPRESSIONS 5 /* this many bracketed subexpressions can be extracted */ @ The internal state of the matcher is stored as follows: -= +<<*>>= typedef struct match_position { int tpos; /* position within text being matched */ int ppos; /* position within pattern */ @@ -120,9 +121,10 @@ typedef struct match_position { @ It may appear that match texts are limited to 64 characters here, but they are not. They are simply a little faster to access if short. -@d MATCH_TEXT_INITIAL_ALLOCATION 64 +<<*>>= +#define MATCH_TEXT_INITIAL_ALLOCATION 64 -= +<<*>>= typedef struct match_result { wchar_t match_text_storage[MATCH_TEXT_INITIAL_ALLOCATION]; struct text_stream match_text_struct; @@ -140,7 +142,7 @@ explicitly. Note that the storage required is on the C stack (unless some result strings grow very large), so that it's very quick to allocate and deallocate. -= +<<*>>= match_results Regexp::create_mr(void) { match_results mr; mr.no_matched_texts = 0; @@ -164,7 +166,7 @@ void Regexp::dispose_of(match_results *mr) { @ So, then: the matcher itself. 
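One point worth a two-line illustration first: because a match is implicitly anchored at both ends, the pattern must account for the whole text (a sketch; passing NULL for the match_results is allowed when the subexpressions are not wanted):

    int a = Regexp::match(NULL, I"goose eggs", L"goose");     /* FALSE: " eggs" is left over */
    int b = Regexp::match(NULL, I"goose eggs", L"goose %c*"); /* TRUE */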
-= +<<*>>= int Regexp::match(match_results *mr, text_stream *text, wchar_t *pattern) { if (mr) Regexp::prepare(mr); int rv = (Regexp::match_r(mr, text, pattern, NULL, FALSE) >= 0)?TRUE:FALSE; @@ -203,41 +205,41 @@ void Regexp::prepare(match_results *mr) { } } -@ = +<<*>>= int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern, match_position *scan_from, int allow_partial) { match_position at; if (scan_from) at = *scan_from; else { at.tpos = 0; at.ppos = 0; at.bc = 0; at.bl = 0; } - while ((Str::get_at(text, at.tpos)) || (pattern[at.ppos])) { + while ((Str::get_at(text, at.tpos)) [[]] (pattern[at.ppos])) { if ((allow_partial) && (pattern[at.ppos] == 0)) break; - @; + <>; - int chcl, /* what class of characters to match: a |*_CHARCLASS| value */ - range_from, range_to, /* for |LITERAL_CHARCLASS| only */ + int chcl, /* what class of characters to match: a [[*_CHARCLASS]] value */ + range_from, range_to, /* for [[LITERAL_CHARCLASS]] only */ reverse = FALSE; /* require a non-match rather than a match */ - @; + <>; int rep_from = 1, rep_to = 1; /* minimum and maximum number of repetitions */ int greedy = TRUE; /* go for a maximal-length match if possible */ - @; + <>; int reps = 0; - @; + <>; if (reps < rep_from) return -1; - /* we can now accept anything from |rep_from| to |reps| repetitions */ + /* we can now accept anything from [[rep_from| to |reps]] repetitions */ if (rep_from == reps) { at.tpos += reps; continue; } - @; + <>; /* no match length worked, so no match */ return -1; } - @; + <>; return at.tpos; } -@ = +<>= if (pattern[at.ppos] == '(') { if (at.bl < MAX_BRACKETED_SUBEXPRESSIONS) at.bracket_nesting[at.bl] = -1; if (at.bc < MAX_BRACKETED_SUBEXPRESSIONS) { @@ -255,7 +257,7 @@ int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern, continue; } -@ = +<>= if (pattern[at.ppos] == 0) return -1; int len = 0; chcl = Regexp::get_cclass(pattern, at.ppos, &len, &range_from, &range_to, &reverse); @@ -265,7 +267,7 @@ int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern, @ This is standard regular-expression notation, except that I haven't bothered to implement numeric repetition counts, which we won't need: -@ = +<>= if (chcl == WHITESPACE_CHARCLASS) { rep_from = 1; rep_to = Str::len(text)-at.tpos; } @@ -276,13 +278,13 @@ to implement numeric repetition counts, which we won't need: } if (pattern[at.ppos] == '?') { greedy = FALSE; at.ppos++; } -@ = +<>= for (reps = 0; ((Str::get_at(text, at.tpos+reps)) && (reps < rep_to)); reps++) if (Regexp::test_cclass(Str::get_at(text, at.tpos+reps), chcl, range_from, range_to, pattern, reverse) == FALSE) break; -@ = +<>= int from = rep_from, to = reps, dj = 1, from_tpos = at.tpos; if (greedy) { from = reps; to = rep_from; dj = -1; } for (int j = from; j != to+dj; j += dj) { @@ -291,7 +293,7 @@ to implement numeric repetition counts, which we won't need: if (try >= 0) return try; } -@ = +<>= if (mr) { for (int i=0; iexp[i]); @@ -303,38 +305,39 @@ to implement numeric repetition counts, which we won't need: } @ So then: most characters in the pattern are taken literally (if the pattern -says |q|, the only match is with a lower-case letter "q"), except that: +says [[q]], the only match is with a lower-case letter "q"), except that: (a) a space means "one or more characters of white space"; -(b) |%d| means any decimal digit; -(c) |%c| means any character at all; -(d) |%C| means any character which isn't white space; -(e) |%i| means any character from the identifier class (see above); -(f) |%p| means 
any character which can be used in the name of a Preform +(b) [[%d]] means any decimal digit; +(c) [[%c]] means any character at all; +(d) [[%C]] means any character which isn't white space; +(e) [[%i]] means any character from the identifier class (see above); +(f) [[%p]] means any character which can be used in the name of a Preform nonterminal, which is to say, an identifier character or a hyphen; -(g) |%P| means the same or else a colon; -(h) |%t| means a tab; -(i) |%q| means a double-quote. +(g) [[%P]] means the same or else a colon; +(h) [[%t]] means a tab; +(i) [[%q]] means a double-quote. -|%| otherwise makes a literal escape; a space means any whitespace character; +[[%]] otherwise makes a literal escape; a space means any whitespace character; square brackets enclose literal alternatives, and note as usual with grep -engines that |[]xyz]| is legal and makes a set of four possibilities, the +engines that [[[]xyz]]] is legal and makes a set of four possibilities, the first of which is a literal close square; within a set, a hyphen makes a -character range; an initial |^| negates the result; and otherwise everything +character range; an initial [[^]] negates the result; and otherwise everything is literal. -@d ANY_CHARCLASS 1 -@d DIGIT_CHARCLASS 2 -@d WHITESPACE_CHARCLASS 3 -@d NONWHITESPACE_CHARCLASS 4 -@d IDENTIFIER_CHARCLASS 5 -@d PREFORM_CHARCLASS 6 -@d PREFORMC_CHARCLASS 7 -@d LITERAL_CHARCLASS 8 -@d TAB_CHARCLASS 9 -@d QUOTE_CHARCLASS 10 +<<*>>= +#define ANY_CHARCLASS 1 +#define DIGIT_CHARCLASS 2 +#define WHITESPACE_CHARCLASS 3 +#define NONWHITESPACE_CHARCLASS 4 +#define IDENTIFIER_CHARCLASS 5 +#define PREFORM_CHARCLASS 6 +#define PREFORMC_CHARCLASS 7 +#define LITERAL_CHARCLASS 8 +#define TAB_CHARCLASS 9 +#define QUOTE_CHARCLASS 10 -= +<<*>>= int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to, int *reverse) { if (pattern[ppos] == '^') { ppos++; *reverse = TRUE; } else { *reverse = FALSE; } switch (pattern[ppos]) { @@ -364,7 +367,7 @@ int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to, *len = 1; *from = ppos; *to = ppos; return LITERAL_CHARCLASS; } -@ = +<<*>>= int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *drawn_from, int reverse) { int match = FALSE; switch (chcl) { @@ -375,11 +378,11 @@ int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t * case NONWHITESPACE_CHARCLASS: if (!(Characters::is_whitespace(c))) match = TRUE; break; case QUOTE_CHARCLASS: if (c != '\"') match = TRUE; break; case IDENTIFIER_CHARCLASS: if (Regexp::identifier_char(c)) match = TRUE; break; - case PREFORM_CHARCLASS: if ((c == '-') || (c == '_') || - ((c >= 'a') && (c <= 'z')) || + case PREFORM_CHARCLASS: if ((c == '-') [[| (c == '_') |]] + ((c >= 'a') && (c <= 'z')) [[]] ((c >= '0') && (c <= '9'))) match = TRUE; break; - case PREFORMC_CHARCLASS: if ((c == '-') || (c == '_') || (c == ':') || - ((c >= 'a') && (c <= 'z')) || + case PREFORMC_CHARCLASS: if ((c == '-') [[| (c == '_') || (c == ':') |]] + ((c >= 'a') && (c <= 'z')) [[]] ((c >= '0') && (c <= '9'))) match = TRUE; break; case LITERAL_CHARCLASS: if ((range_to > range_from) && (drawn_from[range_from] == '^')) { @@ -398,20 +401,21 @@ int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t * return match; } -@h Replacement. +@ \section{Replacement.} And this routine conveniently handles searching and replacing. 
This time we -can match at substrings of the |text| (i.e., we are not forced to match +can match at substrings of the [[text]] (i.e., we are not forced to match from the start right to the end), and multiple replacements can be made. For example, -= (text as code) + Regexp::replace(text, L"[aeiou]", L"!", REP_REPEATING); -= -will turn the |text| "goose eggs" into "g!!s! !ggs". -@d REP_REPEATING 1 -@d REP_ATSTART 2 +will turn the [[text]] "goose eggs" into "g!!s! !ggs". -= +<<*>>= +#define REP_REPEATING 1 +#define REP_ATSTART 2 + +<<*>>= int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, int options) { TEMPORARY_TEXT(altered) match_results mr = Regexp::create_mr(); @@ -439,10 +443,10 @@ int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, i changes++; Regexp::dispose_of(&mr); L = Str::len(text); i = L-left-1; - if ((options & REP_REPEATING) == 0) { @; break; } + if ((options & REP_REPEATING) == 0) { <>; break; } continue; } else PUT_TO(altered, Str::get_at(text, i)); - if (options & REP_ATSTART) { @; break; } + if (options & REP_ATSTART) { <>; break; } } Regexp::dispose_of(&mr); if (changes > 0) Str::copy(text, altered); @@ -450,6 +454,6 @@ int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, i return changes; } -@ = +<>= for (i++; i>= +#define PROTECTED_OPEN_BRACE_PPCHAR 0x25A0 +#define PROTECTED_CLOSE_BRACE_PPCHAR 0x25A1 +#define PROTECTED_BLANK_PPCHAR 0x25A2 + +<<*>>= void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *header, linked_list *special_macros, general_pointer specifics, wchar_t comment_char, int encoding) { @@ -35,7 +36,7 @@ void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *hea WRITE("%S", header); preprocessor_state PPS; - @; + <>; TextFiles::read(prototype, FALSE, "can't open prototype file", TRUE, Preprocessor::scan_line, NULL, &PPS); for (int i=0; i>= +#define MAX_PREPROCESSOR_LOOP_DEPTH 8 -= +<<*>>= typedef struct preprocessor_state { struct text_stream *dest; struct preprocessor_macro *defining; /* a "define" body being scanned */ @@ -63,19 +65,19 @@ typedef struct preprocessor_state { int last_line_was_blank; /* used to suppress runs of multiple blank lines */ struct preprocessor_variable_set *global_variables; struct preprocessor_variable_set *stack_frame; - struct linked_list *known_macros; /* of |preprocessor_macro| */ + struct linked_list *known_macros; /* of [[preprocessor_macro]] */ struct general_pointer specifics; wchar_t comment_character; } preprocessor_state; typedef struct preprocessor_loop { struct text_stream *loop_var_name; - struct linked_list *iterations; /* of |text_stream| */ + struct linked_list *iterations; /* of [[text_stream]] */ int repeat_is_block; struct text_stream *repeat_saved_dest; } preprocessor_loop; -@ = +<>= PPS.dest = Str::new(); PPS.suppress_newline = FALSE; PPS.last_line_was_blank = TRUE; @@ -91,7 +93,7 @@ typedef struct preprocessor_loop { @ Conceptually, each loop runs a variable with a given name through a series of textual values in sequence, and we store that data here: -= +<<*>>= void Preprocessor::set_loop_var_name(preprocessor_loop *loop, text_stream *name) { loop->loop_var_name = Str::duplicate(name); } @@ -102,31 +104,31 @@ void Preprocessor::add_loop_iteration(preprocessor_loop *loop, text_stream *valu @ Lines from the prototype (or sometimes from files spliced in) are read, one at a time, by the following. 
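For orientation, here is a hypothetical fragment of a prototype file using the syntax handled below (the macro name and its body are invented; only the {define: ...}, {end-define} and {name} forms are taken from the code that follows):

    {define: warning-banner}
    Generated from a prototype file: edit that file, not this one.
    {end-define}

    {warning-banner}

Expanding {warning-banner} replays the body line into the output.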
-Note that |define| and |end-define| are not themselves macros, and are handled +Note that [[define]] and [[end-define]] are not themselves macros, and are handled directly here. So you cannot use repeat loops to define multiple macros with parametrised names: but then, nor should you. -= +<<*>>= void Preprocessor::scan_line(text_stream *line, text_file_position *tfp, void *X) { preprocessor_state *PPS = (preprocessor_state *) X; - @; - @; - @; + <>; + <>; + <>; Preprocessor::expand(line, tfp, PPS); - @; + <>; } @ A line is a comment to the preprocessor if its first non-whitespace character -is the special comment character: often |#|, but not necessarily. +is the special comment character: often [[#]], but not necessarily. -@ = +<>= LOOP_THROUGH_TEXT(pos, line) { wchar_t c = Str::get(pos); if (c == PPS->comment_character) return; if (Characters::is_whitespace(c) == FALSE) break; } -@ = +<>= for (int i = 0; i < Str::len(line); i++) { wchar_t c = Str::get_at(line, i); if (c == '\\') { @@ -151,15 +153,15 @@ is the special comment character: often |#|, but not necessarily. } } -@ = +<>= match_results mr = Regexp::create_mr(); - if (Regexp::match(&mr, line, L" *{define: *(%C+) *} *")) @; - if (Regexp::match(&mr, line, L" *{define: *(%C+) (%c*)} *")) @; - if (Regexp::match(&mr, line, L" *{end-define} *")) @; - if (PPS->defining) @; + if (Regexp::match(&mr, line, L" *{define: *(%C+) *} *")) <>; + if (Regexp::match(&mr, line, L" *{define: *(%C+) (%c*)} *")) <>; + if (Regexp::match(&mr, line, L" *{end-define} *")) <>; + if (PPS->defining) <>; Regexp::dispose_of(&mr); -@ = +<>= if (PPS->defining) Errors::in_text_file("nested definitions are not allowed", tfp); text_stream *name = mr.exp[0]; @@ -169,7 +171,7 @@ is the special comment character: often |#|, but not necessarily. Regexp::dispose_of(&mr); return; -@ = +<>= if (PPS->defining) Errors::in_text_file("nested definitions are not allowed", tfp); text_stream *name = mr.exp[0]; @@ -179,19 +181,19 @@ is the special comment character: often |#|, but not necessarily. Regexp::dispose_of(&mr); return; -@ = +<>= Preprocessor::add_line_to_macro(PPS->defining, line, tfp); Regexp::dispose_of(&mr); return; -@ = +<>= if (PPS->defining == NULL) Errors::in_text_file("{end-define} without {define: ...}", tfp); PPS->defining = NULL; Regexp::dispose_of(&mr); return; -@ = +<>= if (PPS->suppress_newline == FALSE) { text_stream *OUT = PPS->dest; if (Str::len(line) == 0) { @@ -208,13 +210,13 @@ is the special comment character: often |#|, but not necessarily. (i) Does not contain any newlines; -(ii) Contains braces |{ ... }| used in nested pairs (unless there is a syntax +(ii) Contains braces [[{ ... }]] used in nested pairs (unless there is a syntax error in the prototype, in which case we must complain). The idea is to pass everything straight through except any braced matter, which needs special attention. 
-= +<<*>>= void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocessor_state *PPS) { TEMPORARY_TEXT(before_matter) TEMPORARY_TEXT(braced_matter) @@ -238,7 +240,7 @@ void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocess } if (bl > 0) Errors::in_text_file("too many '{'s", tfp); if (after_times) { - @; + <>; } else { WRITE_TO(PPS->dest, "%S", text); } @@ -247,13 +249,13 @@ void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocess DISCARD_TEXT(after_matter) } -@ Suppose we are expanding the text |this {ADJECTIVE} ocean {BEHAVIOUR}|: then -the |before_matter| will be |this |, the |braced_matter| will be |ADJECTIVE|, -and the |after_matter| will be | ocean {BEHAVIOUR}|. +@ Suppose we are expanding the text [[this {ADJECTIVE} ocean {BEHAVIOUR}]]: then +the [[before_matter]] will be [[this ]], the [[braced_matter]] will be [[ADJECTIVE]], +and the [[after_matter]] will be [[ ocean {BEHAVIOUR}]]. -@ = +<>= if (Preprocessor::acceptable_variable_name(braced_matter)) { - @; + <>; } else { text_stream *identifier = braced_matter; text_stream *parameter_settings = NULL; @@ -262,7 +264,7 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|. identifier = mr.exp[0]; parameter_settings = mr.exp[1]; } - @; + <>; preprocessor_macro *mm = Preprocessor::find_macro(PPS->known_macros, identifier); if (mm == NULL) { @@ -271,15 +273,15 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|. Errors::in_text_file_S(erm, tfp); DISCARD_TEXT(erm) } else { - @; + <>; } Regexp::dispose_of(&mr); } -@ So, for example, the identifier |repeat| would be changed here either to -|repeat-block| or |repeat-span|: see above for an explanation. +@ So, for example, the identifier [[repeat]] would be changed here either to +[[repeat-block]] or [[repeat-span]]: see above for an explanation. -@ = +<>= preprocessor_macro *loop_mm; LOOP_OVER_LINKED_LIST(loop_mm, preprocessor_macro, PPS->known_macros) if (Str::len(loop_mm->loop_name) > 0) { @@ -308,27 +310,27 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|. } @ Note that if we are inside a loop, we do not perform expansion on the variable -name, and instead pass it through unchanged -- still as, say, |{NAME}|. This +name, and instead pass it through unchanged -- still as, say, [[{NAME}]]. This is because it won't be expanded until later, when the expander reaches the end of the loop body. -@ = +<>= Preprocessor::expand(before_matter, tfp, PPS); if (PPS->repeat_sp > 0) { WRITE_TO(PPS->dest, "{%S}", braced_matter); } else { - @; + <>; } Preprocessor::expand(after_matter, tfp, PPS); @ Similarly, we don't expand macros inside the body of a loop, except that we -need to expand the |{end-repeat-block}| (or similar) which closes that loop +need to expand the [[{end-repeat-block}]] (or similar) which closes that loop body, so that we can escape back into normal mode. Because loop constructs may be nested, we need to react to (but not expand) loop openings, too. The "shadow stack pointer" shows how deep we are inside these shadowy, not-yet-acted-on, loops. -@ = +<>= if (mm->suppress_whitespace_when_expanding) { while (Characters::is_whitespace(Str::get_last_char(before_matter))) Str::delete_last_character(before_matter); @@ -348,16 +350,16 @@ not-yet-acted-on, loops. 
if ((divert_if_repeating) && (PPS->repeat_sp > 0)) { WRITE_TO(PPS->dest, "{%S}", braced_matter); } else { - @; + <>; if (mm->suppress_newline_after_expanding) PPS->suppress_newline = TRUE; } Preprocessor::expand(after_matter, tfp, PPS); -@ We can now forget about the |before_matter|, the |after_matter|, or whether +@ We can now forget about the [[before_matter]], the [[after_matter]], or whether we ought not to expand after all: that's all taken care of. A variable expands to its value: -@ = +<>= preprocessor_variable *var = Preprocessor::find_variable(braced_matter, PPS->stack_frame); if (var) { @@ -370,24 +372,24 @@ to its value: } @ This looks fussy, but really it delegates the work by calling a function -attached to the macro, the |expander|. +attached to the macro, the [[expander]]. -@ = +<>= text_stream *parameter_values[MAX_PP_MACRO_PARAMETERS]; for (int i=0; i; - @; + <>; + <>; preprocessor_loop *loop = NULL; - if (mm->begins_loop) @; + if (mm->begins_loop) <>; (*(mm->expander))(mm, PPS, parameter_values, loop, tfp); @ Note that textual values of the parameters are themselves expanded before use: they might contain variables, or even macros. Parameter names are not. -So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|. +So you can have [[in: {WHATEVER}]] but not [[{WHATEVER}: this]]. -@ = +<>= match_results mr = Regexp::create_mr(); while (Regexp::match(&mr, parameter_settings, L" *(%C+): *(%c*)")) { text_stream *setting = mr.exp[0]; @@ -422,7 +424,7 @@ So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|. if (Str::is_whitespace(parameter_settings) == FALSE) Errors::in_text_file("parameter list is malformed", tfp); -@ = +<>= for (int i=0; ino_parameters; i++) if (parameter_values[i] == NULL) if (mm->parameters[i]->optional == FALSE) { @@ -432,11 +434,11 @@ So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|. DISCARD_TEXT(erm) } -@ The following code is a little misleading. At present, |PPS->repeat_sp| is +@ The following code is a little misleading. At present, [[PPS->repeat_sp]] is always either 0 or 1, no matter how deep loop nesting is: but that's just an artefact of the current scanning algorithm, which might some day change. -@ = +<>= if (PPS->repeat_sp >= MAX_PREPROCESSOR_LOOP_DEPTH) { Errors::in_text_file("repetition too deep", tfp); } else { @@ -450,10 +452,10 @@ artefact of the current scanning algorithm, which might some day change. PPS->dest = Str::new(); } -@h Variables. +@ \section{Variables.} Names of variables should conform to: -= +<<*>>= int Preprocessor::acceptable_variable_name(text_stream *name) { LOOP_THROUGH_TEXT(pos, name) { wchar_t c = Str::get(pos); @@ -467,7 +469,7 @@ int Preprocessor::acceptable_variable_name(text_stream *name) { @ Variables are all textual: -= +<<*>>= typedef struct preprocessor_variable { struct text_stream *name; struct text_stream *value; @@ -483,13 +485,13 @@ void Preprocessor::write_variable(preprocessor_variable *var, text_stream *val) var->value = Str::duplicate(val); } -@ Each variable belongs to a single "set". If |EXAMPLE| has one meaning outside a +@ Each variable belongs to a single "set". If [[EXAMPLE]] has one meaning outside a definition and another insider, that's two variables with a common name, not one variable belonging to two sets at once. 
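To illustrate that scoping rule, here is a rough sketch (not part of the module source) using this section's functions; the set constructor's argument is assumed to be the enclosing set, as in the calls made elsewhere in this file:

	preprocessor_variable_set *outer = Preprocessor::new_variable_set(NULL);
	preprocessor_variable_set *inner = Preprocessor::new_variable_set(outer);
	Preprocessor::write_variable(
		Preprocessor::ensure_variable(I"EXAMPLE", outer), I"defined outside");
	Preprocessor::write_variable(
		Preprocessor::ensure_variable(I"EXAMPLE", inner), I"defined inside");
	/* Looking EXAMPLE up against the inner set finds "defined inside"; the
	   variable in the outer set is a different variable which happens to
	   share its name. */
	preprocessor_variable *var = Preprocessor::find_variable(I"EXAMPLE", inner);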
-=
+<<*>>=
 typedef struct preprocessor_variable_set {
-	struct linked_list *variables; /* of |preprocessor_variable| */
+	struct linked_list *variables; /* of [[preprocessor_variable]] */
 	struct preprocessor_variable_set *outer;
 	CLASS_DEFINITION
 } preprocessor_variable_set;
@@ -524,7 +526,7 @@ preprocessor_variable *Preprocessor::find_variable(text_stream *name,
 @ This creates a variable if it doesn't already exist in the given set. (If it
 exists in some outer set, that doesn't count.)
 
-=
+<<*>>=
 preprocessor_variable *Preprocessor::ensure_variable(text_stream *name,
 	preprocessor_variable_set *in_set) {
 	if (in_set == NULL) internal_error("variable without set");
@@ -538,29 +540,30 @@ preprocessor_variable *Preprocessor::ensure_variable(text_stream *name,
 	return var;
 }
 
-@h Macros.
+@ \section{Macros.}
 For the most part, each macro seen by users corresponds to a single
 //preprocessor_macro//, but loop constructs are an exception. When the user
-types |{repeat ...}|, this is a reference to |repeat-block| if the body of
-what to repeat occupies multiple lines, but to |repeat-span| if only one.
+types [[{repeat ...}]], this is a reference to [[repeat-block]] if the body of
+what to repeat occupies multiple lines, but to [[repeat-span]] if only one.
+
+For example, the first [[repeat]] loop here uses the macros [[repeat-block]] and
+[[end-repeat-block]], and the second uses [[repeat-span]] and [[end-repeat-span]].
 
-For example, the first |repeat| loop here uses the macros |repeat-block| and
-|end-repeat-block|, and the second uses |repeat-span| and |end-repeat-span|.
-= (text)
 	{repeat with SEA in Black, Caspian}
 	Welcome to the SEA Sea.
 	{end-repeat}
 	...
 	Seas available:{repeat with SEA in Sargasso, Libyan} {SEA} Sea;{end-repeat}
-=
+
 @ There are (for now, anyway) hard but harmlessly large limits on the number of
 parameters and the length of a macro:
 
-@d MAX_PP_MACRO_PARAMETERS 8
-@d MAX_PP_MACRO_LINES 128
+<<*>>=
+#define MAX_PP_MACRO_PARAMETERS 8
+#define MAX_PP_MACRO_LINES 128
 
-=
+<<*>>=
 typedef struct preprocessor_macro {
 	/* syntax */
 	struct text_stream *identifier;
@@ -573,10 +576,10 @@ typedef struct preprocessor_macro {
 	void (*expander)(struct preprocessor_macro *, struct preprocessor_state *,
 		struct text_stream **, struct preprocessor_loop *, struct text_file_position *);
 	/* loop construct if any */
-	int begins_loop; /* |TRUE| for e.g. |repeat-block| or |repeat-span| */
-	int ends_loop; /* |TRUE| for e.g. |end-repeat-block| */
-	struct text_stream *loop_name; /* e.g. |repeat| */
-	int span; /* |TRUE| for e.g. |end-repeat-span| or |repeat-span| */
+	int begins_loop; /* [[TRUE]] for e.g. [[repeat-block]] or [[repeat-span]] */
+	int ends_loop; /* [[TRUE]] for e.g. [[end-repeat-block]] */
+	struct text_stream *loop_name; /* e.g. [[repeat]] */
+	int span; /* [[TRUE]] for e.g. [[end-repeat-span]] or [[repeat-span]] */
 
 	/* textual behaviour */
 	int suppress_newline_after_expanding;
@@ -592,16 +595,16 @@ typedef struct preprocessor_macro_parameter {
 	CLASS_DEFINITION
 } preprocessor_macro_parameter;
 
-@ The following creates a new macro and adds it to the list |L|. By default, it
+@ The following creates a new macro and adds it to the list [[L]]. By default, it
 has an empty definition (i.e., no lines), but may have a meaning provided by its
-|expander| function regardless. The |parameter_specification| is as in the
+[[expander]] function regardless. 
The [[parameter_specification]] is as in the +textual declaration: for example, [[in: IN ?towards: WAY]] would be valid, with +[[in]] being compulsory and [[towards]] optional when the macro is used. If we expected 10000 macros, a dictionary would be better than a list. But in fact we expect more like 10. -= +<<*>>= preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name, text_stream *parameter_specification, void (*expander)(preprocessor_macro *, preprocessor_state *, text_stream **, preprocessor_loop *, text_file_position *), @@ -609,13 +612,13 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name, if (Preprocessor::find_macro(L, name)) Errors::in_text_file("a macro with this name already exists", tfp); preprocessor_macro *new_macro = CREATE(preprocessor_macro); - @; - @; + <>; + <>; ADD_TO_LINKED_LIST(new_macro, preprocessor_macro, L); return new_macro; } -@ = +<>= new_macro->identifier = Str::duplicate(name); new_macro->no_parameters = 0; @@ -629,7 +632,7 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name, new_macro->suppress_newline_after_expanding = TRUE; new_macro->suppress_whitespace_when_expanding = TRUE; -@ = +<>= text_stream *spec = Str::duplicate(parameter_specification); match_results mr = Regexp::create_mr(); while (Regexp::match(&mr, spec, L" *(%C+): *(%C+) *(%c*)")) { @@ -640,14 +643,14 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name, if (new_macro->no_parameters >= MAX_PP_MACRO_PARAMETERS) { Errors::in_text_file("too many parameters in this definition", tfp); } else { - @; + <>; } } Regexp::dispose_of(&mr); if (Str::is_whitespace(spec) == FALSE) Errors::in_text_file("parameter list for this definition is malformed", tfp); -@ = +<>= preprocessor_macro_parameter *new_parameter = CREATE(preprocessor_macro_parameter); new_parameter->name = Str::duplicate(par_name); new_parameter->definition_token = Str::duplicate(token_name); @@ -661,7 +664,7 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name, @ We can then add lines to a macro (though this will only have an effect if its expander function is //Preprocessor::default_expander//). -= +<<*>>= void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line, text_file_position *tfp) { if (mm->no_lines >= MAX_PP_MACRO_LINES) { @@ -671,15 +674,15 @@ void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line, } } -@h Reserved macros. +@ \section{Reserved macros.} A few macros are "reserved", that is, have built-in meanings, and use expander functions other than //Preprocessor::default_expander//. -Some of these, the |special_macros|, are supplied by the code calling the +Some of these, the [[special_macros]], are supplied by the code calling the preprocessor. Those will provide domain-specific functionality. 
But a few are built in here and therefore work in every domain: -= +<<*>>= linked_list *Preprocessor::list_of_reserved_macros(linked_list *special_macros) { linked_list *L = NEW_LINKED_LIST(preprocessor_macro); Preprocessor::new_loop_macro(L, I"repeat", I"with: WITH in: IN", @@ -736,7 +739,7 @@ void Preprocessor::new_loop_macro(linked_list *L, text_stream *name, @ Finding a macro in a list: -= +<<*>>= preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name) { preprocessor_macro *mm; LOOP_OVER_LINKED_LIST(mm, preprocessor_macro, L) @@ -745,13 +748,13 @@ preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name) return NULL; } -@h The expander for user-defined macros. -All macros created by |{define: ...}| are expanded by the following function. +@ \section{The expander for user-defined macros.} +All macros created by [[{define: ...}]] are expanded by the following function. It creates a local "stack frame" making the parameters available as variables, then runs the definition lines through the scanner, then dismantles the stack frame again. -= +<<*>>= void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state *PPS, text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { PPS->stack_frame = Preprocessor::new_variable_set(PPS->stack_frame); @@ -765,10 +768,10 @@ void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state * PPS->stack_frame = PPS->stack_frame->outer; } -@h The set expander. +@ \section{The set expander.} An easy one. -= +<<*>>= void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS, text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { text_stream *name = parameter_values[0]; @@ -781,9 +784,9 @@ void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS, Preprocessor::write_variable(var, value); } -@h The repeat expander. +@ \section{The repeat expander.} -= +<<*>>= void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *PPS, text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { text_stream *with = parameter_values[0]; @@ -803,7 +806,7 @@ void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *P Preprocessor::add_loop_iteration(loop, value); } -@h The expander used for all loop ends. +@ \section{The expander used for all loop ends.} The macros which open a loop just store up the name of the variable and the range of its values: otherwise, they do nothing. It's only when the end of a loop is reached that any expansion happens, and this is where. @@ -812,7 +815,7 @@ We create a new stack frame inside the current one, and put the loop variable into it. Then we run through the iteration values, setting the variable to each in turn, and expand the material. 
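For instance, the block-mode [[repeat]] shown earlier, iterating [[SEA]] over Black and Caspian, would at this point expand to something like:

	Welcome to the Black Sea.
	Welcome to the Caspian Sea.

while the span-mode example would produce "Seas available: Sargasso Sea; Libyan Sea;" on a single line. (This is only an illustrative sketch of the intended output, not text taken from the source.)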
-= +<<*>>= void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state *PPS, text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { PPS->shadow_sp = 0; @@ -826,12 +829,12 @@ void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state Preprocessor::ensure_variable(loop->loop_var_name, PPS->stack_frame); text_stream *value; LOOP_OVER_LINKED_LIST(value, text_stream, loop->iterations) - @; + <>; PPS->stack_frame = PPS->stack_frame->outer; } } -@ = +<>= Preprocessor::write_variable(loop_var, value); if (mm->span) { Preprocessor::expand(matter, tfp, PPS); diff --git a/foundation-module/Chapter_4/String Manipulation.w b/foundation-module/Chapter_4/String_Manipulation.nw similarity index 91% rename from foundation-module/Chapter_4/String Manipulation.w rename to foundation-module/Chapter_4/String_Manipulation.nw index 6a48b71..7ab3347 100644 --- a/foundation-module/Chapter_4/String Manipulation.w +++ b/foundation-module/Chapter_4/String_Manipulation.nw @@ -2,10 +2,10 @@ Convenient routines for manipulating strings of text. -@h Strings are streams. +@ \section{Strings are streams.} Although Foundation provides limited facilities for handling standard or -wide C-style strings -- that is, null-terminated arrays of |char| or -|wchar_t| -- these are not encouraged. +wide C-style strings -- that is, null-terminated arrays of [[char]] or +[[wchar_t]] -- these are not encouraged. Instead, a standard string for a program using Foundation is nothing more than a text stream (see Chapter 2). These are unbounded in size, with memory @@ -22,19 +22,19 @@ against buffer overruns. The present section of code provides convenient routines for creating, duplicating, modifying and examining such strings. -@h New strings. +@ \section{New strings.} Sometimes we want to make a new string in the sense of allocating more memory to hold it. These objects won't automatically be destroyed, so we shouldn't call these routines too casually. If we need a string just for some space to play with for a short while, it's better to create one -with |TEMPORARY_TEXT| and then get rid of it with |DISCARD_TEXT|, macros +with [[TEMPORARY_TEXT]] and then get rid of it with [[DISCARD_TEXT]], macros defined in Chapter 2. The capacity of these strings is unlimited in principle, and the number here is just the size of the initial memory block, which is fastest to access. -= +<<*>>= text_stream *Str::new(void) { return Str::new_with_capacity(32); } @@ -50,10 +50,10 @@ void Str::dispose_of(text_stream *text) { } @ Duplication of an existing string is complicated only by the issue that -we want the duplicate always to be writeable, so that |NULL| can't be -duplicated as |NULL|. +we want the duplicate always to be writeable, so that [[NULL]] can't be +duplicated as [[NULL]]. -= +<<*>>= text_stream *Str::duplicate(text_stream *E) { if (E == NULL) return Str::new(); text_stream *S = CREATE(text_stream); @@ -64,12 +64,12 @@ text_stream *Str::duplicate(text_stream *E) { return NULL; } -@h Converting from C strings. +@ \section{Converting from C strings.} Here we open text streams initially equal to the given C strings, and with the capacity of the initial block large enough to hold the whole thing plus a little extra, for efficiency's sake. 
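For instance (a sketch, not taken from the library source), a stream opened from a wide literal can then be written to like any other stream:

	text_stream *T = Str::new_from_wide_string(L"gazpacho");
	WRITE_TO(T, " soup");   /* appends: the stream now reads "gazpacho soup" */
	int n = Str::len(T);    /* 13 */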
-= +<<*>>= text_stream *Str::new_from_wide_string(const wchar_t *C_string) { text_stream *S = CREATE(text_stream); if (Streams::open_from_wide_string(S, C_string)) return S; @@ -96,7 +96,7 @@ text_stream *Str::new_from_locale_string(const char *C_string) { @ And sometimes we want to use an existing stream object: -= +<<*>>= text_stream *Str::from_wide_string(text_stream *S, wchar_t *c_string) { if (Streams::open_from_wide_string(S, c_string) == FALSE) return NULL; return S; @@ -107,9 +107,9 @@ text_stream *Str::from_locale_string(text_stream *S, char *c_string) { return S; } -@h Converting to C strings. +@ \section{Converting to C strings.} -= +<<*>>= void Str::copy_to_ISO_string(char *C_string, text_stream *S, int buffer_size) { Streams::write_as_ISO_string(C_string, S, buffer_size); } @@ -126,9 +126,9 @@ void Str::copy_to_locale_string(char *C_string, text_stream *S, int buffer_size) Streams::write_as_locale_string(C_string, S, buffer_size); } -@h Converting to integers. +@ \section{Converting to integers.} -= +<<*>>= int Str::atoi(text_stream *S, int index) { char buffer[32]; int i = 0; @@ -139,15 +139,15 @@ int Str::atoi(text_stream *S, int index) { return atoi(buffer); } -@h Length. -A puritan would return a |size_t| here, but I am not a puritan. +@ \section{Length.} +A puritan would return a [[size_t]] here, but I am not a puritan. -= +<<*>>= int Str::len(text_stream *S) { return Streams::get_position(S); } -@h Position markers. +@ \section{Position markers.} A position marker is a lightweight way to refer to a particular position in a given string. Position 0 is before the first character; if, for example, the string contains the word "gazpacho", then position 8 represents @@ -155,7 +155,7 @@ the end of the string, after the "o". Negative positions are not allowed, but positive ones well past the end of the string are legal. (Doing things at those positions may well not be, of course.) -= +<<*>>= typedef struct string_position { struct text_stream *S; int index; @@ -163,7 +163,7 @@ typedef struct string_position { @ You can then find a position in a given string thus: -= +<<*>>= string_position Str::start(text_stream *S) { string_position P; P.S = S; P.index = 0; return P; } @@ -180,7 +180,7 @@ string_position Str::end(text_stream *S) { @ And you can step forwards or backwards: -= +<<*>>= string_position Str::back(string_position P) { if (P.index > 0) P.index--; return P; } @@ -209,18 +209,19 @@ int Str::index(string_position P) { @ This leads to the following convenient loop macros: -@d LOOP_THROUGH_TEXT(P, ST) +<<*>>= +#define LOOP_THROUGH_TEXT(P, ST) for (string_position P = Str::start(ST); P.index < Str::len(P.S); P.index++) -@d LOOP_BACKWARDS_THROUGH_TEXT(P, ST) +#define LOOP_BACKWARDS_THROUGH_TEXT(P, ST) for (string_position P = Str::back(Str::end(ST)); P.index >= 0; P.index--) -@h Character operations. 
+@ \section{Character operations.}
 How to get at individual characters, then, now that we can refer to positions:
 
-=
+<<*>>=
 wchar_t Str::get(string_position P) {
-	if ((P.S == NULL) || (P.index < 0)) return 0;
+	if ((P.S == NULL) || (P.index < 0)) return 0;
 	return Streams::get_char_at_index(P.S, P.index);
 }
 
@@ -239,7 +240,7 @@ wchar_t Str::get_last_char(text_stream *S) {
 	return Str::get(Str::at(S, L-1));
 }
 
-@ =
+<<*>>=
 void Str::put(string_position P, wchar_t C) {
 	if (P.index < 0) internal_error("wrote before start of string");
 	if (P.S == NULL) internal_error("wrote to null stream");
@@ -256,9 +257,9 @@ void Str::put_at(text_stream *S, int index, wchar_t C) {
 	Str::put(Str::at(S, index), C);
 }
 
-@h Truncation.
+@ \section{Truncation.}
 
-=
+<<*>>=
 void Str::clear(text_stream *S) {
 	Str::truncate(S, 0);
 }
@@ -268,9 +269,9 @@ void Str::truncate(text_stream *S, int len) {
 	if (len < Str::len(S)) Str::put(Str::at(S, len), 0);
 }
 
-@h Indentation.
+@ \section{Indentation.}
 
-=
+<<*>>=
 int Str::remove_indentation(text_stream *S, int spaces_per_tab) {
 	int spaces_in = 0, tab_stops_of_indentation = 0;
 	while (Characters::is_space_or_tab(Str::get_first_char(S))) {
@@ -307,9 +308,9 @@ void Str::rectify_indentation(text_stream *S, int spaces_per_tab) {
 	DISCARD_TEXT(tail)
 }
 
-@h Copying.
+@ \section{Copying.}
 
-=
+<<*>>=
 void Str::concatenate(text_stream *S1, text_stream *S2) {
 	Streams::copy(S1, S2);
 }
@@ -330,7 +331,7 @@ void Str::copy_tail(text_stream *S1, text_stream *S2, int from) {
 
 @ A subtly different operation is to set a string equal to a given C string:
 
-=
+<<*>>=
 void Str::copy_ISO_string(text_stream *S, char *C_string) {
 	Str::clear(S);
 	Streams::write_ISO_string(S, C_string);
@@ -346,10 +347,10 @@ void Str::copy_wide_string(text_stream *S, wchar_t *C_string) {
 	Streams::write_wide_string(S, C_string);
 }
 
-@h Comparisons.
+@ \section{Comparisons.}
 We provide both case sensitive and insensitive versions.
 
-=
+<<*>>=
 int Str::eq(text_stream *S1, text_stream *S2) {
 	if (Str::cmp(S1, S2) == 0) return TRUE;
 	return FALSE;
 }
@@ -371,10 +372,10 @@ int Str::ne_insensitive(text_stream *S1, text_stream *S2) {
 }
 
 @ These two routines produce a numerical string difference suitable for
-alphabetic sorting, like |strlen| in the C standard library.
+alphabetic sorting, like [[strcmp]] in the C standard library.
 
 This would be a more elegant implementation:
-= (text as InC)
+
 	for (string_position P = Str::start(S1), Q = Str::start(S2);
 		(P.index < Str::len(S1)) && (Q.index < Str::len(S2));
 		P = Str::forward(P), Q = Str::forward(Q)) {
@@ -382,11 +383,11 @@ This would be a more elegant implementation:
 		if (d != 0) return d;
 	}
 	return Str::len(S1) - Str::len(S2);
-=
+
 But profiling shows that the following speeds up the Inform 7 compiler by
 around 1%.
 
-=
+<<*>>=
 int Str::cmp(text_stream *S1, text_stream *S2) {
 	int L1 = Str::len(S1), L2 = Str::len(S2), M = L1;
 	if (L2 < M) M = L2;
@@ -408,16 +409,16 @@ int Str::cmp_insensitive(text_stream *S1, text_stream *S2) {
 }
 
 @ It's sometimes useful to see whether two strings agree on their last
-|N| characters, or their first |N|. For example,
-= (text as code)
+[[N]] characters, or their first [[N]]. For example,
+
 	Str::suffix_eq(I"wayzgoose", I"snow goose", N)
-=
-will return |TRUE| for |N| equal to 0 to 5, and |FALSE| thereafter.
+
+will return [[TRUE]] for [[N]] equal to 0 to 5, and [[FALSE]] thereafter.
 
 (The Oxford English Dictionary defines a "wayzgoose" as a holiday outing
 for the staff of a publishing house.) 
-= +<<*>>= int Str::prefix_eq(text_stream *S1, text_stream *S2, int N) { int L1 = Str::len(S1), L2 = Str::len(S2); if ((N > L1) || (N > L2)) return FALSE; @@ -454,7 +455,7 @@ int Str::ends_with_wide_string(text_stream *S, wchar_t *suffix) { return TRUE; } -@ = +<<*>>= int Str::eq_wide_string(text_stream *S1, wchar_t *S2) { if (S2 == NULL) return (Str::len(S1) == 0)?TRUE:FALSE; if (Str::len(S1) == (int) wcslen(S2)) { @@ -481,9 +482,9 @@ int Str::ne_wide_string(text_stream *S1, wchar_t *S2) { return (Str::eq_wide_string(S1, S2)?FALSE:TRUE); } -@h White space. +@ \section{White space.} -= +<<*>>= int Str::is_whitespace(text_stream *S) { LOOP_THROUGH_TEXT(pos, S) if (Characters::is_space_or_tab(Str::get(pos)) == FALSE) @@ -493,7 +494,7 @@ int Str::is_whitespace(text_stream *S) { @ This removes spaces and tabs from both ends: -= +<<*>>= void Str::trim_white_space(text_stream *S) { int len = Str::len(S), i = 0, j = 0; string_position F = Str::start(S); @@ -545,9 +546,9 @@ int Str::trim_all_white_space_at_end(text_stream *S) { return shortened; } -@h Deleting characters. +@ \section{Deleting characters.} -= +<<*>>= void Str::delete_first_character(text_stream *S) { Str::delete_nth_character(S, 0); } @@ -572,9 +573,9 @@ void Str::delete_n_characters(text_stream *S, int n) { } } -@h Substrings. +@ \section{Substrings.} -= +<<*>>= void Str::substr(OUTPUT_STREAM, string_position from, string_position to) { if (from.S != to.S) internal_error("substr on two different strings"); for (int i = from.index; i < to.index; i++) @@ -632,29 +633,29 @@ int Str::includes_at(text_stream *line, int i, text_stream *pattern) { return TRUE; } -@h Shim for literal storage. +@ \section{Shim for literal storage.} This is where all of those I-literals created by Inweb are stored at run-time. -Note that every instance of, say, |I"fish"| would return the same string, -that is, the same |text_stream *| value. To prevent nasty accidents, this +Note that every instance of, say, [[I"fish"]] would return the same string, +that is, the same [[text_stream *]] value. To prevent nasty accidents, this is marked so that the stream value, "fish", cannot be modified at run-time. The dictionary look-up here would not be thread-safe, so it's protected by a mutex. There's no real performance concern because the following routine is run just once per I-literal in the source code, when the program starts up. -= +<<*>>= dictionary *string_literals_dictionary = NULL; text_stream *Str::literal(wchar_t *wide_C_string) { text_stream *answer = NULL; CREATE_MUTEX(mutex); LOCK_MUTEX(mutex); - @; + <>; UNLOCK_MUTEX(mutex); return answer; } -@ = +<>= if (string_literals_dictionary == NULL) string_literals_dictionary = Dictionaries::new(100, TRUE); answer = Dictionaries::get_text_literal(string_literals_dictionary, wide_C_string); diff --git a/foundation-module/Chapter_4/Text Files.w b/foundation-module/Chapter_4/Text_Files.nw similarity index 76% rename from foundation-module/Chapter_4/Text Files.w rename to foundation-module/Chapter_4/Text_Files.nw index 22dc6e6..70ec2fa 100755 --- a/foundation-module/Chapter_4/Text Files.w +++ b/foundation-module/Chapter_4/Text_Files.nw @@ -2,7 +2,7 @@ To read text files of whatever flavour, one line at a time. -@h Text files. +@ \section{Text files.} Foundation was written mainly to support command-line tools which, of their nature, deal with a lot of text files: source code of programs, configuration files, HTML, XML and so on. 
The main aim of this section is to provide a @@ -10,13 +10,13 @@ standard way to read in and iterate through lines of a text file. First, though, here is a perhaps clumsy but effective way to test if a file actually exists on disc at a given filename. Note that under the C standard, -it's entirely legal for |fopen| to behave more or less as it likes if asked to +it's entirely legal for [[fopen]] to behave more or less as it likes if asked to open a directory as a file; and on MacOS, it sometimes opens a directory exactly as if it were an empty text file. The safest way to ensure that a directory is -never confused with a file seems to be to try |opendir| on it, and the following +never confused with a file seems to be to try [[opendir]] on it, and the following does essentially that. -= +<<*>>= int TextFiles::exists(filename *F) { TEMPORARY_TEXT(pn) WRITE_TO(pn, "%f", F); @@ -32,10 +32,10 @@ int TextFiles::exists(filename *F) { return TRUE; } -@h Text file positions. +@ \section{Text file positions.} Here's how we record a position in a text file: -= +<<*>>= typedef struct text_file_position { struct filename *text_file_filename; FILE *handle_when_open; @@ -48,7 +48,7 @@ typedef struct text_file_position { @ For access: -= +<<*>>= int TextFiles::get_line_count(text_file_position *tfp) { if (tfp == NULL) return 0; return tfp->line_count; @@ -56,7 +56,7 @@ int TextFiles::get_line_count(text_file_position *tfp) { @ And this is for a real nowhere man: -= +<<*>>= text_file_position TextFiles::nowhere(void) { text_file_position tfp; tfp.text_file_filename = NULL; @@ -74,26 +74,26 @@ text_file_position TextFiles::at(filename *F, int line) { return tfp; } -@h Text file scanner. +@ \section{Text file scanner.} We read lines in, delimited by any of the standard line-ending characters, -and send them one at a time to a function called |iterator|. Throughout, -we preserve a pointer called |state| to some object being used by the +and send them one at a time to a function called [[iterator]]. Throughout, +we preserve a pointer called [[state]] to some object being used by the client. -= +<<*>>= int TextFiles::read(filename *F, int escape_oddities, char *message, int serious, void (iterator)(text_stream *, text_file_position *, void *), text_file_position *start_at, void *state) { text_file_position tfp; tfp.ufb = TextFiles::create_ufb(); - @; - @; - @; + <>; + <>; + <>; fclose(tfp.handle_when_open); return tfp.line_count; } -@ = +<>= tfp.handle_when_open = Filenames::fopen(F, "rb"); if (tfp.handle_when_open == NULL) { if (message == NULL) return 0; @@ -101,12 +101,12 @@ int TextFiles::read(filename *F, int escape_oddities, char *message, int serious else { Errors::with_file(message, F); return 0; } } -@ The ANSI definition of |ftell| and |fseek| says that, with text files, the +@ The ANSI definition of [[ftell]] and [[fseek]] says that, with text files, the only definite position value is 0 -- meaning the beginning of the file -- and -this is what we initialise |line_position| to. We must otherwise only write -values returned by |ftell| into this field. +this is what we initialise [[line_position]] to. We must otherwise only write +values returned by [[ftell]] into this field. -@ = +<>= if (start_at == NULL) { tfp.line_count = 1; tfp.line_position = 0; @@ -122,48 +122,48 @@ values returned by |ftell| into this field. tfp.actively_scanning = TRUE; tfp.text_file_filename = F; -@ We aim to get this right whether the lines are terminated by |0A|, |0D|, -|0A 0D| or |0D 0A|. 
The final line is not required to be terminated.
 
-@ =
+<>=
 	TEMPORARY_TEXT(line)
 	int i = 0, c = ' ';
 	while ((c != EOF) && (tfp.actively_scanning)) {
 		c = TextFiles::utf8_fgetc(tfp.handle_when_open, NULL, escape_oddities, &tfp.ufb);
-		if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) {
+		if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) {
 			Str::put_at(line, i, 0);
-			if ((i > 0) || (c != tfp.skip_terminator)) {
-				@;
+			if ((i > 0) || (c != tfp.skip_terminator)) {
+				<>;
 				if (c == '\x0a') tfp.skip_terminator = '\x0d';
 				if (c == '\x0d') tfp.skip_terminator = '\x0a';
 			} else tfp.skip_terminator = 'X';
-			@;
+			<>;
 			i = 0;
 		} else {
 			Str::put_at(line, i++, (wchar_t) c);
 		}
 	}
 	if ((i > 0) && (tfp.actively_scanning))
-		@;
+		<>;
 	DISCARD_TEXT(line)
 
 @ We update the line counter only when a line is actually sent:
 
-@ =
+<>=
 	iterator(line, &tfp, state);
 	tfp.line_count++;
 
 @ But we update the text file position after every apparent line terminator.
 This is because we might otherwise, on a Windows text file, end up with an
-|ftell| position in between the |CR| and the |LF|; if we resume at that point,
+[[ftell]] position in between the [[CR]] and the [[LF]]; if we resume at that point,
 later on, we'll then have an off-by-one error in the line numbering in the
 resumption as compared to during the original pass.
 
-Properly speaking, |ftell| returns a long |int|, not an |int|, but on a
+Properly speaking, [[ftell]] returns a long [[int]], not an [[int]], but on a
 32-bit-or-more integer machine, this gives us room for files to run to 2GB.
 Text files seldom come that large.
 
-@ =
+<>=
 	tfp.line_position = (int) (ftell(tfp.handle_when_open));
 	if (tfp.line_position == -1) {
 		if (serious)
@@ -172,15 +172,15 @@ Text files seldom come that large.
 		Errors::with_file("unable to determine position in file", F);
 	}
 
-@ =
+<<*>>=
 void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position *tfp) {
 	Str::clear(OUT);
 	int i = 0, c = ' ';
 	while ((c != EOF) && (tfp->actively_scanning)) {
 		c = TextFiles::utf8_fgetc(tfp->handle_when_open, NULL, escape_oddities, &tfp->ufb);
-		if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) {
+		if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) {
 			Str::put_at(OUT, i, 0);
-			if ((i > 0) || (c != tfp->skip_terminator)) {
+			if ((i > 0) || (c != tfp->skip_terminator)) {
 				if (c == '\x0a') tfp->skip_terminator = '\x0d';
 				if (c == '\x0d') tfp->skip_terminator = '\x0a';
 			} else tfp->skip_terminator = 'X';
@@ -196,35 +196,35 @@ void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position
 
 @ The routine being iterated can indicate that it has had enough by calling
 the following:
 
-=
+<<*>>=
 void TextFiles::lose_interest(text_file_position *tfp) {
 	tfp->actively_scanning = FALSE;
 }
 
-@h Reading UTF-8 files.
+@ \section{Reading UTF-8 files.}
 The following routine reads a sequence of Unicode characters from a UTF-8
 encoded file, but returns them as a sequence of ISO Latin-1 characters, a
 trick it can only pull off by escaping non-ISO characters. This is done by
-taking character number |N| and feeding it out, one character at a time, as
-the text |[unicode N]|, writing the number in decimal. Only one UTF-8
+taking character number [[N]] and feeding it out, one character at a time, as
+the text [[[unicode N]]], writing the number in decimal. 
Only one UTF-8
 file like this will be being read at a time, and the routine will be
-repeatedly called until |EOF| or a line division.
+repeatedly called until [[EOF]] or a line division.
 
 Strictly speaking, we transmit not as ISO Latin-1 but as that subset of ISO
 which have corresponding (different) codes in the ZSCII character set. This
 excludes some typewriter symbols and a handful of letterforms, as we shall see.
 
-There are two exceptions: |TextFiles::utf8_fgetc| can also return the usual C
-end-of-file pseudo-character |EOF|, and it can also return the Unicode BOM
+There are two exceptions: [[TextFiles::utf8_fgetc]] can also return the usual C
+end-of-file pseudo-character [[EOF]], and it can also return the Unicode BOM
 (byte-ordering marker) pseudo-character, which is legal at the start of a
 file and which is automatically prepended by some text editors and
 word-processors when they save a UTF-8 file (though in fact it is not
-required by the UTF-8 specification). Anyone calling |TextFiles::utf8_fgetc| must
-check the return value for |EOF| every time, and for |0xFEFF| every time we
+required by the UTF-8 specification). Anyone calling [[TextFiles::utf8_fgetc]] must
+check the return value for [[EOF]] every time, and for [[0xFEFF]] every time we
 might be at the start of the file being read.
 
-=
+<<*>>=
 typedef struct unicode_file_buffer {
 	char unicode_feed_buffer[32]; /* holds a single escape such as "[unicode 3106]" */
 	int ufb_counter; /* position in the unicode feed buffer */
@@ -247,9 +247,9 @@ int TextFiles::utf8_fgetc(FILE *from, const char **or_from, int escape_oddities,
 	if (c == EOF) return c; /* ruling out EOF leaves a genuine byte from the file */
 	if (c<0x80) return c; /* in all other cases, a UTF-8 continuation sequence begins */
-	@;
-	@;
-	if (escape_oddities) @;
+	<>;
+	<>;
+	if (escape_oddities) <>;
 	if (c == 0xFEFF) return c; /* the Unicode BOM non-character */
 
@@ -268,7 +268,7 @@
 fatal error (which is pretty well the only alternative here). The user is
 likely to see problem messages later on which arise from the question marks,
 and that will have to do.
 
-@ =
+<>=
 	if (c<0xC0) return '?'; /* malformed UTF-8 */
 	if (c<0xE0) { c = c & 0x1f; conts = 1; }
 	else if (c<0xF0) { c = c & 0xf; conts = 2; }
@@ -293,12 +293,12 @@
 fiction offerings. Had they been collaborating with J. R. R. Tolkien rather
 than Douglas Adams, they might have filled this gap. As it was, "eth" never
 occurred in any of their works.)
 
-We let the multiplication sign |0xd7| through even though ZSCII doesn't
+We let the multiplication sign [[0xd7]] through even though ZSCII doesn't
 support it, but convert it to an "x": this is so that we can parse numbers
 in scientific notation.
 
-@ =
-	if ((c == 0xa1) || (c == 0xa3) || (c == 0xbf)) return c; /* pound sign, inverted ! and ? */
+<>=
+	if ((c == 0xa1) || (c == 0xa3) || (c == 0xbf)) return c; /* pound sign, inverted ! and ? */
 	if (c == 0xd7) return 'x'; /* convert multiplication sign to lower case "x" */
 	if ((c >= 0xc0) && (c <= 0xff)) { /* accented West European letters, but... */
 		if ((c != 0xd0) && (c != 0xf0) && /* not Icelandic eths */
@@ -312,7 +312,7 @@
 where we would normally expect hyphens and ordinary spaces: this is intended
 for the benefit of users with helpful word-processors which autocorrect
 hyphens into em-rules when they are flanked by spaces, and so on. 
-@ = +<>= if (c == 0x85) return '\x0d'; /* NEL, or "next line" */ if (c == 0xa0) return ' '; /* non-breaking space */ if ((c >= 0x2000) && (c <= 0x200a)) return ' '; /* space variants */ diff --git a/foundation-module/Chapter_4/Tries and Avinues.w b/foundation-module/Chapter_4/Tries_and_Avinues.nw similarity index 75% rename from foundation-module/Chapter_4/Tries and Avinues.w rename to foundation-module/Chapter_4/Tries_and_Avinues.nw index fd7cc8a..daa2478 100644 --- a/foundation-module/Chapter_4/Tries and Avinues.w +++ b/foundation-module/Chapter_4/Tries_and_Avinues.nw @@ -2,7 +2,7 @@ To examine heads and tails of text, to see how it may inflect. -@h Tries. +@ \section{Tries.} The standard data structure for searches through possible prefixes or suffixes is a "trie". The term goes back to Edward Fredkin in 1961; some pronounce it "try" and some "tree", and either would be a fair @@ -17,28 +17,29 @@ front of a text, whereas an end head represents matching from the back. (b) "Choices". A choice node has a given match character, say an "f", and represents which node to go to next if this is the current character in the -text. It must either be a valid Unicode character or |TRIE_ANYTHING|, which +text. It must either be a valid Unicode character or [[TRIE_ANYTHING]], which is a wildcard representing "any text of any length here". Since a choice -must always lead somewhere, |on_success| must point to another node. +must always lead somewhere, [[on_success]] must point to another node. There can be any number of choices at a given position, so choice nodes -are always organised in linked lists joined by |next|. +are always organised in linked lists joined by [[next]]. (c) "Terminals", always leaves, which have match character set to the -impossible value |TRIE_STOP|, and for which |match_outcome| is non-null; thus, +impossible value [[TRIE_STOP]], and for which [[match_outcome]] is non-null; thus, different terminal nodes can result in different outcomes if they are ever reached at the end of a successful scan. A terminal node is always the only item in a list. 
-@d TRIE_START -1 /* head: the root of a trie parsing forwards from the start */ -@d TRIE_END -2 /* head: the root of a trie parsing backwards from the end */ -@d TRIE_ANYTHING 10003 /* choice: match any text here */ -@d TRIE_ANY_GROUP 10001 /* choice: match any character from this group */ -@d TRIE_NOT_GROUP 10002 /* choice: match any character not in this group */ -@d TRIE_STOP -3 /* terminal: here's the outcome */ +<<*>>= +#define TRIE_START -1 /* head: the root of a trie parsing forwards from the start */ +#define TRIE_END -2 /* head: the root of a trie parsing backwards from the end */ +#define TRIE_ANYTHING 10003 /* choice: match any text here */ +#define TRIE_ANY_GROUP 10001 /* choice: match any character from this group */ +#define TRIE_NOT_GROUP 10002 /* choice: match any character not in this group */ +#define TRIE_STOP -3 /* terminal: here's the outcome */ -@d MAX_TRIE_GROUP_SIZE 26 /* size of the allowable groups of characters */ +#define MAX_TRIE_GROUP_SIZE 26 /* size of the allowable groups of characters */ -= +<<*>>= typedef struct match_trie { int match_character; /* or one of the special cases above */ wchar_t group_characters[MAX_TRIE_GROUP_SIZE+1]; @@ -48,40 +49,41 @@ typedef struct match_trie { } match_trie; @ We have just one routine for extending and scanning the trie: it either -tries to find whether a text |p| leads to any outcome in the existing trie, +tries to find whether a text [[p]] leads to any outcome in the existing trie, or else forcibly extends the existing trie to ensure that it does. -It might look as if calling |Tries::search| always returns |add_outcome| when +It might look as if calling [[Tries::search]] always returns [[add_outcome]] when this is set, but this isn't true: if the trie already contains a node -representing how to deal with |p|, we get whatever outcome is already +representing how to deal with [[p]], we get whatever outcome is already established. -There are two motions to keep track of: our progress through the text |p| +There are two motions to keep track of: our progress through the text [[p]] being scanned, and our progress through the trie which tells us how to scan it. We scan the text either forwards or backwards, starting with the first or last character and then working through, finishing with a 0 terminator. (This is true even if working backwards: we pretend the character stored -before the text began is 0.) |i| represents the index of our current position -in |p|, and runs either from 0 up to |N| or from |N-1| down to |-1|, -where |N| is the number of characters in |p|. +before the text began is 0.) [[i]] represents the index of our current position +in [[p]], and runs either from 0 up to [[N]] or from [[N-1]] down to [[-1]], +where [[N]] is the number of characters in [[p]]. -We scan the trie using a pair of pointers. |prev| is the last node we -successfully left, and |pos| is one we are currently at, which can be +We scan the trie using a pair of pointers. [[prev]] is the last node we +successfully left, and [[pos]] is one we are currently at, which can be either a terminal node or a choice node (in which case it's the head of a linked list of such nodes). 
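Before looking at the implementation, here is a rough usage sketch (not taken from the source): the same function both builds and queries the trie, depending on whether an outcome is supplied.

	match_trie *T = Tries::new(TRIE_END);               /* a head node scanning from the end */
	Tries::search(T, I"spoons", L"plural");             /* extends the trie, storing an outcome */
	Tries::search(T, I"spoon", L"singular");            /* likewise */
	wchar_t *out = Tries::search(T, I"spoons", NULL);   /* finds L"plural" */
	wchar_t *none = Tries::search(T, I"fork", NULL);    /* NULL: no outcome stored for this text */

In practice the avinue functions defined later in this section wrap exactly these calls.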
-@d MAX_TRIE_REWIND 10 /* that should be far, far more rewinding than necessary */
+<<*>>=
+#define MAX_TRIE_REWIND 10 /* that should be far, far more rewinding than necessary */
 
-=
+<<*>>=
 wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
 	if (T == NULL) internal_error("no trie to search");
 	int start, endpoint, delta;
-	@;
+	<>;
 	match_trie *prev = NULL, *pos = T;
-	@;
+	<>;
 
 	int rewind_sp = 0;
 	int rewind_points[MAX_TRIE_REWIND];
@@ -113,7 +115,7 @@ wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
 		if (c == '*') endpoint -= delta;
 
 		RewindHere:
-		@;
+		<>;
 		if (add_outcome == NULL) {
 			if (rewind_sp > 0) {
 				i = rewind_points[rewind_sp-1];
@@ -124,19 +126,19 @@ wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
 			}
 			return NULL; /* failure! */
 		}
-		@;
+		<>;
 	}
 
-	if ((pos) && (pos->match_character == TRIE_ANYTHING)) @;
+	if ((pos) && (pos->match_character == TRIE_ANYTHING)) <>;
 	if ((pos) && (pos->match_outcome)) return pos->match_outcome; /* success! */
 	if (add_outcome == NULL) return NULL; /* failure! */
 
 	if (pos == NULL)
-		@
+		<>
 	else
-		@;
+		<>;
 }
 
-@ =
+<>=
 	start = 0; endpoint = Str::len(p); delta = 1;
 	if (T->match_character == TRIE_END) { start = Str::len(p)-1; endpoint = -1; delta = -1; }
 
@@ -146,10 +148,10 @@ this tends to make commonly used exits migrate upwards and rarities downwards.
 But we aren't going to search these tries anything like intensively enough
 to make it worth the trouble.
 
-(The following cannot be a |while| loop since C does not allow us to |break|
-or |continue| out of an outer loop from an inner one.)
+(The following cannot be a [[while]] loop since C does not allow us to [[break]]
+or [[continue]] out of an outer loop from an inner one.)
 
-@ =
+<>=
 	int ambig = 0, unambig = 0;
 	match_trie *point;
 	for (point = pos; point; point = point->next)
@@ -158,7 +160,7 @@ or |continue| out of an outer loop from an inner one.)
 
 	FauxWhileLoop:
 	if (pos) {
-		if ((add_outcome == NULL) || (Tries::is_ambiguous(pos) == FALSE))
+		if ((add_outcome == NULL) || (Tries::is_ambiguous(pos) == FALSE))
 			if (Tries::matches(pos, c)) {
 				if (pos->match_character == TRIE_ANYTHING) break;
 				if ((add_outcome == NULL) && (ambig > 0) && (ambig+unambig > 1)
@@ -168,14 +170,14 @@ or |continue| out of an outer loop from an inner one.)
 					rewind_prev_positions[rewind_sp] = prev;
 					rewind_sp++;
 				}
-				@;
+				<>;
 				continue;
 			}
 		pos = pos->next;
 		goto FauxWhileLoop;
 	}
 
-@ =
+<>=
 	match_trie *new_pos = NULL;
 	if (g > 0) {
 		int nt = TRIE_ANY_GROUP;
@@ -211,29 +213,29 @@ or |continue| out of an outer loop from an inner one.)
 	}
 	pos = new_pos;
 
-	@; continue;
+	<>; continue;
 
-@ =
+<>=
 	if (pos == NULL) internal_error("trie invariant broken");
 	prev = pos; pos = prev->on_success;
 
-@ If |pos| is |NULL| then it follows that |prev->on_success| is |NULL|, since
-this is how |pos| was calculated; so to add a new terminal node we simply add
+@ If [[pos]] is [[NULL]] then it follows that [[prev->on_success]] is [[NULL]], since
+this is how [[pos]] was calculated; so to add a new terminal node we simply add
 it there. 
-@ =
+<>=
 	prev->on_success = Tries::new(TRIE_STOP);
 	prev->on_success->match_outcome = add_outcome;
 	return add_outcome;
 
-@ =
+<>=
 	prev->on_success = Tries::new(TRIE_STOP);
 	prev->on_success->match_outcome = add_outcome;
 	return add_outcome;
 
 @ Single nodes are matched thus:
 
-=
+<<*>>=
 int Tries::matches(match_trie *pos, int c) {
 	if (pos->match_character == TRIE_ANYTHING) return TRUE;
 	if (pos->match_character == TRIE_ANY_GROUP) {
@@ -263,7 +265,7 @@ int Tries::is_ambiguous(match_trie *pos) {
 
 @ Where:
 
-=
+<<*>>=
 match_trie *Tries::new(int mc) {
 	match_trie *T = CREATE(match_trie);
 	T->match_character = mc;
@@ -273,14 +275,14 @@ match_trie *Tries::new(int mc) {
 	return T;
 }
 
-@h Avinues.
+@ \section{Avinues.}
 A trie is only a limited form of finite state machine. We're not going to
 need the whole power of these, but we do find it useful to chain a series
 of tries together. The idea is to scan against one trie, then, if there's
 no result, start again with the next, and so on. Inform therefore often
 matches text against a linked list of tries: we'll call that an "avinue".
 
-=
+<<*>>=
 typedef struct match_avinue {
 	struct match_trie *the_trie;
 	struct match_avinue *next;
@@ -289,7 +291,7 @@ typedef struct match_avinue {
 @ An avinue starts out with a single trie, which itself has just a single
 head node (of either sort).
 
-=
+<<*>>=
 match_avinue *Tries::new_avinue(int from_start) {
 	match_avinue *A = CREATE(match_avinue);
 	A->next = NULL;
@@ -298,14 +300,14 @@ match_avinue *Tries::new_avinue(int from_start) {
 }
 
 void Tries::add_to_avinue(match_avinue *mt, text_stream *from, wchar_t *to) {
-	if ((mt == NULL) || (mt->the_trie == NULL)) internal_error("null trie");
+	if ((mt == NULL) || (mt->the_trie == NULL)) internal_error("null trie");
 	Tries::search(mt->the_trie, from, to);
 }
 
 @ The following duplicates an avinue, pointing to the same sequence of tries.
 
-=
+<<*>>=
 match_avinue *Tries::duplicate_avinue(match_avinue *A) {
 	match_avinue *F = NULL, *FL = NULL;
 	while (A) {
@@ -323,7 +325,7 @@ match_avinue *Tries::duplicate_avinue(match_avinue *A) {
 @ As noted above, searching an avinue is a matter of searching with each trie
 in turn until one matches (if it does).
 
-=
+<<*>>=
 wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
 	wchar_t *result = NULL;
 	while ((T) && (result == NULL)) {
@@ -333,9 +335,9 @@ wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
 	return result;
 }
 
-@h Logging.
+@ \section{Logging.}
 
-=
+<<*>>=
 void Tries::log_avinue(OUTPUT_STREAM, void *vA) {
 	match_avinue *A = (match_avinue *) vA;
 	WRITE("Avinue:\n"); INDENT;
 diff --git a/foundation-module/Chapter_4/Wide Strings.w b/foundation-module/Chapter_4/Wide_Strings.nw
 similarity index 77%
 rename from foundation-module/Chapter_4/Wide Strings.w
 rename to foundation-module/Chapter_4/Wide_Strings.nw
 index 40db2e8..f99dbf3 100644
--- a/foundation-module/Chapter_4/Wide Strings.w
+++ b/foundation-module/Chapter_4/Wide_Strings.nw
@@ -2,27 +2,27 @@
 
 A minimal library for handling wide C strings.
 
-@ By "wide string", we mean an array of |wchar_t|. A pointer to this type
-is what is returned by an L-literal in ANSI C, such as |L"look, I'm wide"|.
+@ By "wide string", we mean an array of [[wchar_t]]. A pointer to this type
+is what is returned by an L-literal in ANSI C, such as [[L"look, I'm wide"]].
 
 A wide string is essentially a C string but with characters stored in
 full words instead of bytes. The character values should be Unicode
 code points. 
We will do as little as possible with wide strings, and the following wrappers simply abstract the standard C library's handling. -= +<<*>>= int Wide::len(wchar_t *p) { return (int) wcslen(p); } @ On the rare occasions when we need to sort alphabetically we'll also call: -= +<<*>>= int Wide::cmp(wchar_t *A, wchar_t *B) { return wcscmp(A, B); } -@ = +<<*>>= int Wide::atoi(wchar_t *p) { return (int) wcstol(p, NULL, 10); }
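A brief sketch of the three wrappers in use (illustrative only, not part of the source):

	wchar_t *W = L"2024";
	int l = Wide::len(W);           /* 4 */
	int n = Wide::atoi(W);          /* 2024 */
	int c = Wide::cmp(W, L"2024");  /* 0, since the strings are equal */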