foundation-module: Chapter 4: Nowebify.

This commit is contained in:
AwesomeAdam54321 2024-03-09 11:17:42 +08:00
parent dce27ab9d8
commit f7bbaba2b4
9 changed files with 570 additions and 560 deletions

View file

@ -2,8 +2,8 @@
A minimal library for handling C-style strings. A minimal library for handling C-style strings.
@ Programs using Foundation store text in |text_stream| structures almost all @ Programs using Foundation store text in [[text_stream]] structures almost all
of the time, but old-style, null-terminated |char *| array strings are of the time, but old-style, null-terminated [[char *]] array strings are
still occasionally needed. still occasionally needed.
We need to handle C strings long enough to contain any plausible filename, and We need to handle C strings long enough to contain any plausible filename, and
@ -11,17 +11,18 @@ any run of a dozen or so lines of code; but we have no real need to handle
strings of unlimited length, nor to be parsimonious with memory. strings of unlimited length, nor to be parsimonious with memory.
The following defines a type for a string long enough for our purposes. The following defines a type for a string long enough for our purposes.
It should be at least as long as the constant sometimes called |PATH_MAX|, It should be at least as long as the constant sometimes called [[PATH_MAX]],
the maximum length of a pathname, which is 1024 on Mac OS X. the maximum length of a pathname, which is 1024 on Mac OS X.
@d MAX_STRING_LENGTH 8*1024 <<*>>=
#define MAX_STRING_LENGTH 8*1024
= <<*>>=
typedef char string[MAX_STRING_LENGTH+1]; typedef char string[MAX_STRING_LENGTH+1];
@ Occasionally we need access to the real, unbounded strlen: @ Occasionally we need access to the real, unbounded strlen:
= <<*>>=
int CStrings::strlen_unbounded(const char *p) { int CStrings::strlen_unbounded(const char *p) {
return (int) strlen(p); return (int) strlen(p);
} }
@ -31,16 +32,16 @@ an attempt to continue execution after a string overflow might conceivably
result in a malformatted shell command being passed to the operating system, result in a malformatted shell command being passed to the operating system,
which we cannot risk. which we cannot risk.
= <<*>>=
int CStrings::check_len(int n) { int CStrings::check_len(int n) {
if ((n > MAX_STRING_LENGTH) || (n < 0)) Errors::fatal("String overflow\n"); if ((n > MAX_STRING_LENGTH) [[]] (n < 0)) Errors::fatal("String overflow\n");
return n; return n;
} }
@ The following is then protected from reading out of range if given a @ The following is then protected from reading out of range if given a
non-terminated string, though this should never actually happen. non-terminated string, though this should never actually happen.
= <<*>>=
int CStrings::len(char *str) { int CStrings::len(char *str) {
for (int i=0; i<=MAX_STRING_LENGTH; i++) for (int i=0; i<=MAX_STRING_LENGTH; i++)
if (str[i] == 0) return i; if (str[i] == 0) return i;
@ -48,10 +49,10 @@ int CStrings::len(char *str) {
return MAX_STRING_LENGTH; return MAX_STRING_LENGTH;
} }
@ We then have a replacement for |strcpy|, identical except that it's @ We then have a replacement for [[strcpy]], identical except that it's
bounds-checked: bounds-checked:
= <<*>>=
void CStrings::copy(char *to, char *from) { void CStrings::copy(char *to, char *from) {
CStrings::check_len(CStrings::len(from)); CStrings::check_len(CStrings::len(from));
int i; int i;
@ -59,9 +60,9 @@ void CStrings::copy(char *to, char *from) {
to[i] = 0; to[i] = 0;
} }
@ String comparisons will be done with the following, not |strcmp| directly: @ String comparisons will be done with the following, not [[strcmp]] directly:
= <<*>>=
int CStrings::eq(char *A, char *B) { int CStrings::eq(char *A, char *B) {
return (CStrings::cmp(A, B) == 0)?TRUE:FALSE; return (CStrings::cmp(A, B) == 0)?TRUE:FALSE;
} }
@ -72,20 +73,20 @@ int CStrings::ne(char *A, char *B) {
@ On the rare occasions when we need to sort alphabetically we'll also call: @ On the rare occasions when we need to sort alphabetically we'll also call:
= <<*>>=
int CStrings::cmp(char *A, char *B) { int CStrings::cmp(char *A, char *B) {
if ((A == NULL) || (A[0] == 0)) { if ((A == NULL) [[]] (A[0] == 0)) {
if ((B == NULL) || (B[0] == 0)) return 0; if ((B == NULL) [[]] (B[0] == 0)) return 0;
return -1; return -1;
} }
if ((B == NULL) || (B[0] == 0)) return 1; if ((B == NULL) [[]] (B[0] == 0)) return 1;
return strcmp(A, B); return strcmp(A, B);
} }
@ And the following is needed to deal with extension filenames on platforms @ And the following is needed to deal with extension filenames on platforms
whose locale is encoded as UTF-8. whose locale is encoded as UTF-8.
= <<*>>=
void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) { void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) {
int i, j; int i, j;
for (i=0, j=0; p[i]; i++) { for (i=0, j=0; p[i]; i++) {
@ -100,33 +101,33 @@ void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) {
dest[j] = 0; dest[j] = 0;
} }
@ I dislike to use |strncpy| because, and for some reason this surprises @ I dislike to use [[strncpy]] because, and for some reason this surprises
me every time, it truncates but fails to write a null termination character me every time, it truncates but fails to write a null termination character
if the string to be copied is larger than the buffer to write to: the if the string to be copied is larger than the buffer to write to: the
result is therefore not a well-formed string and we have to fix matters by result is therefore not a well-formed string and we have to fix matters by
hand. This I think makes for opaque code. So: hand. This I think makes for opaque code. So:
= <<*>>=
void CStrings::truncated_strcpy(char *to, char *from, int max) { void CStrings::truncated_strcpy(char *to, char *from, int max) {
int i; int i;
for (i=0; ((from[i]) && (i<max-1)); i++) to[i] = from[i]; for (i=0; ((from[i]) && (i<max-1)); i++) to[i] = from[i];
to[i] = 0; to[i] = 0;
} }
@h Text storage. @ \section{Text storage.}
The following is convenient for parking a read-only string of text somewhere The following is convenient for parking a read-only string of text somewhere
safe in memory. Since the length can't be extended, it's usually unsafe safe in memory. Since the length can't be extended, it's usually unsafe
to write to the result. (Inform tools very seldom use this, because C strings to write to the result. (Inform tools very seldom use this, because C strings
are almost always best avoided.) are almost always best avoided.)
= <<*>>=
typedef struct string_storage_area { typedef struct string_storage_area {
char *storage_at; char *storage_at;
int capacity; int capacity;
CLASS_DEFINITION CLASS_DEFINITION
} string_storage_area; } string_storage_area;
@ = <<*>>=
char *CStrings::park_string(char *from) { char *CStrings::park_string(char *from) {
string_storage_area *ssa = CREATE(string_storage_area); string_storage_area *ssa = CREATE(string_storage_area);
ssa->capacity = (int) CStrings::strlen_unbounded(from) + 1; ssa->capacity = (int) CStrings::strlen_unbounded(from) + 1;
@ -137,7 +138,7 @@ char *CStrings::park_string(char *from) {
@ And here we free any SSAs needed in the course of the run. @ And here we free any SSAs needed in the course of the run.
= <<*>>=
void CStrings::free_ssas(void) { void CStrings::free_ssas(void) {
string_storage_area *ssa; string_storage_area *ssa;
LOOP_OVER(ssa, string_storage_area) LOOP_OVER(ssa, string_storage_area)

View file

@ -2,9 +2,9 @@
Individual characters. Individual characters.
@h Character classes. @ \section{Character classes.}
= <<*>>=
wchar_t Characters::tolower(wchar_t c) { wchar_t Characters::tolower(wchar_t c) {
return (wchar_t) tolower((int) c); return (wchar_t) tolower((int) c);
} }
@ -31,19 +31,19 @@ int Characters::iscntrl(wchar_t c) {
return ((i >= 0) && (i < 32)); return ((i >= 0) && (i < 32));
} }
int Characters::vowel(wchar_t c) { int Characters::vowel(wchar_t c) {
if ((c == 'a') || (c == 'e') || (c == 'i') || (c == 'o') || (c == 'u')) return TRUE; if ((c == 'a') [[| (c == 'e') || (c == 'i') || (c == 'o') |]] (c == 'u')) return TRUE;
return FALSE; return FALSE;
} }
@ White space classes: @ White space classes:
= <<*>>=
int Characters::is_space_or_tab(int c) { int Characters::is_space_or_tab(int c) {
if ((c == ' ') || (c == '\t')) return TRUE; if ((c == ' ') [[]] (c == '\t')) return TRUE;
return FALSE; return FALSE;
} }
int Characters::is_whitespace(int c) { int Characters::is_whitespace(int c) {
if ((c == ' ') || (c == '\t') || (c == '\n')) return TRUE; if ((c == ' ') [[| (c == '\t') |]] (c == '\n')) return TRUE;
return FALSE; return FALSE;
} }
@ -51,18 +51,18 @@ int Characters::is_whitespace(int c) {
sense of the Treaty of Babel rules on leading and trailing spaces in sense of the Treaty of Babel rules on leading and trailing spaces in
iFiction records. iFiction records.
= <<*>>=
int Characters::is_babel_whitespace(int c) { int Characters::is_babel_whitespace(int c) {
if ((c == ' ') || (c == '\t') || (c == '\x0a') if ((c == ' ') [[| (c == '\t') |]] (c == '\x0a')
|| (c == '\x0d') || (c == NEWLINE_IN_STRING)) return TRUE; [[| (c == '\x0d') |]] (c == NEWLINE_IN_STRING)) return TRUE;
return FALSE; return FALSE;
} }
@h Unicode composition. @ \section{Unicode composition.}
A routine which converts the Unicode combining accents with letters, A routine which converts the Unicode combining accents with letters,
sufficient correctly to handle all characters in the ZSCII set. sufficient correctly to handle all characters in the ZSCII set.
= <<*>>=
int Characters::combine_accent(int accent, int letter) { int Characters::combine_accent(int accent, int letter) {
switch(accent) { switch(accent) {
case 0x0300: /* Unicode combining grave */ case 0x0300: /* Unicode combining grave */
@ -112,11 +112,11 @@ int Characters::combine_accent(int accent, int letter) {
return '?'; return '?';
} }
@h Accent stripping. @ \section{Accent stripping.}
It's occasionally useful to simplify text used as a filename by removing It's occasionally useful to simplify text used as a filename by removing
the more obvious accents from it. the more obvious accents from it.
= <<*>>=
int Characters::make_filename_safe(int charcode) { int Characters::make_filename_safe(int charcode) {
charcode = Characters::remove_accent(charcode); charcode = Characters::remove_accent(charcode);
if (charcode >= 128) charcode = '-'; if (charcode >= 128) charcode = '-';
@ -131,7 +131,7 @@ wchar_t Characters::make_wchar_t_filename_safe(wchar_t charcode) {
@ The following strips the accent, if present, from an ISO Latin-1 character: @ The following strips the accent, if present, from an ISO Latin-1 character:
= <<*>>=
int Characters::remove_accent(int charcode) { int Characters::remove_accent(int charcode) {
switch (charcode) { switch (charcode) {
case 0xC0: case 0xC1: case 0xC2: case 0xC3: case 0xC0: case 0xC1: case 0xC2: case 0xC3:
@ -165,7 +165,7 @@ wchar_t Characters::remove_wchar_t_accent(wchar_t charcode) {
@ This will do until we properly use Unicode character classes some day: @ This will do until we properly use Unicode character classes some day:
= <<*>>=
int Characters::isalphabetic(int letter) { int Characters::isalphabetic(int letter) {
return Characters::isalpha((wchar_t) Characters::remove_accent(letter)); return Characters::isalpha((wchar_t) Characters::remove_accent(letter));
} }

View file

@ -2,7 +2,7 @@
To read, validate and write JSON data interchange material. To read, validate and write JSON data interchange material.
@h Introduction. @ \section{Introduction.}
JSON (Douglas Crockford, c. 2000) stands for "JavaScript Object Notation", but is JSON (Douglas Crockford, c. 2000) stands for "JavaScript Object Notation", but is
now a //standardised data interchange format -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf// now a //standardised data interchange format -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//
used in many contexts. It's especially suitable for passing small amounts of data used in many contexts. It's especially suitable for passing small amounts of data
@ -13,17 +13,17 @@ or preference files.
This section provides encoding and decoding facilities. It is intended to comply This section provides encoding and decoding facilities. It is intended to comply
with //ECMA-404 -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//, with //ECMA-404 -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//,
except that (i) it disallows repetition the same key in the same object, and (ii) except that (i) it disallows repetition the same key in the same object, and (ii)
text can only be used in the Basic Multilingual Plane of Unicode points |0x0000| text can only be used in the Basic Multilingual Plane of Unicode points [[0x0000]]
to |0xffff|. to [[0xffff]].
There are no size maxima or limitations. Still, this code was written at typing speed, There are no size maxima or limitations. Still, this code was written at typing speed,
and no effort has gone into reducing memory usage or running time in the face of and no effort has gone into reducing memory usage or running time in the face of
large (or malicious) JSON content. Error reporting is also limited in fulsomeness. large (or malicious) JSON content. Error reporting is also limited in fulsomeness.
See the |foundation-test| test case |json| for many exercises of the code below; See the [[foundation-test]] test case [[json]] for many exercises of the code below;
do not change this section without checking that it continues to pass. do not change this section without checking that it continues to pass.
@h Data model. @ \section{Data model.}
JSON has a simple data model which we need to replicate in memory. Each value JSON has a simple data model which we need to replicate in memory. Each value
will be a pointer to a (permanently held in memory) //JSON_value// object. will be a pointer to a (permanently held in memory) //JSON_value// object.
This is in effect a union, in that its type is always one of the following, This is in effect a union, in that its type is always one of the following,
@ -31,19 +31,20 @@ and then only certain elements are meaningful depending on type.
These are exactly the JSON types except that numbers are split between integer These are exactly the JSON types except that numbers are split between integer
and floating-point versions (the conflation of the two is where the Javascript and floating-point versions (the conflation of the two is where the Javascript
origins of JSON show through), and that the type |ERROR_JSONTYPE| represents origins of JSON show through), and that the type [[ERROR_JSONTYPE]] represents
invalid data resulting from attempting to decode erroneous JSON. invalid data resulting from attempting to decode erroneous JSON.
@e NUMBER_JSONTYPE from 1 <<*>>=
@e DOUBLE_JSONTYPE enum NUMBER_JSONTYPE from 1
@e STRING_JSONTYPE enum DOUBLE_JSONTYPE
@e BOOLEAN_JSONTYPE enum STRING_JSONTYPE
@e ARRAY_JSONTYPE enum BOOLEAN_JSONTYPE
@e OBJECT_JSONTYPE enum ARRAY_JSONTYPE
@e NULL_JSONTYPE enum OBJECT_JSONTYPE
@e ERROR_JSONTYPE enum NULL_JSONTYPE
enum ERROR_JSONTYPE
= <<*>>=
void JSON::write_type(OUTPUT_STREAM, int t) { void JSON::write_type(OUTPUT_STREAM, int t) {
switch (t) { switch (t) {
case NUMBER_JSONTYPE: WRITE("number"); break; case NUMBER_JSONTYPE: WRITE("number"); break;
@ -60,23 +61,23 @@ void JSON::write_type(OUTPUT_STREAM, int t) {
@ @
= <<*>>=
typedef struct JSON_value { typedef struct JSON_value {
int JSON_type; int JSON_type;
int if_integer; int if_integer;
double if_double; double if_double;
struct text_stream *if_string; struct text_stream *if_string;
int if_boolean; int if_boolean;
struct linked_list *if_list; /* of |JSON_value| */ struct linked_list *if_list; /* of [[JSON_value]] */
struct dictionary *dictionary_if_object; /* to |JSON_value| */ struct dictionary *dictionary_if_object; /* to [[JSON_value]] */
struct linked_list *list_if_object; /* of |text_stream| */ struct linked_list *list_if_object; /* of [[text_stream]] */
struct text_stream *if_error; struct text_stream *if_error;
CLASS_DEFINITION CLASS_DEFINITION
} JSON_value; } JSON_value;
@ Now some constructor functions to create data of each JSON type: @ Now some constructor functions to create data of each JSON type:
= <<*>>=
JSON_value *JSON::new_null(void) { JSON_value *JSON::new_null(void) {
JSON_value *value = CREATE(JSON_value); JSON_value *value = CREATE(JSON_value);
value->JSON_type = NULL_JSONTYPE; value->JSON_type = NULL_JSONTYPE;
@ -123,7 +124,7 @@ JSON_value *JSON::new_string(text_stream *S) {
@ JSON arrays -- lists, in effect -- should be created in an empty state, and @ JSON arrays -- lists, in effect -- should be created in an empty state, and
then have entries added sequentially: then have entries added sequentially:
= <<*>>=
JSON_value *JSON::new_array(void) { JSON_value *JSON::new_array(void) {
JSON_value *value = JSON::new_null(); JSON_value *value = JSON::new_null();
value->JSON_type = ARRAY_JSONTYPE; value->JSON_type = ARRAY_JSONTYPE;
@ -144,7 +145,7 @@ JSON_value *JSON::add_to_array(JSON_value *array, JSON_value *new_entry) {
@ Similarly, JSON objects -- dictionaries of key-value pairs, in effect -- @ Similarly, JSON objects -- dictionaries of key-value pairs, in effect --
should be created in an empty state, and then have key-value pairs added as needed: should be created in an empty state, and then have key-value pairs added as needed:
= <<*>>=
JSON_value *JSON::new_object(void) { JSON_value *JSON::new_object(void) {
JSON_value *value = JSON::new_null(); JSON_value *value = JSON::new_null();
value->JSON_type = OBJECT_JSONTYPE; value->JSON_type = OBJECT_JSONTYPE;
@ -166,10 +167,10 @@ JSON_value *JSON::add_to_object(JSON_value *obj, text_stream *key, JSON_value *v
return obj; return obj;
} }
@ The following looks up a key in an object, returning |NULL| if and only if @ The following looks up a key in an object, returning [[NULL]] if and only if
it is not present: it is not present:
= <<*>>=
JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) { JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) {
if (obj == NULL) internal_error("no object"); if (obj == NULL) internal_error("no object");
if (obj->JSON_type == ERROR_JSONTYPE) return NULL; if (obj->JSON_type == ERROR_JSONTYPE) return NULL;
@ -182,7 +183,7 @@ JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) {
@ One last constructor creates an invalid JSON value resulting from incorrect @ One last constructor creates an invalid JSON value resulting from incorrect
JSON input: JSON input:
= <<*>>=
JSON_value *JSON::error(text_stream *msg) { JSON_value *JSON::error(text_stream *msg) {
JSON_value *value = JSON::new_null(); JSON_value *value = JSON::new_null();
value->JSON_type = ERROR_JSONTYPE; value->JSON_type = ERROR_JSONTYPE;
@ -193,7 +194,7 @@ JSON_value *JSON::error(text_stream *msg) {
@ This is a very limited form of comparison, since it cannot test equality @ This is a very limited form of comparison, since it cannot test equality
of arrays or objects. of arrays or objects.
= <<*>>=
int JSON::eq(JSON_value *val1, JSON_value *val2) { int JSON::eq(JSON_value *val1, JSON_value *val2) {
if ((val1 == NULL) && (val2)) return FALSE; if ((val1 == NULL) && (val2)) return FALSE;
if ((val1) && (val2 == NULL)) return FALSE; if ((val1) && (val2 == NULL)) return FALSE;
@ -208,16 +209,16 @@ int JSON::eq(JSON_value *val1, JSON_value *val2) {
return FALSE; return FALSE;
} }
@h Decoding JSON. @ \section{Decoding JSON.}
We do no actual file-handling in this section, but the following decoder can We do no actual file-handling in this section, but the following decoder can
be pointed to the contents of UTF-8 text file as needed. be pointed to the contents of UTF-8 text file as needed.
The decoder returns a non-|NULL| pointer in all cases. If the text contains The decoder returns a non-[[NULL]] pointer in all cases. If the text contains
any malformed JSON anywhere inside it, this pointer will be to a value of type any malformed JSON anywhere inside it, this pointer will be to a value of type
|ERROR_JSONTYPE|. Such a value should be thrown away as soon as the error [[ERROR_JSONTYPE]]. Such a value should be thrown away as soon as the error
message is made use of. message is made use of.
= <<*>>=
JSON_value *JSON::decode(text_stream *T, text_file_position *tfp) { JSON_value *JSON::decode(text_stream *T, text_file_position *tfp) {
return JSON::decode_range(T, 0, Str::len(T), tfp); return JSON::decode_range(T, 0, Str::len(T), tfp);
} }
@ -248,19 +249,19 @@ JSON_value *JSON::decode_error_q(text_stream *err, text_file_position *tfp,
return value; return value;
} }
@ This decodes the text in the character position range |[from, to)| as a @ This decodes the text in the character position range [[[from, to)]] as a
JSON value. JSON value.
The possibilities here are |[ ... ]| for an array, |{ ... }| for an object, The possibilities here are [[[ ... ]]] for an array, [[{ ... }]] for an object,
|"..."| for a string, a token beginning with a digit or a minus sign for a [["..."]] for a string, a token beginning with a digit or a minus sign for a
number (note that |+| and |.| are not allowed to open a number according to number (note that [[+]] and [[.]] are not allowed to open a number according to
the JSON standard), and the special cases |true|, |false| and |null|. the JSON standard), and the special cases [[true]], [[false]] and [[null]].
= <<*>>=
JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_position *tfp) { JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_position *tfp) {
int first_nws = -1, last_nws = -1; int first_nws = -1, last_nws = -1;
wchar_t first_c = 0, last_c = 0; wchar_t first_c = 0, last_c = 0;
@<Find the first and last non-whitespace character@>; <<Find the first and last non-whitespace character>>;
switch (first_c) { switch (first_c) {
case '[': case '[':
if (last_c != ']') return JSON::decode_error(I"mismatched '[' ... ']'", tfp); if (last_c != ']') return JSON::decode_error(I"mismatched '[' ... ']'", tfp);
@ -274,7 +275,7 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit
if (last_c != '"') return JSON::decode_error(I"mismatched quotation marks", tfp); if (last_c != '"') return JSON::decode_error(I"mismatched quotation marks", tfp);
return JSON::decode_string(T, first_nws+1, last_nws, tfp); return JSON::decode_string(T, first_nws+1, last_nws, tfp);
} }
if ((Characters::isdigit(first_c)) || (first_c == '-')) if ((Characters::isdigit(first_c)) [[]] (first_c == '-'))
return JSON::decode_number(T, first_nws, last_nws+1, tfp); return JSON::decode_number(T, first_nws, last_nws+1, tfp);
if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3)) if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3))
return JSON::new_boolean(TRUE); return JSON::new_boolean(TRUE);
@ -285,7 +286,7 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit
return JSON::decode_error(I"unknown JSON value", tfp); return JSON::decode_error(I"unknown JSON value", tfp);
} }
@<Find the first and last non-whitespace character@> = <<Find the first and last non-whitespace character>>=
for (int i=from; i<to; i++) for (int i=from; i<to; i++)
if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) { if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) {
first_nws = i; break; first_nws = i; break;
@ -299,11 +300,11 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit
last_c = Str::get_at(T, last_nws); last_c = Str::get_at(T, last_nws);
@ So now we have individual decoder functions for each type. First, arrays, where @ So now we have individual decoder functions for each type. First, arrays, where
now the range |[from, to)| represents what is inside the square brackets: this now the range [[[from, to)]] represents what is inside the square brackets: this
needs to be a comma-separated list. We follow ECMA strictly in disallowing a final needs to be a comma-separated list. We follow ECMA strictly in disallowing a final
comma before the |]|, unlike some JSON-like parsers. comma before the [[]]], unlike some JSON-like parsers.
= <<*>>=
JSON_value *JSON::decode_array(JSON_value *array, text_stream *T, int from, int to, JSON_value *JSON::decode_array(JSON_value *array, text_stream *T, int from, int to,
text_file_position *tfp) { text_file_position *tfp) {
int content = FALSE; int content = FALSE;
@ -340,7 +341,7 @@ JSON_value *JSON::decode_array_entry(JSON_value *array, text_stream *T, int from
@ And similarly for objects. @ And similarly for objects.
= <<*>>=
JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int to, JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int to,
text_file_position *tfp) { text_file_position *tfp) {
int content = FALSE; int content = FALSE;
@ -370,13 +371,13 @@ JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int t
} }
@ Note that we allow key names to include all kinds of unconscionable garbage, @ Note that we allow key names to include all kinds of unconscionable garbage,
as ECMA requires. |\u0003\"\t\t\t| is a valid JSON key name; so is the empty string. as ECMA requires. [[\u0003\"\t\t\t]] is a valid JSON key name; so is the empty string.
We are however slightly stricter than ECMA in that we disallow duplicate keys We are however slightly stricter than ECMA in that we disallow duplicate keys
in the same object. ECMA says this is a "semantic consideration that may be defined in the same object. ECMA says this is a "semantic consideration that may be defined
by JSON processors". We are hereby defining it. by JSON processors". We are hereby defining it.
= <<*>>=
JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from, int to, JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from, int to,
text_file_position *tfp) { text_file_position *tfp) {
while (Characters::is_whitespace(Str::get_at(T, from))) from++; while (Characters::is_whitespace(Str::get_at(T, from))) from++;
@ -399,7 +400,7 @@ JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from,
} }
if (ended == FALSE) return JSON::decode_error_q(I"key does not end with quotation mark", tfp, T, saved_from, saved_to); if (ended == FALSE) return JSON::decode_error_q(I"key does not end with quotation mark", tfp, T, saved_from, saved_to);
while (Characters::is_whitespace(Str::get_at(T, from))) from++; while (Characters::is_whitespace(Str::get_at(T, from))) from++;
if ((from >= to) || (Str::get_at(T, from) != ':')) if ((from >= to) [[]] (Str::get_at(T, from) != ':'))
return JSON::decode_error_q(I"key is not followed by ':'", tfp, T, saved_from, saved_to); return JSON::decode_error_q(I"key is not followed by ':'", tfp, T, saved_from, saved_to);
from++; from++;
if (JSON::look_up_object(obj, key)) return JSON::decode_error_q(I"duplicate key", tfp, T, saved_from, saved_to); if (JSON::look_up_object(obj, key)) return JSON::decode_error_q(I"duplicate key", tfp, T, saved_from, saved_to);
@ -412,9 +413,9 @@ JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from,
@ Numbers are annoying to decode since they can be given either in a restricted @ Numbers are annoying to decode since they can be given either in a restricted
floating-point syntax, or in decimal. ECMA is slippery on the question of exactly floating-point syntax, or in decimal. ECMA is slippery on the question of exactly
what floating-point numbers can be represented, but it's common to consider what floating-point numbers can be represented, but it's common to consider
them as being |double|, so we'll follow suit. them as being [[double]], so we'll follow suit.
= <<*>>=
JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_position *tfp) { JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_position *tfp) {
while (Characters::is_whitespace(Str::get_at(T, from))) from++; while (Characters::is_whitespace(Str::get_at(T, from))) from++;
while ((to > from) && (Characters::is_whitespace(Str::get_at(T, to-1)))) to--; while ((to > from) && (Characters::is_whitespace(Str::get_at(T, to-1)))) to--;
@ -426,8 +427,8 @@ JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_posi
for (int i=at; i<to; i++) for (int i=at; i<to; i++)
if (Characters::isdigit(Str::get_at(T, i))) if (Characters::isdigit(Str::get_at(T, i)))
PUT_TO(integer, Str::get_at(T, i)); PUT_TO(integer, Str::get_at(T, i));
else if ((Str::get_at(T, i) == 'E') || (Str::get_at(T, i) == 'e') || else if ((Str::get_at(T, i) == 'E') [[| (Str::get_at(T, i) == 'e') |]]
(Str::get_at(T, i) == '.') || (Str::get_at(T, i) == '+')) (Str::get_at(T, i) == '.') [[]] (Str::get_at(T, i) == '+'))
double_me = TRUE; double_me = TRUE;
else else
return JSON::decode_error(I"number is not a decimal integer", tfp); return JSON::decode_error(I"number is not a decimal integer", tfp);
@ -451,7 +452,7 @@ JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_posi
@ Strings are easy except for escape characters. I have no idea why JSON wants @ Strings are easy except for escape characters. I have no idea why JSON wants
to allow the escaping of forward slash, but the standard requires it. to allow the escaping of forward slash, but the standard requires it.
= <<*>>=
JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_position *tfp) { JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_position *tfp) {
TEMPORARY_TEXT(string) TEMPORARY_TEXT(string)
for (int i=from; i<to; i++) { for (int i=from; i<to; i++) {
@ -469,7 +470,7 @@ JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_posi
case '\\': break; case '\\': break;
case '/': break; case '/': break;
case '"': break; case '"': break;
case 'u': @<Decode a hexadecimal Unicode escape@>; break; case 'u': <<Decode a hexadecimal Unicode escape>>; break;
default: return JSON::decode_error(I"bad '\\' escape in string", tfp); default: return JSON::decode_error(I"bad '\\' escape in string", tfp);
} }
PUT_TO(string, c); PUT_TO(string, c);
@ -484,9 +485,9 @@ JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_posi
@ We don't quite fully implement ECMA here: the following is fine for code points @ We don't quite fully implement ECMA here: the following is fine for code points
in the Basic Multilingual Plane, but we don't handle the curious UTF-16 surrogate pair in the Basic Multilingual Plane, but we don't handle the curious UTF-16 surrogate pair
rule for code points between |0x10000| and |0x10fff|. rule for code points between [[0x10000]] and [[0x10fff]].
@<Decode a hexadecimal Unicode escape@> = <<Decode a hexadecimal Unicode escape>>=
if (i+4 >= to) return JSON::decode_error(I"incomplete '\\u' escape", tfp); if (i+4 >= to) return JSON::decode_error(I"incomplete '\\u' escape", tfp);
int hex = 0; int hex = 0;
for (int j=0; j<4; j++) { for (int j=0; j<4; j++) {
@ -501,9 +502,9 @@ rule for code points between |0x10000| and |0x10fff|.
c = (wchar_t) hex; c = (wchar_t) hex;
i += 4; i += 4;
@h Encoding JSON. @ \section{Encoding JSON.}
= <<*>>=
void JSON::encode(OUTPUT_STREAM, JSON_value *J) { void JSON::encode(OUTPUT_STREAM, JSON_value *J) {
if (J == NULL) internal_error("no JSON value supplied"); if (J == NULL) internal_error("no JSON value supplied");
switch (J->JSON_type) { switch (J->JSON_type) {
@ -563,7 +564,7 @@ void JSON::encode(OUTPUT_STREAM, JSON_value *J) {
@ Note that we elect not to escape the slash character, or any Unicode code @ Note that we elect not to escape the slash character, or any Unicode code
points above 32. points above 32.
= <<*>>=
void JSON::encode_string(OUTPUT_STREAM, text_stream *T) { void JSON::encode_string(OUTPUT_STREAM, text_stream *T) {
LOOP_THROUGH_TEXT(pos, T) { LOOP_THROUGH_TEXT(pos, T) {
wchar_t c = Str::get(pos); wchar_t c = Str::get(pos);
@ -582,7 +583,7 @@ void JSON::encode_string(OUTPUT_STREAM, text_stream *T) {
} }
} }
@h Requirements. @ \section{Requirements.}
Of course, the trouble with JSON is that it's a soup of undifferentiated data. Of course, the trouble with JSON is that it's a soup of undifferentiated data.
Just because you're expecting a pair of numbers, there's no reason to suppose Just because you're expecting a pair of numbers, there's no reason to suppose
that's what you've been given. that's what you've been given.
@ -591,9 +592,9 @@ A //JSON_requirement// is a sort of JSON schema: a specification for the structu
of a //JSON_value//. At the top level, it's a list of one or more equally of a //JSON_value//. At the top level, it's a list of one or more equally
good alternative specifications. Note that the empty list is not allowed. good alternative specifications. Note that the empty list is not allowed.
= <<*>>=
typedef struct JSON_requirement { typedef struct JSON_requirement {
struct linked_list *alternatives; /* of |JSON_single_requirement| */ struct linked_list *alternatives; /* of [[JSON_single_requirement]] */
CLASS_DEFINITION CLASS_DEFINITION
} JSON_requirement; } JSON_requirement;
@ -615,7 +616,7 @@ JSON_requirement *JSON::add_alternative(JSON_requirement *so_far,
if Javascript actually had types. It can communicate something like "a number" if Javascript actually had types. It can communicate something like "a number"
or "a list of strings"; but it can also say "the value has to be exactly this". or "a list of strings"; but it can also say "the value has to be exactly this".
= <<*>>=
typedef struct JSON_single_requirement { typedef struct JSON_single_requirement {
struct JSON_requirement *this_requirement; struct JSON_requirement *this_requirement;
struct JSON_value *this_value; struct JSON_value *this_value;
@ -623,10 +624,10 @@ typedef struct JSON_single_requirement {
CLASS_DEFINITION CLASS_DEFINITION
} JSON_single_requirement; } JSON_single_requirement;
@ Exactly one of |this_requirement|, |this_value| and |this_type| should be @ Exactly one of [[this_requirement]], [[this_value]] and [[this_type]] should be
non-|NULL|, so we have one constructor function for each case: non-[[NULL]], so we have one constructor function for each case:
= <<*>>=
JSON_single_requirement *JSON::require_requirement(JSON_requirement *req) { JSON_single_requirement *JSON::require_requirement(JSON_requirement *req) {
JSON_single_requirement *sing = CREATE(JSON_single_requirement); JSON_single_requirement *sing = CREATE(JSON_single_requirement);
sing->this_requirement = req; sing->this_requirement = req;
@ -653,15 +654,15 @@ JSON_single_requirement *JSON::require_type(int t) {
@ JSON types, in our model, look very like //JSON_value//s. @ JSON types, in our model, look very like //JSON_value//s.
= <<*>>=
typedef struct JSON_type { typedef struct JSON_type {
int JSON_type; int JSON_type;
struct linked_list *if_list; /* of |JSON_requirement| */ struct linked_list *if_list; /* of [[JSON_requirement]] */
struct JSON_requirement *all_if_list; struct JSON_requirement *all_if_list;
struct dictionary *dictionary_if_object; /* to |JSON_pair_requirement| */ struct dictionary *dictionary_if_object; /* to [[JSON_pair_requirement]] */
struct linked_list *list_if_object; /* of |text_stream| */ struct linked_list *list_if_object; /* of [[text_stream]] */
struct text_stream *if_error; struct text_stream *if_error;
CLASS_DEFINITION CLASS_DEFINITION
@ -698,9 +699,9 @@ JSON_type *JSON::new_type_requirement(int t) {
} }
@ A convenience for "the value must be an array of any number of entries, each @ A convenience for "the value must be an array of any number of entries, each
of which meets the requirement |E_req|": of which meets the requirement [[E_req]]":
= <<*>>=
JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) { JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) {
JSON_single_requirement *req = JSON::require_type(ARRAY_JSONTYPE); JSON_single_requirement *req = JSON::require_type(ARRAY_JSONTYPE);
req->this_type->all_if_list = E_req; req->this_type->all_if_list = E_req;
@ -708,22 +709,22 @@ JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) {
} }
@ If an array wants to be a tuple with a fixed number of entries, each with @ If an array wants to be a tuple with a fixed number of entries, each with
its own requirement, then instead call |JSON::require_type(ARRAY_JSONTYPE)| and its own requirement, then instead call [[JSON::require_type(ARRAY_JSONTYPE)]] and
then make a number of calls to the following in sequence: then make a number of calls to the following in sequence:
= <<*>>=
void JSON::require_entry(JSON_single_requirement *array_sr, JSON_requirement *entry_sr) { void JSON::require_entry(JSON_single_requirement *array_sr, JSON_requirement *entry_sr) {
if (array_sr == NULL) internal_error("no array"); if (array_sr == NULL) internal_error("no array");
if ((array_sr->this_type == NULL) || if ((array_sr->this_type == NULL) [[]]
(array_sr->this_type->JSON_type != ARRAY_JSONTYPE)) internal_error("not an array"); (array_sr->this_type->JSON_type != ARRAY_JSONTYPE)) internal_error("not an array");
if (entry_sr == NULL) internal_error("no new entry"); if (entry_sr == NULL) internal_error("no new entry");
ADD_TO_LINKED_LIST(entry_sr, JSON_requirement, array_sr->this_type->if_list); ADD_TO_LINKED_LIST(entry_sr, JSON_requirement, array_sr->this_type->if_list);
} }
@ Similarly, create an object requirement with |JSON::require_type(OBJECT_JSONTYPE)| @ Similarly, create an object requirement with [[JSON::require_type(OBJECT_JSONTYPE)]]
and then either require or allow key-value pairs with: and then either require or allow key-value pairs with:
= <<*>>=
void JSON::require_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_requirement *req) { void JSON::require_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_requirement *req) {
JSON::require_pair_inner(obj_sr, key, req, FALSE); JSON::require_pair_inner(obj_sr, key, req, FALSE);
} }
@ -735,7 +736,7 @@ void JSON::allow_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_re
void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key, void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key,
JSON_requirement *req, int opt) { JSON_requirement *req, int opt) {
if (obj_sr == NULL) internal_error("no object"); if (obj_sr == NULL) internal_error("no object");
if ((obj_sr->this_type == NULL) || if ((obj_sr->this_type == NULL) [[]]
(obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object"); (obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object");
if (req == NULL) internal_error("no val req"); if (req == NULL) internal_error("no val req");
key = Str::duplicate(key); key = Str::duplicate(key);
@ -747,13 +748,13 @@ void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key,
if (de) de->value = pr; if (de) de->value = pr;
} }
@ This then extracts the requirement on a given key, or returns |NULL| is if @ This then extracts the requirement on a given key, or returns [[NULL]] is if
is not permitted: is not permitted:
= <<*>>=
JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_stream *key) { JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_stream *key) {
if (obj_sr == NULL) internal_error("no object"); if (obj_sr == NULL) internal_error("no object");
if ((obj_sr->this_type == NULL) || if ((obj_sr->this_type == NULL) [[]]
(obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object"); (obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object");
dict_entry *de = Dictionaries::find(obj_sr->this_type->dictionary_if_object, key); dict_entry *de = Dictionaries::find(obj_sr->this_type->dictionary_if_object, key);
if (de == NULL) return NULL; if (de == NULL) return NULL;
@ -763,25 +764,25 @@ JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_
@ This is used when parsing textual requirements, to indicate a syntax error; @ This is used when parsing textual requirements, to indicate a syntax error;
but it is not valid as a requirement itself. but it is not valid as a requirement itself.
= <<*>>=
JSON_single_requirement *JSON::error_sr(text_stream *msg) { JSON_single_requirement *JSON::error_sr(text_stream *msg) {
JSON_single_requirement *req = JSON::require_type(ERROR_JSONTYPE); JSON_single_requirement *req = JSON::require_type(ERROR_JSONTYPE);
req->this_type->if_error = Str::duplicate(msg); req->this_type->if_error = Str::duplicate(msg);
return req; return req;
} }
@h Validation. @ \section{Validation.}
To "validate" a JSON value is to determine that it meets some //JSON_requirement//. To "validate" a JSON value is to determine that it meets some //JSON_requirement//.
The following returns |TRUE| if the value meets the requirement in full; The following returns [[TRUE]] if the value meets the requirement in full;
if not, |FALSE|, and then if |errs| is not null, a list of error messages is if not, [[FALSE]], and then if [[errs]] is not null, a list of error messages is
appended to the linked list |errs|. appended to the linked list [[errs]].
The stack here is used to give better error messages by locating where the The stack here is used to give better error messages by locating where the
problem was: e.g. |"object.coordinates[1]"| is the result of the stack problem was: e.g. [["object.coordinates[1]"]] is the result of the stack
holding |"object" > ".cooordinates" > "[1]"|. holding [["object" > ".cooordinates" > "[1]"]].
= <<*>>=
int JSON::validate(JSON_value *val, JSON_requirement *req, linked_list *errs) { int JSON::validate(JSON_value *val, JSON_requirement *req, linked_list *errs) {
lifo_stack *location = NEW_LIFO_STACK(text_stream); lifo_stack *location = NEW_LIFO_STACK(text_stream);
if ((val) && (val->JSON_type == ARRAY_JSONTYPE)) { if ((val) && (val->JSON_type == ARRAY_JSONTYPE)) {
@ -815,7 +816,7 @@ value must match one of the single requirements in the list. (We can stop as
soon as it has met one.) If it meets none of them, we produce error messages soon as it has met one.) If it meets none of them, we produce error messages
for the reason it fails just the first. for the reason it fails just the first.
= <<*>>=
int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs, int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs,
lifo_stack *location) { lifo_stack *location) {
if (val == NULL) internal_error("no value"); if (val == NULL) internal_error("no value");
@ -834,7 +835,7 @@ int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs,
@ Bad data always fails, and otherwise we split into the three cases. @ Bad data always fails, and otherwise we split into the three cases.
= <<*>>=
int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req, int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
linked_list *errs, lifo_stack *location) { linked_list *errs, lifo_stack *location) {
if (val->JSON_type == ERROR_JSONTYPE) { if (val->JSON_type == ERROR_JSONTYPE) {
@ -842,16 +843,16 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
I"erroneous JSON value from parsing bad text", location); I"erroneous JSON value from parsing bad text", location);
return FALSE; return FALSE;
} }
if (req->this_requirement) @<Validate against this requirement@>; if (req->this_requirement) <<Validate against this requirement>>;
if (req->this_value) @<Validate against this value@>; if (req->this_value) <<Validate against this value>>;
if (req->this_type) @<Validate against this type@>; if (req->this_type) <<Validate against this type>>;
internal_error("bad single requirement"); internal_error("bad single requirement");
} }
@<Validate against this requirement@> = <<Validate against this requirement>>=
return JSON::validate_r(val, req->this_requirement, errs, location); return JSON::validate_r(val, req->this_requirement, errs, location);
@<Validate against this value@> = <<Validate against this value>>=
if (JSON::eq(val, req->this_value) == FALSE) { if (JSON::eq(val, req->this_value) == FALSE) {
TEMPORARY_TEXT(msg) TEMPORARY_TEXT(msg)
WRITE_TO(msg, "value "); WRITE_TO(msg, "value ");
@ -863,16 +864,16 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
} }
return TRUE; return TRUE;
@<Validate against this type@> = <<Validate against this type>>=
@<Verify that the JSON type constructors match@>; <<Verify that the JSON type constructors match>>;
int outcome = TRUE; int outcome = TRUE;
if (val->JSON_type == ARRAY_JSONTYPE) if (val->JSON_type == ARRAY_JSONTYPE)
@<Verify that the array entries meet requirements@>; <<Verify that the array entries meet requirements>>;
if (val->JSON_type == OBJECT_JSONTYPE) if (val->JSON_type == OBJECT_JSONTYPE)
@<Verify that the object members meet requirements@>; <<Verify that the object members meet requirements>>;
return outcome; return outcome;
@<Verify that the JSON type constructors match@> = <<Verify that the JSON type constructors match>>=
if (val->JSON_type != req->this_type->JSON_type) { if (val->JSON_type != req->this_type->JSON_type) {
if (errs) { if (errs) {
TEMPORARY_TEXT(msg) TEMPORARY_TEXT(msg)
@ -886,7 +887,7 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
return FALSE; return FALSE;
} }
@<Verify that the array entries meet requirements@> = <<Verify that the array entries meet requirements>>=
int count = 0; int count = 0;
JSON_value *E; JSON_value *E;
LOOP_OVER_LINKED_LIST(E, JSON_value, val->if_list) { LOOP_OVER_LINKED_LIST(E, JSON_value, val->if_list) {
@ -912,19 +913,19 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
count++; count++;
} }
@<Verify that the object members meet requirements@> = <<Verify that the object members meet requirements>>=
text_stream *key; text_stream *key;
LOOP_OVER_LINKED_LIST(key, text_stream, val->list_if_object) LOOP_OVER_LINKED_LIST(key, text_stream, val->list_if_object)
@<Verify that the member with this key is allowed and contains the right data@>; <<Verify that the member with this key is allowed and contains the right data>>;
LOOP_OVER_LINKED_LIST(key, text_stream, req->this_type->list_if_object) { LOOP_OVER_LINKED_LIST(key, text_stream, req->this_type->list_if_object) {
JSON_pair_requirement *pr = JSON_pair_requirement *pr =
Dictionaries::read_value(req->this_type->dictionary_if_object, key); Dictionaries::read_value(req->this_type->dictionary_if_object, key);
if (pr == NULL) internal_error("broken JSON object requirement"); if (pr == NULL) internal_error("broken JSON object requirement");
if (pr->optional == FALSE) if (pr->optional == FALSE)
@<Verify that the value object does provide this mandatory member@>; <<Verify that the value object does provide this mandatory member>>;
} }
@<Verify that the member with this key is allowed and contains the right data@> = <<Verify that the member with this key is allowed and contains the right data>>=
JSON_value *E = Dictionaries::read_value(val->dictionary_if_object, key); JSON_value *E = Dictionaries::read_value(val->dictionary_if_object, key);
if (E == NULL) internal_error("broken JSON object dictionary"); if (E == NULL) internal_error("broken JSON object dictionary");
JSON_pair_requirement *pr = JSON::look_up_pair(req, key); JSON_pair_requirement *pr = JSON::look_up_pair(req, key);
@ -943,7 +944,7 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
POP_LIFO_STACK(text_stream, location); POP_LIFO_STACK(text_stream, location);
DISCARD_TEXT(at) DISCARD_TEXT(at)
@<Verify that the value object does provide this mandatory member@> = <<Verify that the value object does provide this mandatory member>>=
JSON_value *E = JSON::look_up_object(val, key); JSON_value *E = JSON::look_up_object(val, key);
if (E == NULL) { if (E == NULL) {
TEMPORARY_TEXT(msg) TEMPORARY_TEXT(msg)
@ -953,21 +954,20 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
outcome = FALSE; outcome = FALSE;
} }
@h Decoding JSON requirements. @ \section{Decoding JSON requirements.}
It's convenient to be able to read and write these requirements to textual It's convenient to be able to read and write these requirements to textual
form, exactly as we do with JSON itself, and here goes. form, exactly as we do with JSON itself, and here goes.
This is an example of the syntax we parse. It's JSON except that This is an example of the syntax we parse. It's JSON except that
(a) the type names |number|, |double|, |string|, |boolean| and |null| are (a) the type names [[number]], [[double]], [[string]], [[boolean]] and [[null]] are
used in place of their respective values; used in place of their respective values;
(b) a question mark |?| before the name of a key means that it is optional; (b) a question mark [[?]] before the name of a key means that it is optional;
(c) if an array has one entry followed by an asterisk |*|, it means (c) if an array has one entry followed by an asterisk [[*]], it means
"any number of entries, each of which must match this"; "any number of entries, each of which must match this";
(d) |<name>| refers to a requirement recorded in the |known_names| dictionary. (d) [[<name>]] refers to a requirement recorded in the [[known_names]] dictionary.
For example: For example:
= (text)
{ {
"coordinates": [ double, double, string ], "coordinates": [ double, double, string ],
?"jurisdiction": string, ?"jurisdiction": string,
@ -976,28 +976,27 @@ For example:
"entry": string "entry": string
}* ] }* ]
} }
=
This function is essentially the same as //JSON::decode//, but returning a This function is essentially the same as //JSON::decode//, but returning a
requirement rather than a value. requirement rather than a value.
Note that |known_names| can be |NULL| to have it not recognise any such names; Note that [[known_names]] can be [[NULL]] to have it not recognise any such names;
there's no need to create an empty dictionary if this feature is unwanted. there's no need to create an empty dictionary if this feature is unwanted.
= <<*>>=
JSON_requirement *JSON::decode_req(text_stream *T, dictionary *known_names) { JSON_requirement *JSON::decode_req(text_stream *T, dictionary *known_names) {
return JSON::decode_req_range(T, 0, Str::len(T), known_names); return JSON::decode_req_range(T, 0, Str::len(T), known_names);
} }
@ This decodes the text in the character position range |[from, to)| as a @ This decodes the text in the character position range [[[from, to)]] as a
JSON requirement. JSON requirement.
= <<*>>=
JSON_requirement *JSON::decode_req_range(text_stream *T, int from, int to, JSON_requirement *JSON::decode_req_range(text_stream *T, int from, int to,
dictionary *known_names) { dictionary *known_names) {
int first_nws = -1, last_nws = -1; int first_nws = -1, last_nws = -1;
wchar_t first_c = 0, last_c = 0; wchar_t first_c = 0, last_c = 0;
@<Find the first and last non-whitespace character in requirement@>; <<Find the first and last non-whitespace character in requirement>>;
if (first_c == '(') { if (first_c == '(') {
if (last_c != ')') if (last_c != ')')
return JSON::single_choice(JSON::error_sr(I"mismatched '(' ... ')'")); return JSON::single_choice(JSON::error_sr(I"mismatched '(' ... ')'"));
@ -1032,17 +1031,17 @@ JSON_requirement *JSON::decode_req_alternative(JSON_requirement *req, text_strea
return JSON::add_alternative(req, sing); return JSON::add_alternative(req, sing);
} }
@ Note that the keyword |null| is ambiguous in the grammar for JSON requirements: @ Note that the keyword [[null]] is ambiguous in the grammar for JSON requirements:
does it mean "the value |null|", or does it mean "any value of the type |null|"? does it mean "the value [[null]]", or does it mean "any value of the type [[null]]"?
This makes no difference because the type |null| admits only the value |null|, but This makes no difference because the type [[null]] admits only the value [[null]], but
for what it's worth, we opt for the value. for what it's worth, we opt for the value.
= <<*>>=
JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int to, JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int to,
dictionary *known_names) { dictionary *known_names) {
int first_nws = -1, last_nws = -1; int first_nws = -1, last_nws = -1;
wchar_t first_c = 0, last_c = 0; wchar_t first_c = 0, last_c = 0;
@<Find the first and last non-whitespace character in requirement@>; <<Find the first and last non-whitespace character in requirement>>;
if (first_nws < 0) return JSON::error_sr(I"whitespace where requirement expected"); if (first_nws < 0) return JSON::error_sr(I"whitespace where requirement expected");
switch (first_c) { switch (first_c) {
case '[': case '[':
@ -1073,7 +1072,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t
int require_value = FALSE; int require_value = FALSE;
if ((first_c == '"') || (first_c == '-') || (Characters::isdigit(first_c))) if ((first_c == '"') [[| (first_c == '-') |]] (Characters::isdigit(first_c)))
require_value = TRUE; require_value = TRUE;
if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3)) if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3))
require_value = TRUE; require_value = TRUE;
@ -1110,7 +1109,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t
return JSON::error_sr(msg); return JSON::error_sr(msg);
} }
@<Find the first and last non-whitespace character in requirement@> = <<Find the first and last non-whitespace character in requirement>>=
for (int i=from; i<to; i++) for (int i=from; i<to; i++)
if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) { if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) {
first_nws = i; break; first_nws = i; break;
@ -1124,7 +1123,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t
@ Array requirements: @ Array requirements:
= <<*>>=
JSON_single_requirement *JSON::decode_req_array(JSON_single_requirement *array_sr, JSON_single_requirement *JSON::decode_req_array(JSON_single_requirement *array_sr,
text_stream *T, int from, int to, dictionary *known_names) { text_stream *T, int from, int to, dictionary *known_names) {
int content = FALSE; int content = FALSE;
@ -1167,7 +1166,7 @@ JSON_single_requirement *JSON::decode_req_array_entry(JSON_single_requirement *a
@ And similarly for objects. @ And similarly for objects.
= <<*>>=
JSON_single_requirement *JSON::decode_req_object(JSON_single_requirement *obj, JSON_single_requirement *JSON::decode_req_object(JSON_single_requirement *obj,
text_stream *T, int from, int to, dictionary *known_names) { text_stream *T, int from, int to, dictionary *known_names) {
int content = FALSE; int content = FALSE;
@ -1218,7 +1217,7 @@ JSON_single_requirement *JSON::decode_req_object_entry(JSON_single_requirement *
} }
if (ended == FALSE) return JSON::error_sr(I"key does not end with quotation mark"); if (ended == FALSE) return JSON::error_sr(I"key does not end with quotation mark");
while (Characters::is_whitespace(Str::get_at(T, from))) from++; while (Characters::is_whitespace(Str::get_at(T, from))) from++;
if ((from >= to) || (Str::get_at(T, from) != ':')) if ((from >= to) [[]] (Str::get_at(T, from) != ':'))
return JSON::error_sr(I"key is not followed by ':'"); return JSON::error_sr(I"key is not followed by ':'");
from++; from++;
if (JSON::look_up_pair(obj, key)) return JSON::error_sr(I"duplicate key"); if (JSON::look_up_pair(obj, key)) return JSON::error_sr(I"duplicate key");
@ -1229,21 +1228,21 @@ JSON_single_requirement *JSON::decode_req_object_entry(JSON_single_requirement *
return obj; return obj;
} }
@h Encoding JSON requirements. @ \section{Encoding JSON requirements.}
This is now simple, with one caveat. It's possible to set up requirement trees This is now simple, with one caveat. It's possible to set up requirement trees
so that they are not well-founded. For example: so that they are not well-founded. For example:
= (text as InC)
JSON_single_requirement *set = JSON::require_type(ARRAY_JSONTYPE); JSON_single_requirement *set = JSON::require_type(ARRAY_JSONTYPE);
set->all_if_list = JSON::single_choice(set); set->all_if_list = JSON::single_choice(set);
=
This is not useless: it matches, say, |[]|, |[ [] ]| and |[ [], [ [] ] ]|
This is not useless: it matches, say, [[[]]], [[[ [] ]]] and [[[ [], [ [] ] ]]]
and other constructions giving amusement to set theorists. But it would cause and other constructions giving amusement to set theorists. But it would cause
the following to hang. Note that requirements read in from files (see below) the following to hang. Note that requirements read in from files (see below)
are always well-founded, and so do not have this issue. are always well-founded, and so do not have this issue.
= <<*>>=
void JSON::encode_req(OUTPUT_STREAM, JSON_requirement *req) { void JSON::encode_req(OUTPUT_STREAM, JSON_requirement *req) {
JSON::encode_req_r(OUT, req); JSON::encode_req_r(OUT, req);
} }
@ -1311,32 +1310,32 @@ void JSON::encode_type(OUTPUT_STREAM, JSON_type *type) {
} }
} }
@h Reading requirements files. @ \section{Reading requirements files.}
This convenient function reads in a set of requirements from a text file. Each This convenient function reads in a set of requirements from a text file. Each
requirement should begin |<name> ::=|, and then continues until the next such requirement should begin [[<name> ::=]], and then continues until the next such
header, or the end of the file. So for example: header, or the end of the file. So for example:
= (text)
! My scheme for JSON files describing geographical locations ! My scheme for JSON files describing geographical locations
<optional-letter> ::= ( "alpha" | "beta" | null ) <optional-letter> ::= ( "alpha" [[ "beta" ]] null )
<position> ::= { <position> ::= {
"category": <optional-letter>, "category": <optional-letter>,
"latitude": double, "latitude": double,
"longitude": double, "longitude": double,
} }
=
is a valid file declaring two requirements. Forward references are not allowed -- is a valid file declaring two requirements. Forward references are not allowed --
e.g., <position> can refer to <optional-letter> but not vice versa -- and e.g., <position> can refer to <optional-letter> but not vice versa -- and
therefore the requirements read in will always be well-founded. Comments are therefore the requirements read in will always be well-founded. Comments are
lines beginning with |!|; other than comments, only white space is permitted lines beginning with [[!]]; other than comments, only white space is permitted
before the first requirement begins. before the first requirement begins.
Note that the function //JSON::read_requirements_file// returns a dictionary Note that the function //JSON::read_requirements_file// returns a dictionary
of the requirements it has read, by name (but without their angle-brackets): of the requirements it has read, by name (but without their angle-brackets):
here, it would have two keys, |optional-letter| and |position|. here, it would have two keys, [[optional-letter]] and [[position]].
= <<*>>=
typedef struct JSON_rrf_state { typedef struct JSON_rrf_state {
struct text_stream *name; struct text_stream *name;
struct text_stream *defn; struct text_stream *defn;
@ -1379,7 +1378,7 @@ void JSON::read_requirements_file_helper(text_stream *text, text_file_position *
@ This is called when the end of a definition is reached, either because another @ This is called when the end of a definition is reached, either because another
is about to start, or because the end of the file has come: is about to start, or because the end of the file has come:
= <<*>>=
void JSON::process_req_defn(JSON_rrf_state *state) { void JSON::process_req_defn(JSON_rrf_state *state) {
if (Str::len(state->name) > 0) { if (Str::len(state->name) > 0) {
JSON_requirement *req = JSON_requirement *req =

View file

@ -2,36 +2,36 @@
To provide a limited regular-expression parser. To provide a limited regular-expression parser.
@h Character types. @ \section{Character types.}
We will define white space as spaces and tabs only, since the various kinds We will define white space as spaces and tabs only, since the various kinds
of line terminator will always be stripped out before this is applied. of line terminator will always be stripped out before this is applied.
= <<*>>=
int Regexp::white_space(int c) { int Regexp::white_space(int c) {
if ((c == ' ') || (c == '\t')) return TRUE; if ((c == ' ') [[]] (c == '\t')) return TRUE;
return FALSE; return FALSE;
} }
@ The presence of |:| here is perhaps a bit surprising, since it's illegal in @ The presence of [[:]] here is perhaps a bit surprising, since it's illegal in
C and has other meanings in other languages, but it's legal in C-for-Inform C and has other meanings in other languages, but it's legal in C-for-Inform
identifiers. identifiers.
= <<*>>=
int Regexp::identifier_char(int c) { int Regexp::identifier_char(int c) {
if ((c == '_') || (c == ':') || if ((c == '_') [[| (c == ':') |]]
((c >= 'A') && (c <= 'Z')) || ((c >= 'A') && (c <= 'Z')) [[]]
((c >= 'a') && (c <= 'z')) || ((c >= 'a') && (c <= 'z')) [[]]
((c >= '0') && (c <= '9'))) return TRUE; ((c >= '0') && (c <= '9'))) return TRUE;
return FALSE; return FALSE;
} }
@h Simple parsing. @ \section{Simple parsing.}
The following finds the earliest minimal-length substring of a string, The following finds the earliest minimal-length substring of a string,
delimited by two pairs of characters: for example, |<<| and |>>|. This could delimited by two pairs of characters: for example, [[<<]] and [[>>]]. This could
easily be done as a regular expression using |Regexp::match|, but the routine easily be done as a regular expression using [[Regexp::match]], but the routine
here is much quicker. here is much quicker.
= <<*>>=
int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2, int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2,
wchar_t off1, wchar_t off2, int *len) { wchar_t off1, wchar_t off2, int *len) {
for (int i = 0; i < Str::len(text); i++) for (int i = 0; i < Str::len(text); i++)
@ -47,7 +47,7 @@ int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2,
@ Still more simply: @ Still more simply:
= <<*>>=
int Regexp::find_open_brace(text_stream *text) { int Regexp::find_open_brace(text_stream *text) {
for (int i=0; i < Str::len(text); i++) for (int i=0; i < Str::len(text); i++)
if (Str::get_at(text, i) == '{') if (Str::get_at(text, i) == '{')
@ -56,9 +56,9 @@ int Regexp::find_open_brace(text_stream *text) {
} }
@ Note that we count the empty string as being white space. Again, this is @ Note that we count the empty string as being white space. Again, this is
equivalent to |Regexp::match(p, " *")|, but much faster. equivalent to [[Regexp::match(p, " *")]], but much faster.
= <<*>>=
int Regexp::string_is_white_space(text_stream *text) { int Regexp::string_is_white_space(text_stream *text) {
LOOP_THROUGH_TEXT(P, text) LOOP_THROUGH_TEXT(P, text)
if (Regexp::white_space(Str::get(P)) == FALSE) if (Regexp::white_space(Str::get(P)) == FALSE)
@ -66,46 +66,47 @@ int Regexp::string_is_white_space(text_stream *text) {
return TRUE; return TRUE;
} }
@h A Worse PCRE. @ \section{A Worse PCRE.}
I originally wanted to call the function in this section |a_better_sscanf|, then I originally wanted to call the function in this section [[a_better_sscanf]], then
thought perhaps |a_worse_PCRE| would be more true. (PCRE is Philip Hazel's superb thought perhaps [[a_worse_PCRE]] would be more true. (PCRE is Philip Hazel's superb
C implementation of regular-expression parsing, but I didn't need its full strength, C implementation of regular-expression parsing, but I didn't need its full strength,
and I didn't want to complicate the build process by linking to it.) and I didn't want to complicate the build process by linking to it.)
This is a very minimal regular expression parser, simply for convenience of parsing This is a very minimal regular expression parser, simply for convenience of parsing
short texts against particularly simple patterns. Here is an example of use: short texts against particularly simple patterns. Here is an example of use:
= (text as code)
match_results mr = Regexp::create_mr(); match_results mr = Regexp::create_mr();
if (Regexp::match(&mr, text, L"fish (%d+) ([a-zA-Z_][a-zA-Z0-9_]*) *") { if (Regexp::match(&mr, text, L"fish (%d+) ([a-zA-Z_][a-zA-Z0-9_]*) *") {
PRINT("Fish number: %S\n", mr.exp[0]); PRINT("Fish number: %S\n", mr.exp[0]);
PRINT("Fish name: %S\n", mr.exp[1]); PRINT("Fish name: %S\n", mr.exp[1]);
} }
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
=
Note the |L| at the front of the regex itself: this is a wide string.
This tries to match the given |text| to see if it consists of the word fish, Note the [[L]] at the front of the regex itself: this is a wide string.
This tries to match the given [[text]] to see if it consists of the word fish,
then any amount of whitespace, then a string of digits which are copied into then any amount of whitespace, then a string of digits which are copied into
|mr->exp[0]|, then whitespace again, and then an alphanumeric identifier to be [[mr->exp[0]]], then whitespace again, and then an alphanumeric identifier to be
copied into |mr->exp[1]|, and finally optional whitespace. (If no match is copied into [[mr->exp[1]]], and finally optional whitespace. (If no match is
made, the contents of the found strings are undefined.) made, the contents of the found strings are undefined.)
Note that this differs from, for example, Perl's regular expression matcher Note that this differs from, for example, Perl's regular expression matcher
in several ways. The regular expression syntax is slightly different and in in several ways. The regular expression syntax is slightly different and in
general simpler. A match has to be made from start to end, so it's as if there general simpler. A match has to be made from start to end, so it's as if there
were an implicit |^| at the front and |$| at the back (in Perl terms). The were an implicit [[^]] at the front and [[$]] at the back (in Perl terms). The
full match text is therefore always the entire text put in, so there's no full match text is therefore always the entire text put in, so there's no
need to record this. In Perl, matching against |m/(.*) plus (.*)/| would need to record this. In Perl, matching against [[m/(.*) plus (.*)/]] would
set three subexpressions: number 0 would be the whole text matched, number set three subexpressions: number 0 would be the whole text matched, number
1 would be the first bracketed part, number 2 the second. Here, though, the 1 would be the first bracketed part, number 2 the second. Here, though, the
corresponding regex would be written |L"(%c*) plus (%c*)"|, and the bracketed corresponding regex would be written [[L"(%c*) plus (%c*)"]], and the bracketed
terms would be subexpressions 0 and 1. terms would be subexpressions 0 and 1.
@d MAX_BRACKETED_SUBEXPRESSIONS 5 /* this many bracketed subexpressions can be extracted */ <<*>>=
#define MAX_BRACKETED_SUBEXPRESSIONS 5 /* this many bracketed subexpressions can be extracted */
@ The internal state of the matcher is stored as follows: @ The internal state of the matcher is stored as follows:
= <<*>>=
typedef struct match_position { typedef struct match_position {
int tpos; /* position within text being matched */ int tpos; /* position within text being matched */
int ppos; /* position within pattern */ int ppos; /* position within pattern */
@ -120,9 +121,10 @@ typedef struct match_position {
@ It may appear that match texts are limited to 64 characters here, but they @ It may appear that match texts are limited to 64 characters here, but they
are not. They are simply a little faster to access if short. are not. They are simply a little faster to access if short.
@d MATCH_TEXT_INITIAL_ALLOCATION 64 <<*>>=
#define MATCH_TEXT_INITIAL_ALLOCATION 64
= <<*>>=
typedef struct match_result { typedef struct match_result {
wchar_t match_text_storage[MATCH_TEXT_INITIAL_ALLOCATION]; wchar_t match_text_storage[MATCH_TEXT_INITIAL_ALLOCATION];
struct text_stream match_text_struct; struct text_stream match_text_struct;
@ -140,7 +142,7 @@ explicitly. Note that the storage required is on the C stack (unless some
result strings grow very large), so that it's very quick to allocate and result strings grow very large), so that it's very quick to allocate and
deallocate. deallocate.
= <<*>>=
match_results Regexp::create_mr(void) { match_results Regexp::create_mr(void) {
match_results mr; match_results mr;
mr.no_matched_texts = 0; mr.no_matched_texts = 0;
@ -164,7 +166,7 @@ void Regexp::dispose_of(match_results *mr) {
@ So, then: the matcher itself. @ So, then: the matcher itself.
= <<*>>=
int Regexp::match(match_results *mr, text_stream *text, wchar_t *pattern) { int Regexp::match(match_results *mr, text_stream *text, wchar_t *pattern) {
if (mr) Regexp::prepare(mr); if (mr) Regexp::prepare(mr);
int rv = (Regexp::match_r(mr, text, pattern, NULL, FALSE) >= 0)?TRUE:FALSE; int rv = (Regexp::match_r(mr, text, pattern, NULL, FALSE) >= 0)?TRUE:FALSE;
@ -203,41 +205,41 @@ void Regexp::prepare(match_results *mr) {
} }
} }
@ = <<*>>=
int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern, int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern,
match_position *scan_from, int allow_partial) { match_position *scan_from, int allow_partial) {
match_position at; match_position at;
if (scan_from) at = *scan_from; if (scan_from) at = *scan_from;
else { at.tpos = 0; at.ppos = 0; at.bc = 0; at.bl = 0; } else { at.tpos = 0; at.ppos = 0; at.bc = 0; at.bl = 0; }
while ((Str::get_at(text, at.tpos)) || (pattern[at.ppos])) { while ((Str::get_at(text, at.tpos)) [[]] (pattern[at.ppos])) {
if ((allow_partial) && (pattern[at.ppos] == 0)) break; if ((allow_partial) && (pattern[at.ppos] == 0)) break;
@<Parentheses in the match pattern set up substrings to extract@>; <<Parentheses in the match pattern set up substrings to extract>>;
int chcl, /* what class of characters to match: a |*_CHARCLASS| value */ int chcl, /* what class of characters to match: a [[*_CHARCLASS]] value */
range_from, range_to, /* for |LITERAL_CHARCLASS| only */ range_from, range_to, /* for [[LITERAL_CHARCLASS]] only */
reverse = FALSE; /* require a non-match rather than a match */ reverse = FALSE; /* require a non-match rather than a match */
@<Extract the character class to match from the pattern@>; <<Extract the character class to match from the pattern>>;
int rep_from = 1, rep_to = 1; /* minimum and maximum number of repetitions */ int rep_from = 1, rep_to = 1; /* minimum and maximum number of repetitions */
int greedy = TRUE; /* go for a maximal-length match if possible */ int greedy = TRUE; /* go for a maximal-length match if possible */
@<Extract repetition markers from the pattern@>; <<Extract repetition markers from the pattern>>;
int reps = 0; int reps = 0;
@<Count how many repetitions can be made here@>; <<Count how many repetitions can be made here>>;
if (reps < rep_from) return -1; if (reps < rep_from) return -1;
/* we can now accept anything from |rep_from| to |reps| repetitions */ /* we can now accept anything from [[rep_from| to |reps]] repetitions */
if (rep_from == reps) { at.tpos += reps; continue; } if (rep_from == reps) { at.tpos += reps; continue; }
@<Try all possible match lengths until we find a match@>; <<Try all possible match lengths until we find a match>>;
/* no match length worked, so no match */ /* no match length worked, so no match */
return -1; return -1;
} }
@<Copy the bracketed texts found into the global strings@>; <<Copy the bracketed texts found into the global strings>>;
return at.tpos; return at.tpos;
} }
@<Parentheses in the match pattern set up substrings to extract@> = <<Parentheses in the match pattern set up substrings to extract>>=
if (pattern[at.ppos] == '(') { if (pattern[at.ppos] == '(') {
if (at.bl < MAX_BRACKETED_SUBEXPRESSIONS) at.bracket_nesting[at.bl] = -1; if (at.bl < MAX_BRACKETED_SUBEXPRESSIONS) at.bracket_nesting[at.bl] = -1;
if (at.bc < MAX_BRACKETED_SUBEXPRESSIONS) { if (at.bc < MAX_BRACKETED_SUBEXPRESSIONS) {
@ -255,7 +257,7 @@ int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern,
continue; continue;
} }
@<Extract the character class to match from the pattern@> = <<Extract the character class to match from the pattern>>=
if (pattern[at.ppos] == 0) return -1; if (pattern[at.ppos] == 0) return -1;
int len = 0; int len = 0;
chcl = Regexp::get_cclass(pattern, at.ppos, &len, &range_from, &range_to, &reverse); chcl = Regexp::get_cclass(pattern, at.ppos, &len, &range_from, &range_to, &reverse);
@ -265,7 +267,7 @@ int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern,
@ This is standard regular-expression notation, except that I haven't bothered @ This is standard regular-expression notation, except that I haven't bothered
to implement numeric repetition counts, which we won't need: to implement numeric repetition counts, which we won't need:
@<Extract repetition markers from the pattern@> = <<Extract repetition markers from the pattern>>=
if (chcl == WHITESPACE_CHARCLASS) { if (chcl == WHITESPACE_CHARCLASS) {
rep_from = 1; rep_to = Str::len(text)-at.tpos; rep_from = 1; rep_to = Str::len(text)-at.tpos;
} }
@ -276,13 +278,13 @@ to implement numeric repetition counts, which we won't need:
} }
if (pattern[at.ppos] == '?') { greedy = FALSE; at.ppos++; } if (pattern[at.ppos] == '?') { greedy = FALSE; at.ppos++; }
@<Count how many repetitions can be made here@> = <<Count how many repetitions can be made here>>=
for (reps = 0; ((Str::get_at(text, at.tpos+reps)) && (reps < rep_to)); reps++) for (reps = 0; ((Str::get_at(text, at.tpos+reps)) && (reps < rep_to)); reps++)
if (Regexp::test_cclass(Str::get_at(text, at.tpos+reps), chcl, if (Regexp::test_cclass(Str::get_at(text, at.tpos+reps), chcl,
range_from, range_to, pattern, reverse) == FALSE) range_from, range_to, pattern, reverse) == FALSE)
break; break;
@<Try all possible match lengths until we find a match@> = <<Try all possible match lengths until we find a match>>=
int from = rep_from, to = reps, dj = 1, from_tpos = at.tpos; int from = rep_from, to = reps, dj = 1, from_tpos = at.tpos;
if (greedy) { from = reps; to = rep_from; dj = -1; } if (greedy) { from = reps; to = rep_from; dj = -1; }
for (int j = from; j != to+dj; j += dj) { for (int j = from; j != to+dj; j += dj) {
@ -291,7 +293,7 @@ to implement numeric repetition counts, which we won't need:
if (try >= 0) return try; if (try >= 0) return try;
} }
@<Copy the bracketed texts found into the global strings@> = <<Copy the bracketed texts found into the global strings>>=
if (mr) { if (mr) {
for (int i=0; i<at.bc; i++) { for (int i=0; i<at.bc; i++) {
Str::clear(mr->exp[i]); Str::clear(mr->exp[i]);
@ -303,38 +305,39 @@ to implement numeric repetition counts, which we won't need:
} }
@ So then: most characters in the pattern are taken literally (if the pattern @ So then: most characters in the pattern are taken literally (if the pattern
says |q|, the only match is with a lower-case letter "q"), except that: says [[q]], the only match is with a lower-case letter "q"), except that:
(a) a space means "one or more characters of white space"; (a) a space means "one or more characters of white space";
(b) |%d| means any decimal digit; (b) [[%d]] means any decimal digit;
(c) |%c| means any character at all; (c) [[%c]] means any character at all;
(d) |%C| means any character which isn't white space; (d) [[%C]] means any character which isn't white space;
(e) |%i| means any character from the identifier class (see above); (e) [[%i]] means any character from the identifier class (see above);
(f) |%p| means any character which can be used in the name of a Preform (f) [[%p]] means any character which can be used in the name of a Preform
nonterminal, which is to say, an identifier character or a hyphen; nonterminal, which is to say, an identifier character or a hyphen;
(g) |%P| means the same or else a colon; (g) [[%P]] means the same or else a colon;
(h) |%t| means a tab; (h) [[%t]] means a tab;
(i) |%q| means a double-quote. (i) [[%q]] means a double-quote.
|%| otherwise makes a literal escape; a space means any whitespace character; [[%]] otherwise makes a literal escape; a space means any whitespace character;
square brackets enclose literal alternatives, and note as usual with grep square brackets enclose literal alternatives, and note as usual with grep
engines that |[]xyz]| is legal and makes a set of four possibilities, the engines that [[[]xyz]]] is legal and makes a set of four possibilities, the
first of which is a literal close square; within a set, a hyphen makes a first of which is a literal close square; within a set, a hyphen makes a
character range; an initial |^| negates the result; and otherwise everything character range; an initial [[^]] negates the result; and otherwise everything
is literal. is literal.
@d ANY_CHARCLASS 1 <<*>>=
@d DIGIT_CHARCLASS 2 #define ANY_CHARCLASS 1
@d WHITESPACE_CHARCLASS 3 #define DIGIT_CHARCLASS 2
@d NONWHITESPACE_CHARCLASS 4 #define WHITESPACE_CHARCLASS 3
@d IDENTIFIER_CHARCLASS 5 #define NONWHITESPACE_CHARCLASS 4
@d PREFORM_CHARCLASS 6 #define IDENTIFIER_CHARCLASS 5
@d PREFORMC_CHARCLASS 7 #define PREFORM_CHARCLASS 6
@d LITERAL_CHARCLASS 8 #define PREFORMC_CHARCLASS 7
@d TAB_CHARCLASS 9 #define LITERAL_CHARCLASS 8
@d QUOTE_CHARCLASS 10 #define TAB_CHARCLASS 9
#define QUOTE_CHARCLASS 10
= <<*>>=
int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to, int *reverse) { int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to, int *reverse) {
if (pattern[ppos] == '^') { ppos++; *reverse = TRUE; } else { *reverse = FALSE; } if (pattern[ppos] == '^') { ppos++; *reverse = TRUE; } else { *reverse = FALSE; }
switch (pattern[ppos]) { switch (pattern[ppos]) {
@ -364,7 +367,7 @@ int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to,
*len = 1; *from = ppos; *to = ppos; return LITERAL_CHARCLASS; *len = 1; *from = ppos; *to = ppos; return LITERAL_CHARCLASS;
} }
@ = <<*>>=
int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *drawn_from, int reverse) { int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *drawn_from, int reverse) {
int match = FALSE; int match = FALSE;
switch (chcl) { switch (chcl) {
@ -375,11 +378,11 @@ int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *
case NONWHITESPACE_CHARCLASS: if (!(Characters::is_whitespace(c))) match = TRUE; break; case NONWHITESPACE_CHARCLASS: if (!(Characters::is_whitespace(c))) match = TRUE; break;
case QUOTE_CHARCLASS: if (c != '\"') match = TRUE; break; case QUOTE_CHARCLASS: if (c != '\"') match = TRUE; break;
case IDENTIFIER_CHARCLASS: if (Regexp::identifier_char(c)) match = TRUE; break; case IDENTIFIER_CHARCLASS: if (Regexp::identifier_char(c)) match = TRUE; break;
case PREFORM_CHARCLASS: if ((c == '-') || (c == '_') || case PREFORM_CHARCLASS: if ((c == '-') [[| (c == '_') |]]
((c >= 'a') && (c <= 'z')) || ((c >= 'a') && (c <= 'z')) [[]]
((c >= '0') && (c <= '9'))) match = TRUE; break; ((c >= '0') && (c <= '9'))) match = TRUE; break;
case PREFORMC_CHARCLASS: if ((c == '-') || (c == '_') || (c == ':') || case PREFORMC_CHARCLASS: if ((c == '-') [[| (c == '_') || (c == ':') |]]
((c >= 'a') && (c <= 'z')) || ((c >= 'a') && (c <= 'z')) [[]]
((c >= '0') && (c <= '9'))) match = TRUE; break; ((c >= '0') && (c <= '9'))) match = TRUE; break;
case LITERAL_CHARCLASS: case LITERAL_CHARCLASS:
if ((range_to > range_from) && (drawn_from[range_from] == '^')) { if ((range_to > range_from) && (drawn_from[range_from] == '^')) {
@ -398,20 +401,21 @@ int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *
return match; return match;
} }
@h Replacement. @ \section{Replacement.}
And this routine conveniently handles searching and replacing. This time we And this routine conveniently handles searching and replacing. This time we
can match at substrings of the |text| (i.e., we are not forced to match can match at substrings of the [[text]] (i.e., we are not forced to match
from the start right to the end), and multiple replacements can be made. from the start right to the end), and multiple replacements can be made.
For example, For example,
= (text as code)
Regexp::replace(text, L"[aeiou]", L"!", REP_REPEATING); Regexp::replace(text, L"[aeiou]", L"!", REP_REPEATING);
=
will turn the |text| "goose eggs" into "g!!s! !ggs".
@d REP_REPEATING 1 will turn the [[text]] "goose eggs" into "g!!s! !ggs".
@d REP_ATSTART 2
= <<*>>=
#define REP_REPEATING 1
#define REP_ATSTART 2
<<*>>=
int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, int options) { int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, int options) {
TEMPORARY_TEXT(altered) TEMPORARY_TEXT(altered)
match_results mr = Regexp::create_mr(); match_results mr = Regexp::create_mr();
@ -439,10 +443,10 @@ int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, i
changes++; changes++;
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
L = Str::len(text); i = L-left-1; L = Str::len(text); i = L-left-1;
if ((options & REP_REPEATING) == 0) { @<Add the rest@>; break; } if ((options & REP_REPEATING) == 0) { <<Add the rest>>; break; }
continue; continue;
} else PUT_TO(altered, Str::get_at(text, i)); } else PUT_TO(altered, Str::get_at(text, i));
if (options & REP_ATSTART) { @<Add the rest@>; break; } if (options & REP_ATSTART) { <<Add the rest>>; break; }
} }
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
if (changes > 0) Str::copy(text, altered); if (changes > 0) Str::copy(text, altered);
@ -450,6 +454,6 @@ int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, i
return changes; return changes;
} }
@<Add the rest@> = <<Add the rest>>=
for (i++; i<L; i++) for (i++; i<L; i++)
PUT_TO(altered, Str::get_at(text, i)); PUT_TO(altered, Str::get_at(text, i));

View file

@ -3,28 +3,29 @@
A simple, general-purpose preprocessor for text files, expanding macros and A simple, general-purpose preprocessor for text files, expanding macros and
performing repetitions. performing repetitions.
@h Scanner. @ \section{Scanner.}
Writing a general-purpose preprocessor really is coding like it's 1974, but Writing a general-purpose preprocessor really is coding like it's 1974, but
it turns out to be useful for multiple applications in the Inform project, and it turns out to be useful for multiple applications in the Inform project, and
saves us having to have dependencies on behemoths like the mighty |m4|. saves us having to have dependencies on behemoths like the mighty [[m4]].
For documentation on the markup notation, see //inweb: Webs, Tangling and Weaving//. For documentation on the markup notation, see //inweb: Webs, Tangling and Weaving//.
To use the preprocessor, call: To use the preprocessor, call:
= (text as InC)
Preprocessor::preprocess(from, to, header, special_macros, specifics, encoding) Preprocessor::preprocess(from, to, header, special_macros, specifics, encoding)
=
where |from| and |to| are filenames, |header| is text to place at the top of
the file (if any), |special_macros| is a |linked_list| of |preprocessor_macro|s
set up with special meanings to the situation, and |specifics| is a general
pointer to any data those special meanings need to use. |encoding| should be
one of |UTF8_ENC| or |ISO_ENC|.
@d PROTECTED_OPEN_BRACE_PPCHAR 0x25A0 where [[from]] and [[to]] are filenames, [[header]] is text to place at the top of
@d PROTECTED_CLOSE_BRACE_PPCHAR 0x25A1 the file (if any), [[special_macros]] is a [[linked_list]] of [[preprocessor_macro]]s
@d PROTECTED_BLANK_PPCHAR 0x25A2 set up with special meanings to the situation, and [[specifics]] is a general
pointer to any data those special meanings need to use. [[encoding]] should be
one of [[UTF8_ENC]] or [[ISO_ENC]].
= <<*>>=
#define PROTECTED_OPEN_BRACE_PPCHAR 0x25A0
#define PROTECTED_CLOSE_BRACE_PPCHAR 0x25A1
#define PROTECTED_BLANK_PPCHAR 0x25A2
<<*>>=
void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *header, void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *header,
linked_list *special_macros, general_pointer specifics, wchar_t comment_char, linked_list *special_macros, general_pointer specifics, wchar_t comment_char,
int encoding) { int encoding) {
@ -35,7 +36,7 @@ void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *hea
WRITE("%S", header); WRITE("%S", header);
preprocessor_state PPS; preprocessor_state PPS;
@<Initialise the preprocessor state@>; <<Initialise the preprocessor state>>;
TextFiles::read(prototype, FALSE, "can't open prototype file", TextFiles::read(prototype, FALSE, "can't open prototype file",
TRUE, Preprocessor::scan_line, NULL, &PPS); TRUE, Preprocessor::scan_line, NULL, &PPS);
for (int i=0; i<Str::len(PPS.dest); i++) { for (int i=0; i<Str::len(PPS.dest); i++) {
@ -50,9 +51,10 @@ void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *hea
@ The following imposing-looking set of state data is used as we work through @ The following imposing-looking set of state data is used as we work through
the prototype file line-by-line: the prototype file line-by-line:
@d MAX_PREPROCESSOR_LOOP_DEPTH 8 <<*>>=
#define MAX_PREPROCESSOR_LOOP_DEPTH 8
= <<*>>=
typedef struct preprocessor_state { typedef struct preprocessor_state {
struct text_stream *dest; struct text_stream *dest;
struct preprocessor_macro *defining; /* a "define" body being scanned */ struct preprocessor_macro *defining; /* a "define" body being scanned */
@ -63,19 +65,19 @@ typedef struct preprocessor_state {
int last_line_was_blank; /* used to suppress runs of multiple blank lines */ int last_line_was_blank; /* used to suppress runs of multiple blank lines */
struct preprocessor_variable_set *global_variables; struct preprocessor_variable_set *global_variables;
struct preprocessor_variable_set *stack_frame; struct preprocessor_variable_set *stack_frame;
struct linked_list *known_macros; /* of |preprocessor_macro| */ struct linked_list *known_macros; /* of [[preprocessor_macro]] */
struct general_pointer specifics; struct general_pointer specifics;
wchar_t comment_character; wchar_t comment_character;
} preprocessor_state; } preprocessor_state;
typedef struct preprocessor_loop { typedef struct preprocessor_loop {
struct text_stream *loop_var_name; struct text_stream *loop_var_name;
struct linked_list *iterations; /* of |text_stream| */ struct linked_list *iterations; /* of [[text_stream]] */
int repeat_is_block; int repeat_is_block;
struct text_stream *repeat_saved_dest; struct text_stream *repeat_saved_dest;
} preprocessor_loop; } preprocessor_loop;
@<Initialise the preprocessor state@> = <<Initialise the preprocessor state>>=
PPS.dest = Str::new(); PPS.dest = Str::new();
PPS.suppress_newline = FALSE; PPS.suppress_newline = FALSE;
PPS.last_line_was_blank = TRUE; PPS.last_line_was_blank = TRUE;
@ -91,7 +93,7 @@ typedef struct preprocessor_loop {
@ Conceptually, each loop runs a variable with a given name through a series @ Conceptually, each loop runs a variable with a given name through a series
of textual values in sequence, and we store that data here: of textual values in sequence, and we store that data here:
= <<*>>=
void Preprocessor::set_loop_var_name(preprocessor_loop *loop, text_stream *name) { void Preprocessor::set_loop_var_name(preprocessor_loop *loop, text_stream *name) {
loop->loop_var_name = Str::duplicate(name); loop->loop_var_name = Str::duplicate(name);
} }
@ -102,31 +104,31 @@ void Preprocessor::add_loop_iteration(preprocessor_loop *loop, text_stream *valu
@ Lines from the prototype (or sometimes from files spliced in) are read, one @ Lines from the prototype (or sometimes from files spliced in) are read, one
at a time, by the following. at a time, by the following.
Note that |define| and |end-define| are not themselves macros, and are handled Note that [[define]] and [[end-define]] are not themselves macros, and are handled
directly here. So you cannot use repeat loops to define multiple macros with directly here. So you cannot use repeat loops to define multiple macros with
parametrised names: but then, nor should you. parametrised names: but then, nor should you.
= <<*>>=
void Preprocessor::scan_line(text_stream *line, text_file_position *tfp, void *X) { void Preprocessor::scan_line(text_stream *line, text_file_position *tfp, void *X) {
preprocessor_state *PPS = (preprocessor_state *) X; preprocessor_state *PPS = (preprocessor_state *) X;
@<Skip comments@>; <<Skip comments>>;
@<Make backslash literals safe@>; <<Make backslash literals safe>>;
@<Deal with textual definitions of new macros@>; <<Deal with textual definitions of new macros>>;
Preprocessor::expand(line, tfp, PPS); Preprocessor::expand(line, tfp, PPS);
@<Sometimes, but only sometimes, output a newline@>; <<Sometimes, but only sometimes, output a newline>>;
} }
@ A line is a comment to the preprocessor if its first non-whitespace character @ A line is a comment to the preprocessor if its first non-whitespace character
is the special comment character: often |#|, but not necessarily. is the special comment character: often [[#]], but not necessarily.
@<Skip comments@> = <<Skip comments>>=
LOOP_THROUGH_TEXT(pos, line) { LOOP_THROUGH_TEXT(pos, line) {
wchar_t c = Str::get(pos); wchar_t c = Str::get(pos);
if (c == PPS->comment_character) return; if (c == PPS->comment_character) return;
if (Characters::is_whitespace(c) == FALSE) break; if (Characters::is_whitespace(c) == FALSE) break;
} }
@<Make backslash literals safe@> = <<Make backslash literals safe>>=
for (int i = 0; i < Str::len(line); i++) { for (int i = 0; i < Str::len(line); i++) {
wchar_t c = Str::get_at(line, i); wchar_t c = Str::get_at(line, i);
if (c == '\\') { if (c == '\\') {
@ -151,15 +153,15 @@ is the special comment character: often |#|, but not necessarily.
} }
} }
@<Deal with textual definitions of new macros@> = <<Deal with textual definitions of new macros>>=
match_results mr = Regexp::create_mr(); match_results mr = Regexp::create_mr();
if (Regexp::match(&mr, line, L" *{define: *(%C+) *} *")) @<Begin a bare definition@>; if (Regexp::match(&mr, line, L" *{define: *(%C+) *} *")) <<Begin a bare definition>>;
if (Regexp::match(&mr, line, L" *{define: *(%C+) (%c*)} *")) @<Begin a definition@>; if (Regexp::match(&mr, line, L" *{define: *(%C+) (%c*)} *")) <<Begin a definition>>;
if (Regexp::match(&mr, line, L" *{end-define} *")) @<End a definition@>; if (Regexp::match(&mr, line, L" *{end-define} *")) <<End a definition>>;
if (PPS->defining) @<Continue a definition@>; if (PPS->defining) <<Continue a definition>>;
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
@<Begin a bare definition@> = <<Begin a bare definition>>=
if (PPS->defining) if (PPS->defining)
Errors::in_text_file("nested definitions are not allowed", tfp); Errors::in_text_file("nested definitions are not allowed", tfp);
text_stream *name = mr.exp[0]; text_stream *name = mr.exp[0];
@ -169,7 +171,7 @@ is the special comment character: often |#|, but not necessarily.
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
return; return;
@<Begin a definition@> = <<Begin a definition>>=
if (PPS->defining) if (PPS->defining)
Errors::in_text_file("nested definitions are not allowed", tfp); Errors::in_text_file("nested definitions are not allowed", tfp);
text_stream *name = mr.exp[0]; text_stream *name = mr.exp[0];
@ -179,19 +181,19 @@ is the special comment character: often |#|, but not necessarily.
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
return; return;
@<Continue a definition@> = <<Continue a definition>>=
Preprocessor::add_line_to_macro(PPS->defining, line, tfp); Preprocessor::add_line_to_macro(PPS->defining, line, tfp);
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
return; return;
@<End a definition@> = <<End a definition>>=
if (PPS->defining == NULL) if (PPS->defining == NULL)
Errors::in_text_file("{end-define} without {define: ...}", tfp); Errors::in_text_file("{end-define} without {define: ...}", tfp);
PPS->defining = NULL; PPS->defining = NULL;
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
return; return;
@<Sometimes, but only sometimes, output a newline@> = <<Sometimes, but only sometimes, output a newline>>=
if (PPS->suppress_newline == FALSE) { if (PPS->suppress_newline == FALSE) {
text_stream *OUT = PPS->dest; text_stream *OUT = PPS->dest;
if (Str::len(line) == 0) { if (Str::len(line) == 0) {
@ -208,13 +210,13 @@ is the special comment character: often |#|, but not necessarily.
(i) Does not contain any newlines; (i) Does not contain any newlines;
(ii) Contains braces |{ ... }| used in nested pairs (unless there is a syntax (ii) Contains braces [[{ ... }]] used in nested pairs (unless there is a syntax
error in the prototype, in which case we must complain). error in the prototype, in which case we must complain).
The idea is to pass everything straight through except any braced matter, The idea is to pass everything straight through except any braced matter,
which needs special attention. which needs special attention.
= <<*>>=
void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocessor_state *PPS) { void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocessor_state *PPS) {
TEMPORARY_TEXT(before_matter) TEMPORARY_TEXT(before_matter)
TEMPORARY_TEXT(braced_matter) TEMPORARY_TEXT(braced_matter)
@ -238,7 +240,7 @@ void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocess
} }
if (bl > 0) Errors::in_text_file("too many '{'s", tfp); if (bl > 0) Errors::in_text_file("too many '{'s", tfp);
if (after_times) { if (after_times) {
@<Expand braced matter@>; <<Expand braced matter>>;
} else { } else {
WRITE_TO(PPS->dest, "%S", text); WRITE_TO(PPS->dest, "%S", text);
} }
@ -247,13 +249,13 @@ void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocess
DISCARD_TEXT(after_matter) DISCARD_TEXT(after_matter)
} }
@ Suppose we are expanding the text |this {ADJECTIVE} ocean {BEHAVIOUR}|: then @ Suppose we are expanding the text [[this {ADJECTIVE} ocean {BEHAVIOUR}]]: then
the |before_matter| will be |this |, the |braced_matter| will be |ADJECTIVE|, the [[before_matter]] will be [[this ]], the [[braced_matter]] will be [[ADJECTIVE]],
and the |after_matter| will be | ocean {BEHAVIOUR}|. and the [[after_matter]] will be [[ ocean {BEHAVIOUR}]].
@<Expand braced matter@> = <<Expand braced matter>>=
if (Preprocessor::acceptable_variable_name(braced_matter)) { if (Preprocessor::acceptable_variable_name(braced_matter)) {
@<Expand a variable name@>; <<Expand a variable name>>;
} else { } else {
text_stream *identifier = braced_matter; text_stream *identifier = braced_matter;
text_stream *parameter_settings = NULL; text_stream *parameter_settings = NULL;
@ -262,7 +264,7 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|.
identifier = mr.exp[0]; identifier = mr.exp[0];
parameter_settings = mr.exp[1]; parameter_settings = mr.exp[1];
} }
@<Work out which macro identifier is meant by a loop name@>; <<Work out which macro identifier is meant by a loop name>>;
preprocessor_macro *mm = Preprocessor::find_macro(PPS->known_macros, identifier); preprocessor_macro *mm = Preprocessor::find_macro(PPS->known_macros, identifier);
if (mm == NULL) { if (mm == NULL) {
@ -271,15 +273,15 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|.
Errors::in_text_file_S(erm, tfp); Errors::in_text_file_S(erm, tfp);
DISCARD_TEXT(erm) DISCARD_TEXT(erm)
} else { } else {
@<Expand a macro@>; <<Expand a macro>>;
} }
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
} }
@ So, for example, the identifier |repeat| would be changed here either to @ So, for example, the identifier [[repeat]] would be changed here either to
|repeat-block| or |repeat-span|: see above for an explanation. [[repeat-block]] or [[repeat-span]]: see above for an explanation.
@<Work out which macro identifier is meant by a loop name@> = <<Work out which macro identifier is meant by a loop name>>=
preprocessor_macro *loop_mm; preprocessor_macro *loop_mm;
LOOP_OVER_LINKED_LIST(loop_mm, preprocessor_macro, PPS->known_macros) LOOP_OVER_LINKED_LIST(loop_mm, preprocessor_macro, PPS->known_macros)
if (Str::len(loop_mm->loop_name) > 0) { if (Str::len(loop_mm->loop_name) > 0) {
@ -308,27 +310,27 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|.
} }
@ Note that if we are inside a loop, we do not perform expansion on the variable @ Note that if we are inside a loop, we do not perform expansion on the variable
name, and instead pass it through unchanged -- still as, say, |{NAME}|. This name, and instead pass it through unchanged -- still as, say, [[{NAME}]]. This
is because it won't be expanded until later, when the expander reaches the is because it won't be expanded until later, when the expander reaches the
end of the loop body. end of the loop body.
@<Expand a variable name@> = <<Expand a variable name>>=
Preprocessor::expand(before_matter, tfp, PPS); Preprocessor::expand(before_matter, tfp, PPS);
if (PPS->repeat_sp > 0) { if (PPS->repeat_sp > 0) {
WRITE_TO(PPS->dest, "{%S}", braced_matter); WRITE_TO(PPS->dest, "{%S}", braced_matter);
} else { } else {
@<Definitely expand a variable name@>; <<Definitely expand a variable name>>;
} }
Preprocessor::expand(after_matter, tfp, PPS); Preprocessor::expand(after_matter, tfp, PPS);
@ Similarly, we don't expand macros inside the body of a loop, except that we @ Similarly, we don't expand macros inside the body of a loop, except that we
need to expand the |{end-repeat-block}| (or similar) which closes that loop need to expand the [[{end-repeat-block}]] (or similar) which closes that loop
body, so that we can escape back into normal mode. Because loop constructs body, so that we can escape back into normal mode. Because loop constructs
may be nested, we need to react to (but not expand) loop openings, too. may be nested, we need to react to (but not expand) loop openings, too.
The "shadow stack pointer" shows how deep we are inside these shadowy, The "shadow stack pointer" shows how deep we are inside these shadowy,
not-yet-acted-on, loops. not-yet-acted-on, loops.
@<Expand a macro@> = <<Expand a macro>>=
if (mm->suppress_whitespace_when_expanding) { if (mm->suppress_whitespace_when_expanding) {
while (Characters::is_whitespace(Str::get_last_char(before_matter))) while (Characters::is_whitespace(Str::get_last_char(before_matter)))
Str::delete_last_character(before_matter); Str::delete_last_character(before_matter);
@ -348,16 +350,16 @@ not-yet-acted-on, loops.
if ((divert_if_repeating) && (PPS->repeat_sp > 0)) { if ((divert_if_repeating) && (PPS->repeat_sp > 0)) {
WRITE_TO(PPS->dest, "{%S}", braced_matter); WRITE_TO(PPS->dest, "{%S}", braced_matter);
} else { } else {
@<Definitely expand a macro@>; <<Definitely expand a macro>>;
if (mm->suppress_newline_after_expanding) PPS->suppress_newline = TRUE; if (mm->suppress_newline_after_expanding) PPS->suppress_newline = TRUE;
} }
Preprocessor::expand(after_matter, tfp, PPS); Preprocessor::expand(after_matter, tfp, PPS);
@ We can now forget about the |before_matter|, the |after_matter|, or whether @ We can now forget about the [[before_matter]], the [[after_matter]], or whether
we ought not to expand after all: that's all taken care of. A variable expands we ought not to expand after all: that's all taken care of. A variable expands
to its value: to its value:
@<Definitely expand a variable name@> = <<Definitely expand a variable name>>=
preprocessor_variable *var = preprocessor_variable *var =
Preprocessor::find_variable(braced_matter, PPS->stack_frame); Preprocessor::find_variable(braced_matter, PPS->stack_frame);
if (var) { if (var) {
@ -370,24 +372,24 @@ to its value:
} }
@ This looks fussy, but really it delegates the work by calling a function @ This looks fussy, but really it delegates the work by calling a function
attached to the macro, the |expander|. attached to the macro, the [[expander]].
@<Definitely expand a macro@> = <<Definitely expand a macro>>=
text_stream *parameter_values[MAX_PP_MACRO_PARAMETERS]; text_stream *parameter_values[MAX_PP_MACRO_PARAMETERS];
for (int i=0; i<MAX_PP_MACRO_PARAMETERS; i++) parameter_values[i] = NULL; for (int i=0; i<MAX_PP_MACRO_PARAMETERS; i++) parameter_values[i] = NULL;
@<Parse the parameters supplied@>; <<Parse the parameters supplied>>;
@<Check that all compulsory parameters have been supplied@>; <<Check that all compulsory parameters have been supplied>>;
preprocessor_loop *loop = NULL; preprocessor_loop *loop = NULL;
if (mm->begins_loop) @<Initialise repetition data for the loop@>; if (mm->begins_loop) <<Initialise repetition data for the loop>>;
(*(mm->expander))(mm, PPS, parameter_values, loop, tfp); (*(mm->expander))(mm, PPS, parameter_values, loop, tfp);
@ Note that textual values of the parameters are themselves expanded before @ Note that textual values of the parameters are themselves expanded before
use: they might contain variables, or even macros. Parameter names are not. use: they might contain variables, or even macros. Parameter names are not.
So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|. So you can have [[in: {WHATEVER}]] but not [[{WHATEVER}: this]].
@<Parse the parameters supplied@> = <<Parse the parameters supplied>>=
match_results mr = Regexp::create_mr(); match_results mr = Regexp::create_mr();
while (Regexp::match(&mr, parameter_settings, L" *(%C+): *(%c*)")) { while (Regexp::match(&mr, parameter_settings, L" *(%C+): *(%c*)")) {
text_stream *setting = mr.exp[0]; text_stream *setting = mr.exp[0];
@ -422,7 +424,7 @@ So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|.
if (Str::is_whitespace(parameter_settings) == FALSE) if (Str::is_whitespace(parameter_settings) == FALSE)
Errors::in_text_file("parameter list is malformed", tfp); Errors::in_text_file("parameter list is malformed", tfp);
@<Check that all compulsory parameters have been supplied@> = <<Check that all compulsory parameters have been supplied>>=
for (int i=0; i<mm->no_parameters; i++) for (int i=0; i<mm->no_parameters; i++)
if (parameter_values[i] == NULL) if (parameter_values[i] == NULL)
if (mm->parameters[i]->optional == FALSE) { if (mm->parameters[i]->optional == FALSE) {
@ -432,11 +434,11 @@ So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|.
DISCARD_TEXT(erm) DISCARD_TEXT(erm)
} }
@ The following code is a little misleading. At present, |PPS->repeat_sp| is @ The following code is a little misleading. At present, [[PPS->repeat_sp]] is
always either 0 or 1, no matter how deep loop nesting is: but that's just an always either 0 or 1, no matter how deep loop nesting is: but that's just an
artefact of the current scanning algorithm, which might some day change. artefact of the current scanning algorithm, which might some day change.
@<Initialise repetition data for the loop@> = <<Initialise repetition data for the loop>>=
if (PPS->repeat_sp >= MAX_PREPROCESSOR_LOOP_DEPTH) { if (PPS->repeat_sp >= MAX_PREPROCESSOR_LOOP_DEPTH) {
Errors::in_text_file("repetition too deep", tfp); Errors::in_text_file("repetition too deep", tfp);
} else { } else {
@ -450,10 +452,10 @@ artefact of the current scanning algorithm, which might some day change.
PPS->dest = Str::new(); PPS->dest = Str::new();
} }
@h Variables. @ \section{Variables.}
Names of variables should conform to: Names of variables should conform to:
= <<*>>=
int Preprocessor::acceptable_variable_name(text_stream *name) { int Preprocessor::acceptable_variable_name(text_stream *name) {
LOOP_THROUGH_TEXT(pos, name) { LOOP_THROUGH_TEXT(pos, name) {
wchar_t c = Str::get(pos); wchar_t c = Str::get(pos);
@ -467,7 +469,7 @@ int Preprocessor::acceptable_variable_name(text_stream *name) {
@ Variables are all textual: @ Variables are all textual:
= <<*>>=
typedef struct preprocessor_variable { typedef struct preprocessor_variable {
struct text_stream *name; struct text_stream *name;
struct text_stream *value; struct text_stream *value;
@ -483,13 +485,13 @@ void Preprocessor::write_variable(preprocessor_variable *var, text_stream *val)
var->value = Str::duplicate(val); var->value = Str::duplicate(val);
} }
@ Each variable belongs to a single "set". If |EXAMPLE| has one meaning outside a @ Each variable belongs to a single "set". If [[EXAMPLE]] has one meaning outside a
definition and another insider, that's two variables with a common name, not definition and another insider, that's two variables with a common name, not
one variable belonging to two sets at once. one variable belonging to two sets at once.
= <<*>>=
typedef struct preprocessor_variable_set { typedef struct preprocessor_variable_set {
struct linked_list *variables; /* of |preprocessor_variable| */ struct linked_list *variables; /* of [[preprocessor_variable]] */
struct preprocessor_variable_set *outer; struct preprocessor_variable_set *outer;
CLASS_DEFINITION CLASS_DEFINITION
} preprocessor_variable_set; } preprocessor_variable_set;
@ -524,7 +526,7 @@ preprocessor_variable *Preprocessor::find_variable(text_stream *name,
@ This creates a variable if it doesn't already exist in the given set. (If @ This creates a variable if it doesn't already exist in the given set. (If
it exists in some outer set, that doesn't count.) it exists in some outer set, that doesn't count.)
= <<*>>=
preprocessor_variable *Preprocessor::ensure_variable(text_stream *name, preprocessor_variable *Preprocessor::ensure_variable(text_stream *name,
preprocessor_variable_set *in_set) { preprocessor_variable_set *in_set) {
if (in_set == NULL) internal_error("variable without set"); if (in_set == NULL) internal_error("variable without set");
@ -538,29 +540,30 @@ preprocessor_variable *Preprocessor::ensure_variable(text_stream *name,
return var; return var;
} }
@h Macros. @ \section{Macros.}
For the most part, each macro seen by users corresponds to a single For the most part, each macro seen by users corresponds to a single
//preprocessor_macro//, but loop constructs are an exception. When the user //preprocessor_macro//, but loop constructs are an exception. When the user
types |{repeat ...}|, this is a reference to |repeat-block| if the body of types [[{repeat ...}]], this is a reference to [[repeat-block]] if the body of
what to repeat occupies multiple lines, but to |repeat-span| if only one. what to repeat occupies multiple lines, but to [[repeat-span]] if only one.
For example, the first [[repeat]] loop here uses the macros [[repeat-block]] and
[[end-repeat-block]], and the second uses [[repeat-span]] and [[end-repeat-span]].
For example, the first |repeat| loop here uses the macros |repeat-block| and
|end-repeat-block|, and the second uses |repeat-span| and |end-repeat-span|.
= (text)
{repeat with SEA in Black, Caspian} {repeat with SEA in Black, Caspian}
Welcome to the SEA Sea. Welcome to the SEA Sea.
{end-repeat} {end-repeat}
... ...
Seas available:{repeat with SEA in Sargasso, Libyan} {SEA} Sea;{end-repeat} Seas available:{repeat with SEA in Sargasso, Libyan} {SEA} Sea;{end-repeat}
=
@ There are (for now, anyway) hard but harmlessly large limits on the number of @ There are (for now, anyway) hard but harmlessly large limits on the number of
parameters and the length of a macro: parameters and the length of a macro:
@d MAX_PP_MACRO_PARAMETERS 8 <<*>>=
@d MAX_PP_MACRO_LINES 128 #define MAX_PP_MACRO_PARAMETERS 8
#define MAX_PP_MACRO_LINES 128
= <<*>>=
typedef struct preprocessor_macro { typedef struct preprocessor_macro {
/* syntax */ /* syntax */
struct text_stream *identifier; struct text_stream *identifier;
@ -573,10 +576,10 @@ typedef struct preprocessor_macro {
void (*expander)(struct preprocessor_macro *, struct preprocessor_state *, struct text_stream **, struct preprocessor_loop *, struct text_file_position *); void (*expander)(struct preprocessor_macro *, struct preprocessor_state *, struct text_stream **, struct preprocessor_loop *, struct text_file_position *);
/* loop construct if any */ /* loop construct if any */
int begins_loop; /* |TRUE| for e.g. |repeat-block| or |repeat-span| */ int begins_loop; /* [[TRUE| for e.g. |repeat-block| or |repeat-span]] */
int ends_loop; /* |TRUE| for e.g. |end-repeat-block| */ int ends_loop; /* [[TRUE| for e.g. |end-repeat-block]] */
struct text_stream *loop_name; /* e.g. |repeat| */ struct text_stream *loop_name; /* e.g. [[repeat]] */
int span; /* |TRUE| for e.g. |end-repeat-span| or |repeat-span| */ int span; /* [[TRUE| for e.g. |end-repeat-span| or |repeat-span]] */
/* textual behaviour */ /* textual behaviour */
int suppress_newline_after_expanding; int suppress_newline_after_expanding;
@ -592,16 +595,16 @@ typedef struct preprocessor_macro_parameter {
CLASS_DEFINITION CLASS_DEFINITION
} preprocessor_macro_parameter; } preprocessor_macro_parameter;
@ The following creates a new macro and adds it to the list |L|. By default, it @ The following creates a new macro and adds it to the list [[L]]. By default, it
has an empty definition (i.e., no lines), but may have a meaning provided by its has an empty definition (i.e., no lines), but may have a meaning provided by its
|expander| function regardless. The |parameter_specification| is as in the [[expander]] function regardless. The [[parameter_specification]] is as in the
textual declaration: for example, |in: IN ?towards: WAY| would be valid, with textual declaration: for example, [[in: IN ?towards: WAY]] would be valid, with
|in| being compulsory and |towards| optional when the macro is used. [[in]] being compulsory and [[towards]] optional when the macro is used.
If we expected 10000 macros, a dictionary would be better than a list. But in If we expected 10000 macros, a dictionary would be better than a list. But in
fact we expect more like 10. fact we expect more like 10.
= <<*>>=
preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name, preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
text_stream *parameter_specification, text_stream *parameter_specification,
void (*expander)(preprocessor_macro *, preprocessor_state *, text_stream **, preprocessor_loop *, text_file_position *), void (*expander)(preprocessor_macro *, preprocessor_state *, text_stream **, preprocessor_loop *, text_file_position *),
@ -609,13 +612,13 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
if (Preprocessor::find_macro(L, name)) if (Preprocessor::find_macro(L, name))
Errors::in_text_file("a macro with this name already exists", tfp); Errors::in_text_file("a macro with this name already exists", tfp);
preprocessor_macro *new_macro = CREATE(preprocessor_macro); preprocessor_macro *new_macro = CREATE(preprocessor_macro);
@<Initialise the macro@>; <<Initialise the macro>>;
@<Parse the parameter list@>; <<Parse the parameter list>>;
ADD_TO_LINKED_LIST(new_macro, preprocessor_macro, L); ADD_TO_LINKED_LIST(new_macro, preprocessor_macro, L);
return new_macro; return new_macro;
} }
@<Initialise the macro@> = <<Initialise the macro>>=
new_macro->identifier = Str::duplicate(name); new_macro->identifier = Str::duplicate(name);
new_macro->no_parameters = 0; new_macro->no_parameters = 0;
@ -629,7 +632,7 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
new_macro->suppress_newline_after_expanding = TRUE; new_macro->suppress_newline_after_expanding = TRUE;
new_macro->suppress_whitespace_when_expanding = TRUE; new_macro->suppress_whitespace_when_expanding = TRUE;
@<Parse the parameter list@> = <<Parse the parameter list>>=
text_stream *spec = Str::duplicate(parameter_specification); text_stream *spec = Str::duplicate(parameter_specification);
match_results mr = Regexp::create_mr(); match_results mr = Regexp::create_mr();
while (Regexp::match(&mr, spec, L" *(%C+): *(%C+) *(%c*)")) { while (Regexp::match(&mr, spec, L" *(%C+): *(%C+) *(%c*)")) {
@ -640,14 +643,14 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
if (new_macro->no_parameters >= MAX_PP_MACRO_PARAMETERS) { if (new_macro->no_parameters >= MAX_PP_MACRO_PARAMETERS) {
Errors::in_text_file("too many parameters in this definition", tfp); Errors::in_text_file("too many parameters in this definition", tfp);
} else { } else {
@<Add parameter to macro@>; <<Add parameter to macro>>;
} }
} }
Regexp::dispose_of(&mr); Regexp::dispose_of(&mr);
if (Str::is_whitespace(spec) == FALSE) if (Str::is_whitespace(spec) == FALSE)
Errors::in_text_file("parameter list for this definition is malformed", tfp); Errors::in_text_file("parameter list for this definition is malformed", tfp);
@<Add parameter to macro@> = <<Add parameter to macro>>=
preprocessor_macro_parameter *new_parameter = CREATE(preprocessor_macro_parameter); preprocessor_macro_parameter *new_parameter = CREATE(preprocessor_macro_parameter);
new_parameter->name = Str::duplicate(par_name); new_parameter->name = Str::duplicate(par_name);
new_parameter->definition_token = Str::duplicate(token_name); new_parameter->definition_token = Str::duplicate(token_name);
@ -661,7 +664,7 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
@ We can then add lines to a macro (though this will only have an effect if its @ We can then add lines to a macro (though this will only have an effect if its
expander function is //Preprocessor::default_expander//). expander function is //Preprocessor::default_expander//).
= <<*>>=
void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line, void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line,
text_file_position *tfp) { text_file_position *tfp) {
if (mm->no_lines >= MAX_PP_MACRO_LINES) { if (mm->no_lines >= MAX_PP_MACRO_LINES) {
@ -671,15 +674,15 @@ void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line,
} }
} }
@h Reserved macros. @ \section{Reserved macros.}
A few macros are "reserved", that is, have built-in meanings, and use expander A few macros are "reserved", that is, have built-in meanings, and use expander
functions other than //Preprocessor::default_expander//. functions other than //Preprocessor::default_expander//.
Some of these, the |special_macros|, are supplied by the code calling the Some of these, the [[special_macros]], are supplied by the code calling the
preprocessor. Those will provide domain-specific functionality. But a few are preprocessor. Those will provide domain-specific functionality. But a few are
built in here and therefore work in every domain: built in here and therefore work in every domain:
= <<*>>=
linked_list *Preprocessor::list_of_reserved_macros(linked_list *special_macros) { linked_list *Preprocessor::list_of_reserved_macros(linked_list *special_macros) {
linked_list *L = NEW_LINKED_LIST(preprocessor_macro); linked_list *L = NEW_LINKED_LIST(preprocessor_macro);
Preprocessor::new_loop_macro(L, I"repeat", I"with: WITH in: IN", Preprocessor::new_loop_macro(L, I"repeat", I"with: WITH in: IN",
@ -736,7 +739,7 @@ void Preprocessor::new_loop_macro(linked_list *L, text_stream *name,
@ Finding a macro in a list: @ Finding a macro in a list:
= <<*>>=
preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name) { preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name) {
preprocessor_macro *mm; preprocessor_macro *mm;
LOOP_OVER_LINKED_LIST(mm, preprocessor_macro, L) LOOP_OVER_LINKED_LIST(mm, preprocessor_macro, L)
@ -745,13 +748,13 @@ preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name)
return NULL; return NULL;
} }
@h The expander for user-defined macros. @ \section{The expander for user-defined macros.}
All macros created by |{define: ...}| are expanded by the following function. All macros created by [[{define: ...}]] are expanded by the following function.
It creates a local "stack frame" making the parameters available as variables, It creates a local "stack frame" making the parameters available as variables,
then runs the definition lines through the scanner, then dismantles the stack then runs the definition lines through the scanner, then dismantles the stack
frame again. frame again.
= <<*>>=
void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state *PPS, void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state *PPS,
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
PPS->stack_frame = Preprocessor::new_variable_set(PPS->stack_frame); PPS->stack_frame = Preprocessor::new_variable_set(PPS->stack_frame);
@ -765,10 +768,10 @@ void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state *
PPS->stack_frame = PPS->stack_frame->outer; PPS->stack_frame = PPS->stack_frame->outer;
} }
@h The set expander. @ \section{The set expander.}
An easy one. An easy one.
= <<*>>=
void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS, void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS,
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
text_stream *name = parameter_values[0]; text_stream *name = parameter_values[0];
@ -781,9 +784,9 @@ void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS,
Preprocessor::write_variable(var, value); Preprocessor::write_variable(var, value);
} }
@h The repeat expander. @ \section{The repeat expander.}
= <<*>>=
void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *PPS, void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *PPS,
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
text_stream *with = parameter_values[0]; text_stream *with = parameter_values[0];
@ -803,7 +806,7 @@ void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *P
Preprocessor::add_loop_iteration(loop, value); Preprocessor::add_loop_iteration(loop, value);
} }
@h The expander used for all loop ends. @ \section{The expander used for all loop ends.}
The macros which open a loop just store up the name of the variable and the The macros which open a loop just store up the name of the variable and the
range of its values: otherwise, they do nothing. It's only when the end of a range of its values: otherwise, they do nothing. It's only when the end of a
loop is reached that any expansion happens, and this is where. loop is reached that any expansion happens, and this is where.
@ -812,7 +815,7 @@ We create a new stack frame inside the current one, and put the loop variable
into it. Then we run through the iteration values, setting the variable to into it. Then we run through the iteration values, setting the variable to
each in turn, and expand the material. each in turn, and expand the material.
= <<*>>=
void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state *PPS, void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state *PPS,
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) { text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
PPS->shadow_sp = 0; PPS->shadow_sp = 0;
@ -826,12 +829,12 @@ void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state
Preprocessor::ensure_variable(loop->loop_var_name, PPS->stack_frame); Preprocessor::ensure_variable(loop->loop_var_name, PPS->stack_frame);
text_stream *value; text_stream *value;
LOOP_OVER_LINKED_LIST(value, text_stream, loop->iterations) LOOP_OVER_LINKED_LIST(value, text_stream, loop->iterations)
@<Iterate with this value@>; <<Iterate with this value>>;
PPS->stack_frame = PPS->stack_frame->outer; PPS->stack_frame = PPS->stack_frame->outer;
} }
} }
@<Iterate with this value@> = <<Iterate with this value>>=
Preprocessor::write_variable(loop_var, value); Preprocessor::write_variable(loop_var, value);
if (mm->span) { if (mm->span) {
Preprocessor::expand(matter, tfp, PPS); Preprocessor::expand(matter, tfp, PPS);

View file

@ -2,10 +2,10 @@
Convenient routines for manipulating strings of text. Convenient routines for manipulating strings of text.
@h Strings are streams. @ \section{Strings are streams.}
Although Foundation provides limited facilities for handling standard or Although Foundation provides limited facilities for handling standard or
wide C-style strings -- that is, null-terminated arrays of |char| or wide C-style strings -- that is, null-terminated arrays of [[char]] or
|wchar_t| -- these are not encouraged. [[wchar_t]] -- these are not encouraged.
Instead, a standard string for a program using Foundation is nothing more than Instead, a standard string for a program using Foundation is nothing more than
a text stream (see Chapter 2). These are unbounded in size, with memory a text stream (see Chapter 2). These are unbounded in size, with memory
@ -22,19 +22,19 @@ against buffer overruns.
The present section of code provides convenient routines for creating, The present section of code provides convenient routines for creating,
duplicating, modifying and examining such strings. duplicating, modifying and examining such strings.
@h New strings. @ \section{New strings.}
Sometimes we want to make a new string in the sense of allocating more Sometimes we want to make a new string in the sense of allocating more
memory to hold it. These objects won't automatically be destroyed, so we memory to hold it. These objects won't automatically be destroyed, so we
shouldn't call these routines too casually. If we need a string just for shouldn't call these routines too casually. If we need a string just for
some space to play with for a short while, it's better to create one some space to play with for a short while, it's better to create one
with |TEMPORARY_TEXT| and then get rid of it with |DISCARD_TEXT|, macros with [[TEMPORARY_TEXT]] and then get rid of it with [[DISCARD_TEXT]], macros
defined in Chapter 2. defined in Chapter 2.
The capacity of these strings is unlimited in principle, and the number The capacity of these strings is unlimited in principle, and the number
here is just the size of the initial memory block, which is fastest to here is just the size of the initial memory block, which is fastest to
access. access.
= <<*>>=
text_stream *Str::new(void) { text_stream *Str::new(void) {
return Str::new_with_capacity(32); return Str::new_with_capacity(32);
} }
@ -50,10 +50,10 @@ void Str::dispose_of(text_stream *text) {
} }
@ Duplication of an existing string is complicated only by the issue that @ Duplication of an existing string is complicated only by the issue that
we want the duplicate always to be writeable, so that |NULL| can't be we want the duplicate always to be writeable, so that [[NULL]] can't be
duplicated as |NULL|. duplicated as [[NULL]].
= <<*>>=
text_stream *Str::duplicate(text_stream *E) { text_stream *Str::duplicate(text_stream *E) {
if (E == NULL) return Str::new(); if (E == NULL) return Str::new();
text_stream *S = CREATE(text_stream); text_stream *S = CREATE(text_stream);
@ -64,12 +64,12 @@ text_stream *Str::duplicate(text_stream *E) {
return NULL; return NULL;
} }
@h Converting from C strings. @ \section{Converting from C strings.}
Here we open text streams initially equal to the given C strings, and Here we open text streams initially equal to the given C strings, and
with the capacity of the initial block large enough to hold the whole with the capacity of the initial block large enough to hold the whole
thing plus a little extra, for efficiency's sake. thing plus a little extra, for efficiency's sake.
= <<*>>=
text_stream *Str::new_from_wide_string(const wchar_t *C_string) { text_stream *Str::new_from_wide_string(const wchar_t *C_string) {
text_stream *S = CREATE(text_stream); text_stream *S = CREATE(text_stream);
if (Streams::open_from_wide_string(S, C_string)) return S; if (Streams::open_from_wide_string(S, C_string)) return S;
@ -96,7 +96,7 @@ text_stream *Str::new_from_locale_string(const char *C_string) {
@ And sometimes we want to use an existing stream object: @ And sometimes we want to use an existing stream object:
= <<*>>=
text_stream *Str::from_wide_string(text_stream *S, wchar_t *c_string) { text_stream *Str::from_wide_string(text_stream *S, wchar_t *c_string) {
if (Streams::open_from_wide_string(S, c_string) == FALSE) return NULL; if (Streams::open_from_wide_string(S, c_string) == FALSE) return NULL;
return S; return S;
@ -107,9 +107,9 @@ text_stream *Str::from_locale_string(text_stream *S, char *c_string) {
return S; return S;
} }
@h Converting to C strings. @ \section{Converting to C strings.}
= <<*>>=
void Str::copy_to_ISO_string(char *C_string, text_stream *S, int buffer_size) { void Str::copy_to_ISO_string(char *C_string, text_stream *S, int buffer_size) {
Streams::write_as_ISO_string(C_string, S, buffer_size); Streams::write_as_ISO_string(C_string, S, buffer_size);
} }
@ -126,9 +126,9 @@ void Str::copy_to_locale_string(char *C_string, text_stream *S, int buffer_size)
Streams::write_as_locale_string(C_string, S, buffer_size); Streams::write_as_locale_string(C_string, S, buffer_size);
} }
@h Converting to integers. @ \section{Converting to integers.}
= <<*>>=
int Str::atoi(text_stream *S, int index) { int Str::atoi(text_stream *S, int index) {
char buffer[32]; char buffer[32];
int i = 0; int i = 0;
@ -139,15 +139,15 @@ int Str::atoi(text_stream *S, int index) {
return atoi(buffer); return atoi(buffer);
} }
@h Length. @ \section{Length.}
A puritan would return a |size_t| here, but I am not a puritan. A puritan would return a [[size_t]] here, but I am not a puritan.
= <<*>>=
int Str::len(text_stream *S) { int Str::len(text_stream *S) {
return Streams::get_position(S); return Streams::get_position(S);
} }
@h Position markers. @ \section{Position markers.}
A position marker is a lightweight way to refer to a particular position A position marker is a lightweight way to refer to a particular position
in a given string. Position 0 is before the first character; if, for in a given string. Position 0 is before the first character; if, for
example, the string contains the word "gazpacho", then position 8 represents example, the string contains the word "gazpacho", then position 8 represents
@ -155,7 +155,7 @@ the end of the string, after the "o". Negative positions are not allowed,
but positive ones well past the end of the string are legal. (Doing things but positive ones well past the end of the string are legal. (Doing things
at those positions may well not be, of course.) at those positions may well not be, of course.)
= <<*>>=
typedef struct string_position { typedef struct string_position {
struct text_stream *S; struct text_stream *S;
int index; int index;
@ -163,7 +163,7 @@ typedef struct string_position {
@ You can then find a position in a given string thus: @ You can then find a position in a given string thus:
= <<*>>=
string_position Str::start(text_stream *S) { string_position Str::start(text_stream *S) {
string_position P; P.S = S; P.index = 0; return P; string_position P; P.S = S; P.index = 0; return P;
} }
@ -180,7 +180,7 @@ string_position Str::end(text_stream *S) {
@ And you can step forwards or backwards: @ And you can step forwards or backwards:
= <<*>>=
string_position Str::back(string_position P) { string_position Str::back(string_position P) {
if (P.index > 0) P.index--; return P; if (P.index > 0) P.index--; return P;
} }
@ -209,18 +209,19 @@ int Str::index(string_position P) {
@ This leads to the following convenient loop macros: @ This leads to the following convenient loop macros:
@d LOOP_THROUGH_TEXT(P, ST) <<*>>=
#define LOOP_THROUGH_TEXT(P, ST)
for (string_position P = Str::start(ST); P.index < Str::len(P.S); P.index++) for (string_position P = Str::start(ST); P.index < Str::len(P.S); P.index++)
@d LOOP_BACKWARDS_THROUGH_TEXT(P, ST) #define LOOP_BACKWARDS_THROUGH_TEXT(P, ST)
for (string_position P = Str::back(Str::end(ST)); P.index >= 0; P.index--) for (string_position P = Str::back(Str::end(ST)); P.index >= 0; P.index--)
@h Character operations. @ \section{Character operations.}
How to get at individual characters, then, now that we can refer to positions: How to get at individual characters, then, now that we can refer to positions:
= <<*>>=
wchar_t Str::get(string_position P) { wchar_t Str::get(string_position P) {
if ((P.S == NULL) || (P.index < 0)) return 0; if ((P.S == NULL) [[]] (P.index < 0)) return 0;
return Streams::get_char_at_index(P.S, P.index); return Streams::get_char_at_index(P.S, P.index);
} }
@ -239,7 +240,7 @@ wchar_t Str::get_last_char(text_stream *S) {
return Str::get(Str::at(S, L-1)); return Str::get(Str::at(S, L-1));
} }
@ = <<*>>=
void Str::put(string_position P, wchar_t C) { void Str::put(string_position P, wchar_t C) {
if (P.index < 0) internal_error("wrote before start of string"); if (P.index < 0) internal_error("wrote before start of string");
if (P.S == NULL) internal_error("wrote to null stream"); if (P.S == NULL) internal_error("wrote to null stream");
@ -256,9 +257,9 @@ void Str::put_at(text_stream *S, int index, wchar_t C) {
Str::put(Str::at(S, index), C); Str::put(Str::at(S, index), C);
} }
@h Truncation. @ \section{Truncation.}
= <<*>>=
void Str::clear(text_stream *S) { void Str::clear(text_stream *S) {
Str::truncate(S, 0); Str::truncate(S, 0);
} }
@ -268,9 +269,9 @@ void Str::truncate(text_stream *S, int len) {
if (len < Str::len(S)) Str::put(Str::at(S, len), 0); if (len < Str::len(S)) Str::put(Str::at(S, len), 0);
} }
@h Indentation. @ \section{Indentation.}
= <<*>>=
int Str::remove_indentation(text_stream *S, int spaces_per_tab) { int Str::remove_indentation(text_stream *S, int spaces_per_tab) {
int spaces_in = 0, tab_stops_of_indentation = 0; int spaces_in = 0, tab_stops_of_indentation = 0;
while (Characters::is_space_or_tab(Str::get_first_char(S))) { while (Characters::is_space_or_tab(Str::get_first_char(S))) {
@ -307,9 +308,9 @@ void Str::rectify_indentation(text_stream *S, int spaces_per_tab) {
DISCARD_TEXT(tail) DISCARD_TEXT(tail)
} }
@h Copying. @ \section{Copying.}
= <<*>>=
void Str::concatenate(text_stream *S1, text_stream *S2) { void Str::concatenate(text_stream *S1, text_stream *S2) {
Streams::copy(S1, S2); Streams::copy(S1, S2);
} }
@ -330,7 +331,7 @@ void Str::copy_tail(text_stream *S1, text_stream *S2, int from) {
@ A subtly different operation is to set a string equal to a given C string: @ A subtly different operation is to set a string equal to a given C string:
= <<*>>=
void Str::copy_ISO_string(text_stream *S, char *C_string) { void Str::copy_ISO_string(text_stream *S, char *C_string) {
Str::clear(S); Str::clear(S);
Streams::write_ISO_string(S, C_string); Streams::write_ISO_string(S, C_string);
@ -346,10 +347,10 @@ void Str::copy_wide_string(text_stream *S, wchar_t *C_string) {
Streams::write_wide_string(S, C_string); Streams::write_wide_string(S, C_string);
} }
@h Comparisons. @ \section{Comparisons.}
We provide both case sensitive and insensitive versions. We provide both case sensitive and insensitive versions.
= <<*>>=
int Str::eq(text_stream *S1, text_stream *S2) { int Str::eq(text_stream *S1, text_stream *S2) {
if (Str::cmp(S1, S2) == 0) return TRUE; if (Str::cmp(S1, S2) == 0) return TRUE;
return FALSE; return FALSE;
@ -371,10 +372,10 @@ int Str::ne_insensitive(text_stream *S1, text_stream *S2) {
} }
@ These two routines produce a numerical string difference suitable for @ These two routines produce a numerical string difference suitable for
alphabetic sorting, like |strlen| in the C standard library. alphabetic sorting, like [[strlen]] in the C standard library.
This would be a more elegant implementation: This would be a more elegant implementation:
= (text as InC)
for (string_position P = Str::start(S1), Q = Str::start(S2); for (string_position P = Str::start(S1), Q = Str::start(S2);
(P.index < Str::len(S1)) && (Q.index < Str::len(S2)); (P.index < Str::len(S1)) && (Q.index < Str::len(S2));
P = Str::forward(P), Q = Str::forward(Q)) { P = Str::forward(P), Q = Str::forward(Q)) {
@ -382,11 +383,11 @@ This would be a more elegant implementation:
if (d != 0) return d; if (d != 0) return d;
} }
return Str::len(S1) - Str::len(S2); return Str::len(S1) - Str::len(S2);
=
But profiling shows that the following speeds up the Inform 7 compiler by But profiling shows that the following speeds up the Inform 7 compiler by
around 1%. around 1%.
= <<*>>=
int Str::cmp(text_stream *S1, text_stream *S2) { int Str::cmp(text_stream *S1, text_stream *S2) {
int L1 = Str::len(S1), L2 = Str::len(S2), M = L1; int L1 = Str::len(S1), L2 = Str::len(S2), M = L1;
if (L2 < M) M = L2; if (L2 < M) M = L2;
@ -408,16 +409,16 @@ int Str::cmp_insensitive(text_stream *S1, text_stream *S2) {
} }
@ It's sometimes useful to see whether two strings agree on their last @ It's sometimes useful to see whether two strings agree on their last
|N| characters, or their first |N|. For example, [[N]] characters, or their first [[N]]. For example,
= (text as code)
Str::suffix_eq(I"wayzgoose", I"snow goose", N) Str::suffix_eq(I"wayzgoose", I"snow goose", N)
=
will return |TRUE| for |N| equal to 0 to 5, and |FALSE| thereafter. will return [[TRUE]] for [[N]] equal to 0 to 5, and [[FALSE]] thereafter.
(The Oxford English Dictionary defines a "wayzgoose" as a holiday outing (The Oxford English Dictionary defines a "wayzgoose" as a holiday outing
for the staff of a publishing house.) for the staff of a publishing house.)
= <<*>>=
int Str::prefix_eq(text_stream *S1, text_stream *S2, int N) { int Str::prefix_eq(text_stream *S1, text_stream *S2, int N) {
int L1 = Str::len(S1), L2 = Str::len(S2); int L1 = Str::len(S1), L2 = Str::len(S2);
if ((N > L1) || (N > L2)) return FALSE; if ((N > L1) || (N > L2)) return FALSE;
@ -454,7 +455,7 @@ int Str::ends_with_wide_string(text_stream *S, wchar_t *suffix) {
return TRUE; return TRUE;
} }
@ = <<*>>=
int Str::eq_wide_string(text_stream *S1, wchar_t *S2) { int Str::eq_wide_string(text_stream *S1, wchar_t *S2) {
if (S2 == NULL) return (Str::len(S1) == 0)?TRUE:FALSE; if (S2 == NULL) return (Str::len(S1) == 0)?TRUE:FALSE;
if (Str::len(S1) == (int) wcslen(S2)) { if (Str::len(S1) == (int) wcslen(S2)) {
@ -481,9 +482,9 @@ int Str::ne_wide_string(text_stream *S1, wchar_t *S2) {
return (Str::eq_wide_string(S1, S2)?FALSE:TRUE); return (Str::eq_wide_string(S1, S2)?FALSE:TRUE);
} }
@h White space. @ \section{White space.}
= <<*>>=
int Str::is_whitespace(text_stream *S) { int Str::is_whitespace(text_stream *S) {
LOOP_THROUGH_TEXT(pos, S) LOOP_THROUGH_TEXT(pos, S)
if (Characters::is_space_or_tab(Str::get(pos)) == FALSE) if (Characters::is_space_or_tab(Str::get(pos)) == FALSE)
@ -493,7 +494,7 @@ int Str::is_whitespace(text_stream *S) {
@ This removes spaces and tabs from both ends: @ This removes spaces and tabs from both ends:
= <<*>>=
void Str::trim_white_space(text_stream *S) { void Str::trim_white_space(text_stream *S) {
int len = Str::len(S), i = 0, j = 0; int len = Str::len(S), i = 0, j = 0;
string_position F = Str::start(S); string_position F = Str::start(S);
@ -545,9 +546,9 @@ int Str::trim_all_white_space_at_end(text_stream *S) {
return shortened; return shortened;
} }
@h Deleting characters. @ \section{Deleting characters.}
= <<*>>=
void Str::delete_first_character(text_stream *S) { void Str::delete_first_character(text_stream *S) {
Str::delete_nth_character(S, 0); Str::delete_nth_character(S, 0);
} }
@ -572,9 +573,9 @@ void Str::delete_n_characters(text_stream *S, int n) {
} }
} }
@h Substrings. @ \section{Substrings.}
= <<*>>=
void Str::substr(OUTPUT_STREAM, string_position from, string_position to) { void Str::substr(OUTPUT_STREAM, string_position from, string_position to) {
if (from.S != to.S) internal_error("substr on two different strings"); if (from.S != to.S) internal_error("substr on two different strings");
for (int i = from.index; i < to.index; i++) for (int i = from.index; i < to.index; i++)
@ -632,29 +633,29 @@ int Str::includes_at(text_stream *line, int i, text_stream *pattern) {
return TRUE; return TRUE;
} }
@h Shim for literal storage. @ \section{Shim for literal storage.}
This is where all of those I-literals created by Inweb are stored at run-time. This is where all of those I-literals created by Inweb are stored at run-time.
Note that every instance of, say, |I"fish"| would return the same string, Note that every instance of, say, [[I"fish"]] would return the same string,
that is, the same |text_stream *| value. To prevent nasty accidents, this that is, the same [[text_stream *]] value. To prevent nasty accidents, this
is marked so that the stream value, "fish", cannot be modified at run-time. is marked so that the stream value, "fish", cannot be modified at run-time.
The dictionary look-up here would not be thread-safe, so it's protected by The dictionary look-up here would not be thread-safe, so it's protected by
a mutex. There's no real performance concern because the following routine a mutex. There's no real performance concern because the following routine
is run just once per I-literal in the source code, when the program starts up. is run just once per I-literal in the source code, when the program starts up.
= <<*>>=
dictionary *string_literals_dictionary = NULL; dictionary *string_literals_dictionary = NULL;
text_stream *Str::literal(wchar_t *wide_C_string) { text_stream *Str::literal(wchar_t *wide_C_string) {
text_stream *answer = NULL; text_stream *answer = NULL;
CREATE_MUTEX(mutex); CREATE_MUTEX(mutex);
LOCK_MUTEX(mutex); LOCK_MUTEX(mutex);
@<Look in dictionary of string literals@>; <<Look in dictionary of string literals>>;
UNLOCK_MUTEX(mutex); UNLOCK_MUTEX(mutex);
return answer; return answer;
} }
@<Look in dictionary of string literals@> = <<Look in dictionary of string literals>>=
if (string_literals_dictionary == NULL) if (string_literals_dictionary == NULL)
string_literals_dictionary = Dictionaries::new(100, TRUE); string_literals_dictionary = Dictionaries::new(100, TRUE);
answer = Dictionaries::get_text_literal(string_literals_dictionary, wide_C_string); answer = Dictionaries::get_text_literal(string_literals_dictionary, wide_C_string);

View file

@ -2,7 +2,7 @@
To read text files of whatever flavour, one line at a time. To read text files of whatever flavour, one line at a time.
@h Text files. @ \section{Text files.}
Foundation was written mainly to support command-line tools which, of their Foundation was written mainly to support command-line tools which, of their
nature, deal with a lot of text files: source code of programs, configuration nature, deal with a lot of text files: source code of programs, configuration
files, HTML, XML and so on. The main aim of this section is to provide a files, HTML, XML and so on. The main aim of this section is to provide a
@ -10,13 +10,13 @@ standard way to read in and iterate through lines of a text file.
First, though, here is a perhaps clumsy but effective way to test if a First, though, here is a perhaps clumsy but effective way to test if a
file actually exists on disc at a given filename. Note that under the C standard, file actually exists on disc at a given filename. Note that under the C standard,
it's entirely legal for |fopen| to behave more or less as it likes if asked to it's entirely legal for [[fopen]] to behave more or less as it likes if asked to
open a directory as a file; and on MacOS, it sometimes opens a directory exactly open a directory as a file; and on MacOS, it sometimes opens a directory exactly
as if it were an empty text file. The safest way to ensure that a directory is as if it were an empty text file. The safest way to ensure that a directory is
never confused with a file seems to be to try |opendir| on it, and the following never confused with a file seems to be to try [[opendir]] on it, and the following
does essentially that. does essentially that.
= <<*>>=
int TextFiles::exists(filename *F) { int TextFiles::exists(filename *F) {
TEMPORARY_TEXT(pn) TEMPORARY_TEXT(pn)
WRITE_TO(pn, "%f", F); WRITE_TO(pn, "%f", F);
@ -32,10 +32,10 @@ int TextFiles::exists(filename *F) {
return TRUE; return TRUE;
} }
@h Text file positions. @ \section{Text file positions.}
Here's how we record a position in a text file: Here's how we record a position in a text file:
= <<*>>=
typedef struct text_file_position { typedef struct text_file_position {
struct filename *text_file_filename; struct filename *text_file_filename;
FILE *handle_when_open; FILE *handle_when_open;
@ -48,7 +48,7 @@ typedef struct text_file_position {
@ For access: @ For access:
= <<*>>=
int TextFiles::get_line_count(text_file_position *tfp) { int TextFiles::get_line_count(text_file_position *tfp) {
if (tfp == NULL) return 0; if (tfp == NULL) return 0;
return tfp->line_count; return tfp->line_count;
@ -56,7 +56,7 @@ int TextFiles::get_line_count(text_file_position *tfp) {
@ And this is for a real nowhere man: @ And this is for a real nowhere man:
= <<*>>=
text_file_position TextFiles::nowhere(void) { text_file_position TextFiles::nowhere(void) {
text_file_position tfp; text_file_position tfp;
tfp.text_file_filename = NULL; tfp.text_file_filename = NULL;
@ -74,26 +74,26 @@ text_file_position TextFiles::at(filename *F, int line) {
return tfp; return tfp;
} }
@h Text file scanner. @ \section{Text file scanner.}
We read lines in, delimited by any of the standard line-ending characters, We read lines in, delimited by any of the standard line-ending characters,
and send them one at a time to a function called |iterator|. Throughout, and send them one at a time to a function called [[iterator]]. Throughout,
we preserve a pointer called |state| to some object being used by the we preserve a pointer called [[state]] to some object being used by the
client. client.
= <<*>>=
int TextFiles::read(filename *F, int escape_oddities, char *message, int serious, int TextFiles::read(filename *F, int escape_oddities, char *message, int serious,
void (iterator)(text_stream *, text_file_position *, void *), void (iterator)(text_stream *, text_file_position *, void *),
text_file_position *start_at, void *state) { text_file_position *start_at, void *state) {
text_file_position tfp; text_file_position tfp;
tfp.ufb = TextFiles::create_ufb(); tfp.ufb = TextFiles::create_ufb();
@<Open the text file@>; <<Open the text file>>;
@<Set the initial position, seeking it in the file if need be@>; <<Set the initial position, seeking it in the file if need be>>;
@<Read in lines and send them one by one to the iterator@>; <<Read in lines and send them one by one to the iterator>>;
fclose(tfp.handle_when_open); fclose(tfp.handle_when_open);
return tfp.line_count; return tfp.line_count;
} }
@<Open the text file@> = <<Open the text file>>=
tfp.handle_when_open = Filenames::fopen(F, "rb"); tfp.handle_when_open = Filenames::fopen(F, "rb");
if (tfp.handle_when_open == NULL) { if (tfp.handle_when_open == NULL) {
if (message == NULL) return 0; if (message == NULL) return 0;
@ -101,12 +101,12 @@ int TextFiles::read(filename *F, int escape_oddities, char *message, int serious
else { Errors::with_file(message, F); return 0; } else { Errors::with_file(message, F); return 0; }
} }
@ The ANSI definition of |ftell| and |fseek| says that, with text files, the @ The ANSI definition of [[ftell]] and [[fseek]] says that, with text files, the
only definite position value is 0 -- meaning the beginning of the file -- and only definite position value is 0 -- meaning the beginning of the file -- and
this is what we initialise |line_position| to. We must otherwise only write this is what we initialise [[line_position]] to. We must otherwise only write
values returned by |ftell| into this field. values returned by [[ftell]] into this field.
@<Set the initial position, seeking it in the file if need be@> = <<Set the initial position, seeking it in the file if need be>>=
if (start_at == NULL) { if (start_at == NULL) {
tfp.line_count = 1; tfp.line_count = 1;
tfp.line_position = 0; tfp.line_position = 0;
@ -122,48 +122,48 @@ values returned by |ftell| into this field.
tfp.actively_scanning = TRUE; tfp.actively_scanning = TRUE;
tfp.text_file_filename = F; tfp.text_file_filename = F;
@ We aim to get this right whether the lines are terminated by |0A|, |0D|, @ We aim to get this right whether the lines are terminated by [[0A]], [[0D]],
|0A 0D| or |0D 0A|. The final line is not required to be terminated. [[0A 0D]] or [[0D 0A]]. The final line is not required to be terminated.
@<Read in lines and send them one by one to the iterator@> = <<Read in lines and send them one by one to the iterator>>=
TEMPORARY_TEXT(line) TEMPORARY_TEXT(line)
int i = 0, c = ' '; int i = 0, c = ' ';
while ((c != EOF) && (tfp.actively_scanning)) { while ((c != EOF) && (tfp.actively_scanning)) {
c = TextFiles::utf8_fgetc(tfp.handle_when_open, NULL, escape_oddities, &tfp.ufb); c = TextFiles::utf8_fgetc(tfp.handle_when_open, NULL, escape_oddities, &tfp.ufb);
if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) { if ((c == EOF) [[| (c == '\x0a') |]] (c == '\x0d')) {
Str::put_at(line, i, 0); Str::put_at(line, i, 0);
if ((i > 0) || (c != tfp.skip_terminator)) { if ((i > 0) [[]] (c != tfp.skip_terminator)) {
@<Feed the completed line to the iterator routine@>; <<Feed the completed line to the iterator routine>>;
if (c == '\x0a') tfp.skip_terminator = '\x0d'; if (c == '\x0a') tfp.skip_terminator = '\x0d';
if (c == '\x0d') tfp.skip_terminator = '\x0a'; if (c == '\x0d') tfp.skip_terminator = '\x0a';
} else tfp.skip_terminator = 'X'; } else tfp.skip_terminator = 'X';
@<Update the text file position@>; <<Update the text file position>>;
i = 0; i = 0;
} else { } else {
Str::put_at(line, i++, (wchar_t) c); Str::put_at(line, i++, (wchar_t) c);
} }
} }
if ((i > 0) && (tfp.actively_scanning)) if ((i > 0) && (tfp.actively_scanning))
@<Feed the completed line to the iterator routine@>; <<Feed the completed line to the iterator routine>>;
DISCARD_TEXT(line) DISCARD_TEXT(line)
@ We update the line counter only when a line is actually sent: @ We update the line counter only when a line is actually sent:
@<Feed the completed line to the iterator routine@> = <<Feed the completed line to the iterator routine>>=
iterator(line, &tfp, state); iterator(line, &tfp, state);
tfp.line_count++; tfp.line_count++;
@ But we update the text file position after every apparent line terminator. @ But we update the text file position after every apparent line terminator.
This is because we might otherwise, on a Windows text file, end up with an This is because we might otherwise, on a Windows text file, end up with an
|ftell| position in between the |CR| and the |LF|; if we resume at that point, [[ftell]] position in between the [[CR]] and the [[LF]]; if we resume at that point,
later on, we'll then have an off-by-one error in the line numbering in the later on, we'll then have an off-by-one error in the line numbering in the
resumption as compared to during the original pass. resumption as compared to during the original pass.
Properly speaking, |ftell| returns a long |int|, not an |int|, but on a Properly speaking, [[ftell]] returns a long [[int]], not an [[int]], but on a
32-bit-or-more integer machine, this gives us room for files to run to 2GB. 32-bit-or-more integer machine, this gives us room for files to run to 2GB.
Text files seldom come that large. Text files seldom come that large.
@<Update the text file position@> = <<Update the text file position>>=
tfp.line_position = (int) (ftell(tfp.handle_when_open)); tfp.line_position = (int) (ftell(tfp.handle_when_open));
if (tfp.line_position == -1) { if (tfp.line_position == -1) {
if (serious) if (serious)
@ -172,15 +172,15 @@ Text files seldom come that large.
Errors::with_file("unable to determine position in file", F); Errors::with_file("unable to determine position in file", F);
} }
@ = <<*>>=
void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position *tfp) { void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position *tfp) {
Str::clear(OUT); Str::clear(OUT);
int i = 0, c = ' '; int i = 0, c = ' ';
while ((c != EOF) && (tfp->actively_scanning)) { while ((c != EOF) && (tfp->actively_scanning)) {
c = TextFiles::utf8_fgetc(tfp->handle_when_open, NULL, escape_oddities, &tfp->ufb); c = TextFiles::utf8_fgetc(tfp->handle_when_open, NULL, escape_oddities, &tfp->ufb);
if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) { if ((c == EOF) [[| (c == '\x0a') |]] (c == '\x0d')) {
Str::put_at(OUT, i, 0); Str::put_at(OUT, i, 0);
if ((i > 0) || (c != tfp->skip_terminator)) { if ((i > 0) [[]] (c != tfp->skip_terminator)) {
if (c == '\x0a') tfp->skip_terminator = '\x0d'; if (c == '\x0a') tfp->skip_terminator = '\x0d';
if (c == '\x0d') tfp->skip_terminator = '\x0a'; if (c == '\x0d') tfp->skip_terminator = '\x0a';
} else tfp->skip_terminator = 'X'; } else tfp->skip_terminator = 'X';
@ -196,35 +196,35 @@ void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position
@ The routine being iterated can indicate that it has had enough by @ The routine being iterated can indicate that it has had enough by
calling the following: calling the following:
= <<*>>=
void TextFiles::lose_interest(text_file_position *tfp) { void TextFiles::lose_interest(text_file_position *tfp) {
tfp->actively_scanning = FALSE; tfp->actively_scanning = FALSE;
} }
@h Reading UTF-8 files. @ \section{Reading UTF-8 files.}
The following routine reads a sequence of Unicode characters from a UTF-8 The following routine reads a sequence of Unicode characters from a UTF-8
encoded file, but returns them as a sequence of ISO Latin-1 characters, a encoded file, but returns them as a sequence of ISO Latin-1 characters, a
trick it can only pull off by escaping non-ISO characters. This is done by trick it can only pull off by escaping non-ISO characters. This is done by
taking character number |N| and feeding it out, one character at a time, as taking character number [[N]] and feeding it out, one character at a time, as
the text |[unicode N]|, writing the number in decimal. Only one UTF-8 the text [[[unicode N]]], writing the number in decimal. Only one UTF-8
file like this will be being read at a time, and the routine will be file like this will be being read at a time, and the routine will be
repeatedly called until |EOF| or a line division. repeatedly called until [[EOF]] or a line division.
Strictly speaking, we transmit not as ISO Latin-1 but as that subset of ISO Strictly speaking, we transmit not as ISO Latin-1 but as that subset of ISO
which have corresponding (different) codes in the ZSCII character set. This which have corresponding (different) codes in the ZSCII character set. This
excludes some typewriter symbols and a handful of letterforms, as we shall excludes some typewriter symbols and a handful of letterforms, as we shall
see. see.
There are two exceptions: |TextFiles::utf8_fgetc| can also return the usual C There are two exceptions: [[TextFiles::utf8_fgetc]] can also return the usual C
end-of-file pseudo-character |EOF|, and it can also return the Unicode BOM end-of-file pseudo-character [[EOF]], and it can also return the Unicode BOM
(byte-ordering marker) pseudo-character, which is legal at the start of a (byte-ordering marker) pseudo-character, which is legal at the start of a
file and which is automatically prepended by some text editors and file and which is automatically prepended by some text editors and
word-processors when they save a UTF-8 file (though in fact it is not word-processors when they save a UTF-8 file (though in fact it is not
required by the UTF-8 specification). Anyone calling |TextFiles::utf8_fgetc| must required by the UTF-8 specification). Anyone calling [[TextFiles::utf8_fgetc]] must
check the return value for |EOF| every time, and for |0xFEFF| every time we check the return value for [[EOF]] every time, and for [[0xFEFF]] every time we
might be at the start of the file being read. might be at the start of the file being read.
= <<*>>=
typedef struct unicode_file_buffer { typedef struct unicode_file_buffer {
char unicode_feed_buffer[32]; /* holds a single escape such as "[unicode 3106]" */ char unicode_feed_buffer[32]; /* holds a single escape such as "[unicode 3106]" */
int ufb_counter; /* position in the unicode feed buffer */ int ufb_counter; /* position in the unicode feed buffer */
@ -247,9 +247,9 @@ int TextFiles::utf8_fgetc(FILE *from, const char **or_from, int escape_oddities,
if (c == EOF) return c; /* ruling out EOF leaves a genuine byte from the file */ if (c == EOF) return c; /* ruling out EOF leaves a genuine byte from the file */
if (c<0x80) return c; /* in all other cases, a UTF-8 continuation sequence begins */ if (c<0x80) return c; /* in all other cases, a UTF-8 continuation sequence begins */
@<Unpack one to five continuation bytes to obtain the Unicode character code@>; <<Unpack one to five continuation bytes to obtain the Unicode character code>>;
@<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals@>; <<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals>>;
if (escape_oddities) @<Return Unicode fancy equivalents as simpler literals@>; if (escape_oddities) <<Return Unicode fancy equivalents as simpler literals>>;
if (c == 0xFEFF) return c; /* the Unicode BOM non-character */ if (c == 0xFEFF) return c; /* the Unicode BOM non-character */
@ -268,7 +268,7 @@ fatal error (which is pretty well the only alternative here). The user
is likely to see problem messages later on which arise from the question is likely to see problem messages later on which arise from the question
marks, and that will have to do. marks, and that will have to do.
@<Unpack one to five continuation bytes to obtain the Unicode character code@> = <<Unpack one to five continuation bytes to obtain the Unicode character code>>=
if (c<0xC0) return '?'; /* malformed UTF-8 */ if (c<0xC0) return '?'; /* malformed UTF-8 */
if (c<0xE0) { c = c & 0x1f; conts = 1; } if (c<0xE0) { c = c & 0x1f; conts = 1; }
else if (c<0xF0) { c = c & 0xf; conts = 2; } else if (c<0xF0) { c = c & 0xf; conts = 2; }
@ -293,12 +293,12 @@ fiction offerings. Had they been collaborating with J. R. R. Tolkien
rather than Douglas Adams, they might have filled this gap. As it was, rather than Douglas Adams, they might have filled this gap. As it was,
"eth" never occurred in any of their works.) "eth" never occurred in any of their works.)
We let the multiplication sign |0xd7| through even though ZSCII doesn't We let the multiplication sign [[0xd7]] through even though ZSCII doesn't
support it, but convert it to an "x": this is so that we can parse numbers support it, but convert it to an "x": this is so that we can parse numbers
in scientific notation. in scientific notation.
@<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals@> = <<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals>>=
if ((c == 0xa1) || (c == 0xa3) || (c == 0xbf)) return c; /* pound sign, inverted ! and ? */ if ((c == 0xa1) [[| (c == 0xa3) |]] (c == 0xbf)) return c; /* pound sign, inverted ! and ? */
if (c == 0xd7) return 'x'; /* convert multiplication sign to lower case "x" */ if (c == 0xd7) return 'x'; /* convert multiplication sign to lower case "x" */
if ((c >= 0xc0) && (c <= 0xff)) { /* accented West European letters, but... */ if ((c >= 0xc0) && (c <= 0xff)) { /* accented West European letters, but... */
if ((c != 0xd0) && (c != 0xf0) && /* not Icelandic eths */ if ((c != 0xd0) && (c != 0xf0) && /* not Icelandic eths */
@ -312,7 +312,7 @@ where we would normally expect hyphens and ordinary spaces: this is intended
for the benefit of users with helpful word-processors which autocorrect for the benefit of users with helpful word-processors which autocorrect
hyphens into em-rules when they are flanked by spaces, and so on. hyphens into em-rules when they are flanked by spaces, and so on.
@<Return Unicode fancy equivalents as simpler literals@> = <<Return Unicode fancy equivalents as simpler literals>>=
if (c == 0x85) return '\x0d'; /* NEL, or "next line" */ if (c == 0x85) return '\x0d'; /* NEL, or "next line" */
if (c == 0xa0) return ' '; /* non-breaking space */ if (c == 0xa0) return ' '; /* non-breaking space */
if ((c >= 0x2000) && (c <= 0x200a)) return ' '; /* space variants */ if ((c >= 0x2000) && (c <= 0x200a)) return ' '; /* space variants */

View file

@ -2,7 +2,7 @@
To examine heads and tails of text, to see how it may inflect. To examine heads and tails of text, to see how it may inflect.
@h Tries. @ \section{Tries.}
The standard data structure for searches through possible prefixes or The standard data structure for searches through possible prefixes or
suffixes is a "trie". The term goes back to Edward Fredkin in 1961; suffixes is a "trie". The term goes back to Edward Fredkin in 1961;
some pronounce it "try" and some "tree", and either would be a fair some pronounce it "try" and some "tree", and either would be a fair
@ -17,28 +17,29 @@ front of a text, whereas an end head represents matching from the back.
(b) "Choices". A choice node has a given match character, say an "f", and (b) "Choices". A choice node has a given match character, say an "f", and
represents which node to go to next if this is the current character in the represents which node to go to next if this is the current character in the
text. It must either be a valid Unicode character or |TRIE_ANYTHING|, which text. It must either be a valid Unicode character or [[TRIE_ANYTHING]], which
is a wildcard representing "any text of any length here". Since a choice is a wildcard representing "any text of any length here". Since a choice
must always lead somewhere, |on_success| must point to another node. must always lead somewhere, [[on_success]] must point to another node.
There can be any number of choices at a given position, so choice nodes There can be any number of choices at a given position, so choice nodes
are always organised in linked lists joined by |next|. are always organised in linked lists joined by [[next]].
(c) "Terminals", always leaves, which have match character set to the (c) "Terminals", always leaves, which have match character set to the
impossible value |TRIE_STOP|, and for which |match_outcome| is non-null; thus, impossible value [[TRIE_STOP]], and for which [[match_outcome]] is non-null; thus,
different terminal nodes can result in different outcomes if they are ever different terminal nodes can result in different outcomes if they are ever
reached at the end of a successful scan. A terminal node is always the only item reached at the end of a successful scan. A terminal node is always the only item
in a list. in a list.
@d TRIE_START -1 /* head: the root of a trie parsing forwards from the start */ <<*>>=
@d TRIE_END -2 /* head: the root of a trie parsing backwards from the end */ #define TRIE_START -1 /* head: the root of a trie parsing forwards from the start */
@d TRIE_ANYTHING 10003 /* choice: match any text here */ #define TRIE_END -2 /* head: the root of a trie parsing backwards from the end */
@d TRIE_ANY_GROUP 10001 /* choice: match any character from this group */ #define TRIE_ANYTHING 10003 /* choice: match any text here */
@d TRIE_NOT_GROUP 10002 /* choice: match any character not in this group */ #define TRIE_ANY_GROUP 10001 /* choice: match any character from this group */
@d TRIE_STOP -3 /* terminal: here's the outcome */ #define TRIE_NOT_GROUP 10002 /* choice: match any character not in this group */
#define TRIE_STOP -3 /* terminal: here's the outcome */
@d MAX_TRIE_GROUP_SIZE 26 /* size of the allowable groups of characters */ #define MAX_TRIE_GROUP_SIZE 26 /* size of the allowable groups of characters */
= <<*>>=
typedef struct match_trie { typedef struct match_trie {
int match_character; /* or one of the special cases above */ int match_character; /* or one of the special cases above */
wchar_t group_characters[MAX_TRIE_GROUP_SIZE+1]; wchar_t group_characters[MAX_TRIE_GROUP_SIZE+1];
@ -48,40 +49,41 @@ typedef struct match_trie {
} match_trie; } match_trie;
@ We have just one routine for extending and scanning the trie: it either @ We have just one routine for extending and scanning the trie: it either
tries to find whether a text |p| leads to any outcome in the existing trie, tries to find whether a text [[p]] leads to any outcome in the existing trie,
or else forcibly extends the existing trie to ensure that it does. or else forcibly extends the existing trie to ensure that it does.
It might look as if calling |Tries::search| always returns |add_outcome| when It might look as if calling [[Tries::search]] always returns [[add_outcome]] when
this is set, but this isn't true: if the trie already contains a node this is set, but this isn't true: if the trie already contains a node
representing how to deal with |p|, we get whatever outcome is already representing how to deal with [[p]], we get whatever outcome is already
established. established.
There are two motions to keep track of: our progress through the text |p| There are two motions to keep track of: our progress through the text [[p]]
being scanned, and our progress through the trie which tells us how to scan it. being scanned, and our progress through the trie which tells us how to scan it.
We scan the text either forwards or backwards, starting with the first or We scan the text either forwards or backwards, starting with the first or
last character and then working through, finishing with a 0 terminator. last character and then working through, finishing with a 0 terminator.
(This is true even if working backwards: we pretend the character stored (This is true even if working backwards: we pretend the character stored
before the text began is 0.) |i| represents the index of our current position before the text began is 0.) [[i]] represents the index of our current position
in |p|, and runs either from 0 up to |N| or from |N-1| down to |-1|, in [[p]], and runs either from 0 up to [[N]] or from [[N-1]] down to [[-1]],
where |N| is the number of characters in |p|. where [[N]] is the number of characters in [[p]].
We scan the trie using a pair of pointers. |prev| is the last node we We scan the trie using a pair of pointers. [[prev]] is the last node we
successfully left, and |pos| is one we are currently at, which can be successfully left, and [[pos]] is one we are currently at, which can be
either a terminal node or a choice node (in which case it's the head of either a terminal node or a choice node (in which case it's the head of
a linked list of such nodes). a linked list of such nodes).
@d MAX_TRIE_REWIND 10 /* that should be far, far more rewinding than necessary */ <<*>>=
#define MAX_TRIE_REWIND 10 /* that should be far, far more rewinding than necessary */
= <<*>>=
wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) { wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
if (T == NULL) internal_error("no trie to search"); if (T == NULL) internal_error("no trie to search");
int start, endpoint, delta; int start, endpoint, delta;
@<Look at the root node of the trie, setting up the scan accordingly@>; <<Look at the root node of the trie, setting up the scan accordingly>>;
match_trie *prev = NULL, *pos = T; match_trie *prev = NULL, *pos = T;
@<Accept the current node of the trie@>; <<Accept the current node of the trie>>;
int rewind_sp = 0; int rewind_sp = 0;
int rewind_points[MAX_TRIE_REWIND]; int rewind_points[MAX_TRIE_REWIND];
@ -113,7 +115,7 @@ wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
if (c == '*') endpoint -= delta; if (c == '*') endpoint -= delta;
RewindHere: RewindHere:
@<Look through the possible exits from this position and move on if any match@>; <<Look through the possible exits from this position and move on if any match>>;
if (add_outcome == NULL) { if (add_outcome == NULL) {
if (rewind_sp > 0) { if (rewind_sp > 0) {
i = rewind_points[rewind_sp-1]; i = rewind_points[rewind_sp-1];
@ -124,19 +126,19 @@ wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
} }
return NULL; /* failure! */ return NULL; /* failure! */
} }
@<We have run out of trie and must create a new exit to continue@>; <<We have run out of trie and must create a new exit to continue>>;
} }
if ((pos) && (pos->match_character == TRIE_ANYTHING)) @<Accept the current node of the trie@>; if ((pos) && (pos->match_character == TRIE_ANYTHING)) <<Accept the current node of the trie>>;
if ((pos) && (pos->match_outcome)) return pos->match_outcome; /* success! */ if ((pos) && (pos->match_outcome)) return pos->match_outcome; /* success! */
if (add_outcome == NULL) return NULL; /* failure! */ if (add_outcome == NULL) return NULL; /* failure! */
if (pos == NULL) if (pos == NULL)
@<We failed by running out of trie, so we must add a terminal node to make this string acceptable@> <<We failed by running out of trie, so we must add a terminal node to make this string acceptable>>
else else
@<We failed by finishing at a non-terminal node, so we must add an outcome@>; <<We failed by finishing at a non-terminal node, so we must add an outcome>>;
} }
@<Look at the root node of the trie, setting up the scan accordingly@> = <<Look at the root node of the trie, setting up the scan accordingly>>=
start = 0; endpoint = Str::len(p); delta = 1; start = 0; endpoint = Str::len(p); delta = 1;
if (T->match_character == TRIE_END) { start = Str::len(p)-1; endpoint = -1; delta = -1; } if (T->match_character == TRIE_END) { start = Str::len(p)-1; endpoint = -1; delta = -1; }
@ -146,10 +148,10 @@ this tends to make commonly used exits migrate upwards and rarities downwards.
But we aren't going to search these tries anything like intensively enough But we aren't going to search these tries anything like intensively enough
to make it worth the trouble. to make it worth the trouble.
(The following cannot be a |while| loop since C does not allow us to |break| (The following cannot be a [[while]] loop since C does not allow us to [[break]]
or |continue| out of an outer loop from an inner one.) or [[continue]] out of an outer loop from an inner one.)
@<Look through the possible exits from this position and move on if any match@> = <<Look through the possible exits from this position and move on if any match>>=
int ambig = 0, unambig = 0; int ambig = 0, unambig = 0;
match_trie *point; match_trie *point;
for (point = pos; point; point = point->next) for (point = pos; point; point = point->next)
@ -158,7 +160,7 @@ or |continue| out of an outer loop from an inner one.)
FauxWhileLoop: FauxWhileLoop:
if (pos) { if (pos) {
if ((add_outcome == NULL) || (Tries::is_ambiguous(pos) == FALSE)) if ((add_outcome == NULL) [[]] (Tries::is_ambiguous(pos) == FALSE))
if (Tries::matches(pos, c)) { if (Tries::matches(pos, c)) {
if (pos->match_character == TRIE_ANYTHING) break; if (pos->match_character == TRIE_ANYTHING) break;
if ((add_outcome == NULL) && (ambig > 0) && (ambig+unambig > 1) if ((add_outcome == NULL) && (ambig > 0) && (ambig+unambig > 1)
@ -168,14 +170,14 @@ or |continue| out of an outer loop from an inner one.)
rewind_prev_positions[rewind_sp] = prev; rewind_prev_positions[rewind_sp] = prev;
rewind_sp++; rewind_sp++;
} }
@<Accept the current node of the trie@>; <<Accept the current node of the trie>>;
continue; continue;
} }
pos = pos->next; pos = pos->next;
goto FauxWhileLoop; goto FauxWhileLoop;
} }
@<We have run out of trie and must create a new exit to continue@> = <<We have run out of trie and must create a new exit to continue>>=
match_trie *new_pos = NULL; match_trie *new_pos = NULL;
if (g > 0) { if (g > 0) {
int nt = TRIE_ANY_GROUP; int nt = TRIE_ANY_GROUP;
@ -211,29 +213,29 @@ or |continue| out of an outer loop from an inner one.)
} }
pos = new_pos; pos = new_pos;
@<Accept the current node of the trie@>; continue; <<Accept the current node of the trie>>; continue;
@<Accept the current node of the trie@> = <<Accept the current node of the trie>>=
if (pos == NULL) internal_error("trie invariant broken"); if (pos == NULL) internal_error("trie invariant broken");
prev = pos; pos = prev->on_success; prev = pos; pos = prev->on_success;
@ If |pos| is |NULL| then it follows that |prev->on_success| is |NULL|, since @ If [[pos]] is [[NULL]] then it follows that [[prev->on_success]] is [[NULL]], since
this is how |pos| was calculated; so to add a new terminal node we simply add this is how [[pos]] was calculated; so to add a new terminal node we simply add
it there. it there.
@<We failed by running out of trie, so we must add a terminal node to make this string acceptable@> = <<We failed by running out of trie, so we must add a terminal node to make this string acceptable>>=
prev->on_success = Tries::new(TRIE_STOP); prev->on_success = Tries::new(TRIE_STOP);
prev->on_success->match_outcome = add_outcome; prev->on_success->match_outcome = add_outcome;
return add_outcome; return add_outcome;
@<We failed by finishing at a non-terminal node, so we must add an outcome@> = <<We failed by finishing at a non-terminal node, so we must add an outcome>>=
prev->on_success = Tries::new(TRIE_STOP); prev->on_success = Tries::new(TRIE_STOP);
prev->on_success->match_outcome = add_outcome; prev->on_success->match_outcome = add_outcome;
return add_outcome; return add_outcome;
@ Single nodes are matched thus: @ Single nodes are matched thus:
= <<*>>=
int Tries::matches(match_trie *pos, int c) { int Tries::matches(match_trie *pos, int c) {
if (pos->match_character == TRIE_ANYTHING) return TRUE; if (pos->match_character == TRIE_ANYTHING) return TRUE;
if (pos->match_character == TRIE_ANY_GROUP) { if (pos->match_character == TRIE_ANY_GROUP) {
@ -263,7 +265,7 @@ int Tries::is_ambiguous(match_trie *pos) {
@ Where: @ Where:
= <<*>>=
match_trie *Tries::new(int mc) { match_trie *Tries::new(int mc) {
match_trie *T = CREATE(match_trie); match_trie *T = CREATE(match_trie);
T->match_character = mc; T->match_character = mc;
@ -273,14 +275,14 @@ match_trie *Tries::new(int mc) {
return T; return T;
} }
@h Avinues. @ \section{Avinues.}
A trie is only a limited form of finite state machine. We're not going to need A trie is only a limited form of finite state machine. We're not going to need
the whole power of these, but we do find it useful to chain a series of tries the whole power of these, but we do find it useful to chain a series of tries
together. The idea is to scan against one trie, then, if there's no result, together. The idea is to scan against one trie, then, if there's no result,
start again with the next, and so on. Inform therefore often matches text start again with the next, and so on. Inform therefore often matches text
against a linked list of tries: we'll call that an "avinue". against a linked list of tries: we'll call that an "avinue".
= <<*>>=
typedef struct match_avinue { typedef struct match_avinue {
struct match_trie *the_trie; struct match_trie *the_trie;
struct match_avinue *next; struct match_avinue *next;
@ -289,7 +291,7 @@ typedef struct match_avinue {
@ An avinue starts out with a single trie, which itself has just a single @ An avinue starts out with a single trie, which itself has just a single
head node (of either sort). head node (of either sort).
= <<*>>=
match_avinue *Tries::new_avinue(int from_start) { match_avinue *Tries::new_avinue(int from_start) {
match_avinue *A = CREATE(match_avinue); match_avinue *A = CREATE(match_avinue);
A->next = NULL; A->next = NULL;
@ -298,14 +300,14 @@ match_avinue *Tries::new_avinue(int from_start) {
} }
void Tries::add_to_avinue(match_avinue *mt, text_stream *from, wchar_t *to) { void Tries::add_to_avinue(match_avinue *mt, text_stream *from, wchar_t *to) {
if ((mt == NULL) || (mt->the_trie == NULL)) internal_error("null trie"); if ((mt == NULL) [[]] (mt->the_trie == NULL)) internal_error("null trie");
Tries::search(mt->the_trie, from, to); Tries::search(mt->the_trie, from, to);
} }
@ The following duplicates an avinue, pointing to the same sequence of @ The following duplicates an avinue, pointing to the same sequence of
tries. tries.
= <<*>>=
match_avinue *Tries::duplicate_avinue(match_avinue *A) { match_avinue *Tries::duplicate_avinue(match_avinue *A) {
match_avinue *F = NULL, *FL = NULL; match_avinue *F = NULL, *FL = NULL;
while (A) { while (A) {
@ -323,7 +325,7 @@ match_avinue *Tries::duplicate_avinue(match_avinue *A) {
@ As noted above, searching an avinue is a matter of searching with each @ As noted above, searching an avinue is a matter of searching with each
trie in turn until one matches (if it does). trie in turn until one matches (if it does).
= <<*>>=
wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) { wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
wchar_t *result = NULL; wchar_t *result = NULL;
while ((T) && (result == NULL)) { while ((T) && (result == NULL)) {
@ -333,9 +335,9 @@ wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
return result; return result;
} }
@h Logging. @ \section{Logging.}
= <<*>>=
void Tries::log_avinue(OUTPUT_STREAM, void *vA) { void Tries::log_avinue(OUTPUT_STREAM, void *vA) {
match_avinue *A = (match_avinue *) vA; match_avinue *A = (match_avinue *) vA;
WRITE("Avinue:\n"); INDENT; WRITE("Avinue:\n"); INDENT;

View file

@ -2,27 +2,27 @@
A minimal library for handling wide C strings. A minimal library for handling wide C strings.
@ By "wide string", we mean an array of |wchar_t|. A pointer to this type @ By "wide string", we mean an array of [[wchar_t]]. A pointer to this type
is what is returned by an L-literal in ANSI C, such as |L"look, I'm wide"|. is what is returned by an L-literal in ANSI C, such as [[L"look, I'm wide"]].
A wide string is essentially a C string but with characters stored in full A wide string is essentially a C string but with characters stored in full
words instead of bytes. The character values should be Unicode code points. words instead of bytes. The character values should be Unicode code points.
We will do as little as possible with wide strings, and the following We will do as little as possible with wide strings, and the following
wrappers simply abstract the standard C library's handling. wrappers simply abstract the standard C library's handling.
= <<*>>=
int Wide::len(wchar_t *p) { int Wide::len(wchar_t *p) {
return (int) wcslen(p); return (int) wcslen(p);
} }
@ On the rare occasions when we need to sort alphabetically we'll also call: @ On the rare occasions when we need to sort alphabetically we'll also call:
= <<*>>=
int Wide::cmp(wchar_t *A, wchar_t *B) { int Wide::cmp(wchar_t *A, wchar_t *B) {
return wcscmp(A, B); return wcscmp(A, B);
} }
@ = <<*>>=
int Wide::atoi(wchar_t *p) { int Wide::atoi(wchar_t *p) {
return (int) wcstol(p, NULL, 10); return (int) wcstol(p, NULL, 10);
} }