foundation-module: Chapter 4: Nowebify.
This commit is contained in:
parent
dce27ab9d8
commit
f7bbaba2b4
9 changed files with 570 additions and 560 deletions
|
@ -2,8 +2,8 @@
|
||||||
|
|
||||||
A minimal library for handling C-style strings.
|
A minimal library for handling C-style strings.
|
||||||
|
|
||||||
@ Programs using Foundation store text in |text_stream| structures almost all
|
@ Programs using Foundation store text in [[text_stream]] structures almost all
|
||||||
of the time, but old-style, null-terminated |char *| array strings are
|
of the time, but old-style, null-terminated [[char *]] array strings are
|
||||||
still occasionally needed.
|
still occasionally needed.
|
||||||
|
|
||||||
We need to handle C strings long enough to contain any plausible filename, and
|
We need to handle C strings long enough to contain any plausible filename, and
|
||||||
|
@ -11,17 +11,18 @@ any run of a dozen or so lines of code; but we have no real need to handle
|
||||||
strings of unlimited length, nor to be parsimonious with memory.
|
strings of unlimited length, nor to be parsimonious with memory.
|
||||||
|
|
||||||
The following defines a type for a string long enough for our purposes.
|
The following defines a type for a string long enough for our purposes.
|
||||||
It should be at least as long as the constant sometimes called |PATH_MAX|,
|
It should be at least as long as the constant sometimes called [[PATH_MAX]],
|
||||||
the maximum length of a pathname, which is 1024 on Mac OS X.
|
the maximum length of a pathname, which is 1024 on Mac OS X.
|
||||||
|
|
||||||
@d MAX_STRING_LENGTH 8*1024
|
<<*>>=
|
||||||
|
#define MAX_STRING_LENGTH 8*1024
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef char string[MAX_STRING_LENGTH+1];
|
typedef char string[MAX_STRING_LENGTH+1];
|
||||||
|
|
||||||
@ Occasionally we need access to the real, unbounded strlen:
|
@ Occasionally we need access to the real, unbounded strlen:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int CStrings::strlen_unbounded(const char *p) {
|
int CStrings::strlen_unbounded(const char *p) {
|
||||||
return (int) strlen(p);
|
return (int) strlen(p);
|
||||||
}
|
}
|
||||||
|
@ -31,16 +32,16 @@ an attempt to continue execution after a string overflow might conceivably
|
||||||
result in a malformatted shell command being passed to the operating system,
|
result in a malformatted shell command being passed to the operating system,
|
||||||
which we cannot risk.
|
which we cannot risk.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int CStrings::check_len(int n) {
|
int CStrings::check_len(int n) {
|
||||||
if ((n > MAX_STRING_LENGTH) || (n < 0)) Errors::fatal("String overflow\n");
|
if ((n > MAX_STRING_LENGTH) [[]] (n < 0)) Errors::fatal("String overflow\n");
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ The following is then protected from reading out of range if given a
|
@ The following is then protected from reading out of range if given a
|
||||||
non-terminated string, though this should never actually happen.
|
non-terminated string, though this should never actually happen.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int CStrings::len(char *str) {
|
int CStrings::len(char *str) {
|
||||||
for (int i=0; i<=MAX_STRING_LENGTH; i++)
|
for (int i=0; i<=MAX_STRING_LENGTH; i++)
|
||||||
if (str[i] == 0) return i;
|
if (str[i] == 0) return i;
|
||||||
|
@ -48,10 +49,10 @@ int CStrings::len(char *str) {
|
||||||
return MAX_STRING_LENGTH;
|
return MAX_STRING_LENGTH;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ We then have a replacement for |strcpy|, identical except that it's
|
@ We then have a replacement for [[strcpy]], identical except that it's
|
||||||
bounds-checked:
|
bounds-checked:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void CStrings::copy(char *to, char *from) {
|
void CStrings::copy(char *to, char *from) {
|
||||||
CStrings::check_len(CStrings::len(from));
|
CStrings::check_len(CStrings::len(from));
|
||||||
int i;
|
int i;
|
||||||
|
@ -59,9 +60,9 @@ void CStrings::copy(char *to, char *from) {
|
||||||
to[i] = 0;
|
to[i] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ String comparisons will be done with the following, not |strcmp| directly:
|
@ String comparisons will be done with the following, not [[strcmp]] directly:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int CStrings::eq(char *A, char *B) {
|
int CStrings::eq(char *A, char *B) {
|
||||||
return (CStrings::cmp(A, B) == 0)?TRUE:FALSE;
|
return (CStrings::cmp(A, B) == 0)?TRUE:FALSE;
|
||||||
}
|
}
|
||||||
|
@ -72,20 +73,20 @@ int CStrings::ne(char *A, char *B) {
|
||||||
|
|
||||||
@ On the rare occasions when we need to sort alphabetically we'll also call:
|
@ On the rare occasions when we need to sort alphabetically we'll also call:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int CStrings::cmp(char *A, char *B) {
|
int CStrings::cmp(char *A, char *B) {
|
||||||
if ((A == NULL) || (A[0] == 0)) {
|
if ((A == NULL) [[]] (A[0] == 0)) {
|
||||||
if ((B == NULL) || (B[0] == 0)) return 0;
|
if ((B == NULL) [[]] (B[0] == 0)) return 0;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if ((B == NULL) || (B[0] == 0)) return 1;
|
if ((B == NULL) [[]] (B[0] == 0)) return 1;
|
||||||
return strcmp(A, B);
|
return strcmp(A, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ And the following is needed to deal with extension filenames on platforms
|
@ And the following is needed to deal with extension filenames on platforms
|
||||||
whose locale is encoded as UTF-8.
|
whose locale is encoded as UTF-8.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) {
|
void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) {
|
||||||
int i, j;
|
int i, j;
|
||||||
for (i=0, j=0; p[i]; i++) {
|
for (i=0, j=0; p[i]; i++) {
|
||||||
|
@ -100,33 +101,33 @@ void CStrings::transcode_ISO_string_to_UTF8(char *p, char *dest) {
|
||||||
dest[j] = 0;
|
dest[j] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ I dislike to use |strncpy| because, and for some reason this surprises
|
@ I dislike to use [[strncpy]] because, and for some reason this surprises
|
||||||
me every time, it truncates but fails to write a null termination character
|
me every time, it truncates but fails to write a null termination character
|
||||||
if the string to be copied is larger than the buffer to write to: the
|
if the string to be copied is larger than the buffer to write to: the
|
||||||
result is therefore not a well-formed string and we have to fix matters by
|
result is therefore not a well-formed string and we have to fix matters by
|
||||||
hand. This I think makes for opaque code. So:
|
hand. This I think makes for opaque code. So:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void CStrings::truncated_strcpy(char *to, char *from, int max) {
|
void CStrings::truncated_strcpy(char *to, char *from, int max) {
|
||||||
int i;
|
int i;
|
||||||
for (i=0; ((from[i]) && (i<max-1)); i++) to[i] = from[i];
|
for (i=0; ((from[i]) && (i<max-1)); i++) to[i] = from[i];
|
||||||
to[i] = 0;
|
to[i] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Text storage.
|
@ \section{Text storage.}
|
||||||
The following is convenient for parking a read-only string of text somewhere
|
The following is convenient for parking a read-only string of text somewhere
|
||||||
safe in memory. Since the length can't be extended, it's usually unsafe
|
safe in memory. Since the length can't be extended, it's usually unsafe
|
||||||
to write to the result. (Inform tools very seldom use this, because C strings
|
to write to the result. (Inform tools very seldom use this, because C strings
|
||||||
are almost always best avoided.)
|
are almost always best avoided.)
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct string_storage_area {
|
typedef struct string_storage_area {
|
||||||
char *storage_at;
|
char *storage_at;
|
||||||
int capacity;
|
int capacity;
|
||||||
CLASS_DEFINITION
|
CLASS_DEFINITION
|
||||||
} string_storage_area;
|
} string_storage_area;
|
||||||
|
|
||||||
@ =
|
<<*>>=
|
||||||
char *CStrings::park_string(char *from) {
|
char *CStrings::park_string(char *from) {
|
||||||
string_storage_area *ssa = CREATE(string_storage_area);
|
string_storage_area *ssa = CREATE(string_storage_area);
|
||||||
ssa->capacity = (int) CStrings::strlen_unbounded(from) + 1;
|
ssa->capacity = (int) CStrings::strlen_unbounded(from) + 1;
|
||||||
|
@ -137,7 +138,7 @@ char *CStrings::park_string(char *from) {
|
||||||
|
|
||||||
@ And here we free any SSAs needed in the course of the run.
|
@ And here we free any SSAs needed in the course of the run.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void CStrings::free_ssas(void) {
|
void CStrings::free_ssas(void) {
|
||||||
string_storage_area *ssa;
|
string_storage_area *ssa;
|
||||||
LOOP_OVER(ssa, string_storage_area)
|
LOOP_OVER(ssa, string_storage_area)
|
|
@ -2,9 +2,9 @@
|
||||||
|
|
||||||
Individual characters.
|
Individual characters.
|
||||||
|
|
||||||
@h Character classes.
|
@ \section{Character classes.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
wchar_t Characters::tolower(wchar_t c) {
|
wchar_t Characters::tolower(wchar_t c) {
|
||||||
return (wchar_t) tolower((int) c);
|
return (wchar_t) tolower((int) c);
|
||||||
}
|
}
|
||||||
|
@ -31,19 +31,19 @@ int Characters::iscntrl(wchar_t c) {
|
||||||
return ((i >= 0) && (i < 32));
|
return ((i >= 0) && (i < 32));
|
||||||
}
|
}
|
||||||
int Characters::vowel(wchar_t c) {
|
int Characters::vowel(wchar_t c) {
|
||||||
if ((c == 'a') || (c == 'e') || (c == 'i') || (c == 'o') || (c == 'u')) return TRUE;
|
if ((c == 'a') [[| (c == 'e') || (c == 'i') || (c == 'o') |]] (c == 'u')) return TRUE;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ White space classes:
|
@ White space classes:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Characters::is_space_or_tab(int c) {
|
int Characters::is_space_or_tab(int c) {
|
||||||
if ((c == ' ') || (c == '\t')) return TRUE;
|
if ((c == ' ') [[]] (c == '\t')) return TRUE;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
int Characters::is_whitespace(int c) {
|
int Characters::is_whitespace(int c) {
|
||||||
if ((c == ' ') || (c == '\t') || (c == '\n')) return TRUE;
|
if ((c == ' ') [[| (c == '\t') |]] (c == '\n')) return TRUE;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -51,18 +51,18 @@ int Characters::is_whitespace(int c) {
|
||||||
sense of the Treaty of Babel rules on leading and trailing spaces in
|
sense of the Treaty of Babel rules on leading and trailing spaces in
|
||||||
iFiction records.
|
iFiction records.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Characters::is_babel_whitespace(int c) {
|
int Characters::is_babel_whitespace(int c) {
|
||||||
if ((c == ' ') || (c == '\t') || (c == '\x0a')
|
if ((c == ' ') [[| (c == '\t') |]] (c == '\x0a')
|
||||||
|| (c == '\x0d') || (c == NEWLINE_IN_STRING)) return TRUE;
|
[[| (c == '\x0d') |]] (c == NEWLINE_IN_STRING)) return TRUE;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Unicode composition.
|
@ \section{Unicode composition.}
|
||||||
A routine which converts the Unicode combining accents with letters,
|
A routine which converts the Unicode combining accents with letters,
|
||||||
sufficient correctly to handle all characters in the ZSCII set.
|
sufficient correctly to handle all characters in the ZSCII set.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Characters::combine_accent(int accent, int letter) {
|
int Characters::combine_accent(int accent, int letter) {
|
||||||
switch(accent) {
|
switch(accent) {
|
||||||
case 0x0300: /* Unicode combining grave */
|
case 0x0300: /* Unicode combining grave */
|
||||||
|
@ -112,11 +112,11 @@ int Characters::combine_accent(int accent, int letter) {
|
||||||
return '?';
|
return '?';
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Accent stripping.
|
@ \section{Accent stripping.}
|
||||||
It's occasionally useful to simplify text used as a filename by removing
|
It's occasionally useful to simplify text used as a filename by removing
|
||||||
the more obvious accents from it.
|
the more obvious accents from it.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Characters::make_filename_safe(int charcode) {
|
int Characters::make_filename_safe(int charcode) {
|
||||||
charcode = Characters::remove_accent(charcode);
|
charcode = Characters::remove_accent(charcode);
|
||||||
if (charcode >= 128) charcode = '-';
|
if (charcode >= 128) charcode = '-';
|
||||||
|
@ -131,7 +131,7 @@ wchar_t Characters::make_wchar_t_filename_safe(wchar_t charcode) {
|
||||||
|
|
||||||
@ The following strips the accent, if present, from an ISO Latin-1 character:
|
@ The following strips the accent, if present, from an ISO Latin-1 character:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Characters::remove_accent(int charcode) {
|
int Characters::remove_accent(int charcode) {
|
||||||
switch (charcode) {
|
switch (charcode) {
|
||||||
case 0xC0: case 0xC1: case 0xC2: case 0xC3:
|
case 0xC0: case 0xC1: case 0xC2: case 0xC3:
|
||||||
|
@ -165,7 +165,7 @@ wchar_t Characters::remove_wchar_t_accent(wchar_t charcode) {
|
||||||
|
|
||||||
@ This will do until we properly use Unicode character classes some day:
|
@ This will do until we properly use Unicode character classes some day:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Characters::isalphabetic(int letter) {
|
int Characters::isalphabetic(int letter) {
|
||||||
return Characters::isalpha((wchar_t) Characters::remove_accent(letter));
|
return Characters::isalpha((wchar_t) Characters::remove_accent(letter));
|
||||||
}
|
}
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
To read, validate and write JSON data interchange material.
|
To read, validate and write JSON data interchange material.
|
||||||
|
|
||||||
@h Introduction.
|
@ \section{Introduction.}
|
||||||
JSON (Douglas Crockford, c. 2000) stands for "JavaScript Object Notation", but is
|
JSON (Douglas Crockford, c. 2000) stands for "JavaScript Object Notation", but is
|
||||||
now a //standardised data interchange format -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//
|
now a //standardised data interchange format -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//
|
||||||
used in many contexts. It's especially suitable for passing small amounts of data
|
used in many contexts. It's especially suitable for passing small amounts of data
|
||||||
|
@ -13,17 +13,17 @@ or preference files.
|
||||||
This section provides encoding and decoding facilities. It is intended to comply
|
This section provides encoding and decoding facilities. It is intended to comply
|
||||||
with //ECMA-404 -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//,
|
with //ECMA-404 -> https://www.ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf//,
|
||||||
except that (i) it disallows repetition the same key in the same object, and (ii)
|
except that (i) it disallows repetition the same key in the same object, and (ii)
|
||||||
text can only be used in the Basic Multilingual Plane of Unicode points |0x0000|
|
text can only be used in the Basic Multilingual Plane of Unicode points [[0x0000]]
|
||||||
to |0xffff|.
|
to [[0xffff]].
|
||||||
|
|
||||||
There are no size maxima or limitations. Still, this code was written at typing speed,
|
There are no size maxima or limitations. Still, this code was written at typing speed,
|
||||||
and no effort has gone into reducing memory usage or running time in the face of
|
and no effort has gone into reducing memory usage or running time in the face of
|
||||||
large (or malicious) JSON content. Error reporting is also limited in fulsomeness.
|
large (or malicious) JSON content. Error reporting is also limited in fulsomeness.
|
||||||
|
|
||||||
See the |foundation-test| test case |json| for many exercises of the code below;
|
See the [[foundation-test]] test case [[json]] for many exercises of the code below;
|
||||||
do not change this section without checking that it continues to pass.
|
do not change this section without checking that it continues to pass.
|
||||||
|
|
||||||
@h Data model.
|
@ \section{Data model.}
|
||||||
JSON has a simple data model which we need to replicate in memory. Each value
|
JSON has a simple data model which we need to replicate in memory. Each value
|
||||||
will be a pointer to a (permanently held in memory) //JSON_value// object.
|
will be a pointer to a (permanently held in memory) //JSON_value// object.
|
||||||
This is in effect a union, in that its type is always one of the following,
|
This is in effect a union, in that its type is always one of the following,
|
||||||
|
@ -31,19 +31,20 @@ and then only certain elements are meaningful depending on type.
|
||||||
|
|
||||||
These are exactly the JSON types except that numbers are split between integer
|
These are exactly the JSON types except that numbers are split between integer
|
||||||
and floating-point versions (the conflation of the two is where the Javascript
|
and floating-point versions (the conflation of the two is where the Javascript
|
||||||
origins of JSON show through), and that the type |ERROR_JSONTYPE| represents
|
origins of JSON show through), and that the type [[ERROR_JSONTYPE]] represents
|
||||||
invalid data resulting from attempting to decode erroneous JSON.
|
invalid data resulting from attempting to decode erroneous JSON.
|
||||||
|
|
||||||
@e NUMBER_JSONTYPE from 1
|
<<*>>=
|
||||||
@e DOUBLE_JSONTYPE
|
enum NUMBER_JSONTYPE from 1
|
||||||
@e STRING_JSONTYPE
|
enum DOUBLE_JSONTYPE
|
||||||
@e BOOLEAN_JSONTYPE
|
enum STRING_JSONTYPE
|
||||||
@e ARRAY_JSONTYPE
|
enum BOOLEAN_JSONTYPE
|
||||||
@e OBJECT_JSONTYPE
|
enum ARRAY_JSONTYPE
|
||||||
@e NULL_JSONTYPE
|
enum OBJECT_JSONTYPE
|
||||||
@e ERROR_JSONTYPE
|
enum NULL_JSONTYPE
|
||||||
|
enum ERROR_JSONTYPE
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void JSON::write_type(OUTPUT_STREAM, int t) {
|
void JSON::write_type(OUTPUT_STREAM, int t) {
|
||||||
switch (t) {
|
switch (t) {
|
||||||
case NUMBER_JSONTYPE: WRITE("number"); break;
|
case NUMBER_JSONTYPE: WRITE("number"); break;
|
||||||
|
@ -60,23 +61,23 @@ void JSON::write_type(OUTPUT_STREAM, int t) {
|
||||||
|
|
||||||
@
|
@
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct JSON_value {
|
typedef struct JSON_value {
|
||||||
int JSON_type;
|
int JSON_type;
|
||||||
int if_integer;
|
int if_integer;
|
||||||
double if_double;
|
double if_double;
|
||||||
struct text_stream *if_string;
|
struct text_stream *if_string;
|
||||||
int if_boolean;
|
int if_boolean;
|
||||||
struct linked_list *if_list; /* of |JSON_value| */
|
struct linked_list *if_list; /* of [[JSON_value]] */
|
||||||
struct dictionary *dictionary_if_object; /* to |JSON_value| */
|
struct dictionary *dictionary_if_object; /* to [[JSON_value]] */
|
||||||
struct linked_list *list_if_object; /* of |text_stream| */
|
struct linked_list *list_if_object; /* of [[text_stream]] */
|
||||||
struct text_stream *if_error;
|
struct text_stream *if_error;
|
||||||
CLASS_DEFINITION
|
CLASS_DEFINITION
|
||||||
} JSON_value;
|
} JSON_value;
|
||||||
|
|
||||||
@ Now some constructor functions to create data of each JSON type:
|
@ Now some constructor functions to create data of each JSON type:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::new_null(void) {
|
JSON_value *JSON::new_null(void) {
|
||||||
JSON_value *value = CREATE(JSON_value);
|
JSON_value *value = CREATE(JSON_value);
|
||||||
value->JSON_type = NULL_JSONTYPE;
|
value->JSON_type = NULL_JSONTYPE;
|
||||||
|
@ -123,7 +124,7 @@ JSON_value *JSON::new_string(text_stream *S) {
|
||||||
@ JSON arrays -- lists, in effect -- should be created in an empty state, and
|
@ JSON arrays -- lists, in effect -- should be created in an empty state, and
|
||||||
then have entries added sequentially:
|
then have entries added sequentially:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::new_array(void) {
|
JSON_value *JSON::new_array(void) {
|
||||||
JSON_value *value = JSON::new_null();
|
JSON_value *value = JSON::new_null();
|
||||||
value->JSON_type = ARRAY_JSONTYPE;
|
value->JSON_type = ARRAY_JSONTYPE;
|
||||||
|
@ -144,7 +145,7 @@ JSON_value *JSON::add_to_array(JSON_value *array, JSON_value *new_entry) {
|
||||||
@ Similarly, JSON objects -- dictionaries of key-value pairs, in effect --
|
@ Similarly, JSON objects -- dictionaries of key-value pairs, in effect --
|
||||||
should be created in an empty state, and then have key-value pairs added as needed:
|
should be created in an empty state, and then have key-value pairs added as needed:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::new_object(void) {
|
JSON_value *JSON::new_object(void) {
|
||||||
JSON_value *value = JSON::new_null();
|
JSON_value *value = JSON::new_null();
|
||||||
value->JSON_type = OBJECT_JSONTYPE;
|
value->JSON_type = OBJECT_JSONTYPE;
|
||||||
|
@ -166,10 +167,10 @@ JSON_value *JSON::add_to_object(JSON_value *obj, text_stream *key, JSON_value *v
|
||||||
return obj;
|
return obj;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ The following looks up a key in an object, returning |NULL| if and only if
|
@ The following looks up a key in an object, returning [[NULL]] if and only if
|
||||||
it is not present:
|
it is not present:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) {
|
JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) {
|
||||||
if (obj == NULL) internal_error("no object");
|
if (obj == NULL) internal_error("no object");
|
||||||
if (obj->JSON_type == ERROR_JSONTYPE) return NULL;
|
if (obj->JSON_type == ERROR_JSONTYPE) return NULL;
|
||||||
|
@ -182,7 +183,7 @@ JSON_value *JSON::look_up_object(JSON_value *obj, text_stream *key) {
|
||||||
@ One last constructor creates an invalid JSON value resulting from incorrect
|
@ One last constructor creates an invalid JSON value resulting from incorrect
|
||||||
JSON input:
|
JSON input:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::error(text_stream *msg) {
|
JSON_value *JSON::error(text_stream *msg) {
|
||||||
JSON_value *value = JSON::new_null();
|
JSON_value *value = JSON::new_null();
|
||||||
value->JSON_type = ERROR_JSONTYPE;
|
value->JSON_type = ERROR_JSONTYPE;
|
||||||
|
@ -193,7 +194,7 @@ JSON_value *JSON::error(text_stream *msg) {
|
||||||
@ This is a very limited form of comparison, since it cannot test equality
|
@ This is a very limited form of comparison, since it cannot test equality
|
||||||
of arrays or objects.
|
of arrays or objects.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int JSON::eq(JSON_value *val1, JSON_value *val2) {
|
int JSON::eq(JSON_value *val1, JSON_value *val2) {
|
||||||
if ((val1 == NULL) && (val2)) return FALSE;
|
if ((val1 == NULL) && (val2)) return FALSE;
|
||||||
if ((val1) && (val2 == NULL)) return FALSE;
|
if ((val1) && (val2 == NULL)) return FALSE;
|
||||||
|
@ -208,16 +209,16 @@ int JSON::eq(JSON_value *val1, JSON_value *val2) {
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Decoding JSON.
|
@ \section{Decoding JSON.}
|
||||||
We do no actual file-handling in this section, but the following decoder can
|
We do no actual file-handling in this section, but the following decoder can
|
||||||
be pointed to the contents of UTF-8 text file as needed.
|
be pointed to the contents of UTF-8 text file as needed.
|
||||||
|
|
||||||
The decoder returns a non-|NULL| pointer in all cases. If the text contains
|
The decoder returns a non-[[NULL]] pointer in all cases. If the text contains
|
||||||
any malformed JSON anywhere inside it, this pointer will be to a value of type
|
any malformed JSON anywhere inside it, this pointer will be to a value of type
|
||||||
|ERROR_JSONTYPE|. Such a value should be thrown away as soon as the error
|
[[ERROR_JSONTYPE]]. Such a value should be thrown away as soon as the error
|
||||||
message is made use of.
|
message is made use of.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::decode(text_stream *T, text_file_position *tfp) {
|
JSON_value *JSON::decode(text_stream *T, text_file_position *tfp) {
|
||||||
return JSON::decode_range(T, 0, Str::len(T), tfp);
|
return JSON::decode_range(T, 0, Str::len(T), tfp);
|
||||||
}
|
}
|
||||||
|
@ -248,19 +249,19 @@ JSON_value *JSON::decode_error_q(text_stream *err, text_file_position *tfp,
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ This decodes the text in the character position range |[from, to)| as a
|
@ This decodes the text in the character position range [[[from, to)]] as a
|
||||||
JSON value.
|
JSON value.
|
||||||
|
|
||||||
The possibilities here are |[ ... ]| for an array, |{ ... }| for an object,
|
The possibilities here are [[[ ... ]]] for an array, [[{ ... }]] for an object,
|
||||||
|"..."| for a string, a token beginning with a digit or a minus sign for a
|
[["..."]] for a string, a token beginning with a digit or a minus sign for a
|
||||||
number (note that |+| and |.| are not allowed to open a number according to
|
number (note that [[+]] and [[.]] are not allowed to open a number according to
|
||||||
the JSON standard), and the special cases |true|, |false| and |null|.
|
the JSON standard), and the special cases [[true]], [[false]] and [[null]].
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_position *tfp) {
|
JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_position *tfp) {
|
||||||
int first_nws = -1, last_nws = -1;
|
int first_nws = -1, last_nws = -1;
|
||||||
wchar_t first_c = 0, last_c = 0;
|
wchar_t first_c = 0, last_c = 0;
|
||||||
@<Find the first and last non-whitespace character@>;
|
<<Find the first and last non-whitespace character>>;
|
||||||
switch (first_c) {
|
switch (first_c) {
|
||||||
case '[':
|
case '[':
|
||||||
if (last_c != ']') return JSON::decode_error(I"mismatched '[' ... ']'", tfp);
|
if (last_c != ']') return JSON::decode_error(I"mismatched '[' ... ']'", tfp);
|
||||||
|
@ -274,7 +275,7 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit
|
||||||
if (last_c != '"') return JSON::decode_error(I"mismatched quotation marks", tfp);
|
if (last_c != '"') return JSON::decode_error(I"mismatched quotation marks", tfp);
|
||||||
return JSON::decode_string(T, first_nws+1, last_nws, tfp);
|
return JSON::decode_string(T, first_nws+1, last_nws, tfp);
|
||||||
}
|
}
|
||||||
if ((Characters::isdigit(first_c)) || (first_c == '-'))
|
if ((Characters::isdigit(first_c)) [[]] (first_c == '-'))
|
||||||
return JSON::decode_number(T, first_nws, last_nws+1, tfp);
|
return JSON::decode_number(T, first_nws, last_nws+1, tfp);
|
||||||
if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3))
|
if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3))
|
||||||
return JSON::new_boolean(TRUE);
|
return JSON::new_boolean(TRUE);
|
||||||
|
@ -285,7 +286,7 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit
|
||||||
return JSON::decode_error(I"unknown JSON value", tfp);
|
return JSON::decode_error(I"unknown JSON value", tfp);
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Find the first and last non-whitespace character@> =
|
<<Find the first and last non-whitespace character>>=
|
||||||
for (int i=from; i<to; i++)
|
for (int i=from; i<to; i++)
|
||||||
if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) {
|
if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) {
|
||||||
first_nws = i; break;
|
first_nws = i; break;
|
||||||
|
@ -299,11 +300,11 @@ JSON_value *JSON::decode_range(text_stream *T, int from, int to, text_file_posit
|
||||||
last_c = Str::get_at(T, last_nws);
|
last_c = Str::get_at(T, last_nws);
|
||||||
|
|
||||||
@ So now we have individual decoder functions for each type. First, arrays, where
|
@ So now we have individual decoder functions for each type. First, arrays, where
|
||||||
now the range |[from, to)| represents what is inside the square brackets: this
|
now the range [[[from, to)]] represents what is inside the square brackets: this
|
||||||
needs to be a comma-separated list. We follow ECMA strictly in disallowing a final
|
needs to be a comma-separated list. We follow ECMA strictly in disallowing a final
|
||||||
comma before the |]|, unlike some JSON-like parsers.
|
comma before the [[]]], unlike some JSON-like parsers.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::decode_array(JSON_value *array, text_stream *T, int from, int to,
|
JSON_value *JSON::decode_array(JSON_value *array, text_stream *T, int from, int to,
|
||||||
text_file_position *tfp) {
|
text_file_position *tfp) {
|
||||||
int content = FALSE;
|
int content = FALSE;
|
||||||
|
@ -340,7 +341,7 @@ JSON_value *JSON::decode_array_entry(JSON_value *array, text_stream *T, int from
|
||||||
|
|
||||||
@ And similarly for objects.
|
@ And similarly for objects.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int to,
|
JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int to,
|
||||||
text_file_position *tfp) {
|
text_file_position *tfp) {
|
||||||
int content = FALSE;
|
int content = FALSE;
|
||||||
|
@ -370,13 +371,13 @@ JSON_value *JSON::decode_object(JSON_value *obj, text_stream *T, int from, int t
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Note that we allow key names to include all kinds of unconscionable garbage,
|
@ Note that we allow key names to include all kinds of unconscionable garbage,
|
||||||
as ECMA requires. |\u0003\"\t\t\t| is a valid JSON key name; so is the empty string.
|
as ECMA requires. [[\u0003\"\t\t\t]] is a valid JSON key name; so is the empty string.
|
||||||
|
|
||||||
We are however slightly stricter than ECMA in that we disallow duplicate keys
|
We are however slightly stricter than ECMA in that we disallow duplicate keys
|
||||||
in the same object. ECMA says this is a "semantic consideration that may be defined
|
in the same object. ECMA says this is a "semantic consideration that may be defined
|
||||||
by JSON processors". We are hereby defining it.
|
by JSON processors". We are hereby defining it.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from, int to,
|
JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from, int to,
|
||||||
text_file_position *tfp) {
|
text_file_position *tfp) {
|
||||||
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
||||||
|
@ -399,7 +400,7 @@ JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from,
|
||||||
}
|
}
|
||||||
if (ended == FALSE) return JSON::decode_error_q(I"key does not end with quotation mark", tfp, T, saved_from, saved_to);
|
if (ended == FALSE) return JSON::decode_error_q(I"key does not end with quotation mark", tfp, T, saved_from, saved_to);
|
||||||
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
||||||
if ((from >= to) || (Str::get_at(T, from) != ':'))
|
if ((from >= to) [[]] (Str::get_at(T, from) != ':'))
|
||||||
return JSON::decode_error_q(I"key is not followed by ':'", tfp, T, saved_from, saved_to);
|
return JSON::decode_error_q(I"key is not followed by ':'", tfp, T, saved_from, saved_to);
|
||||||
from++;
|
from++;
|
||||||
if (JSON::look_up_object(obj, key)) return JSON::decode_error_q(I"duplicate key", tfp, T, saved_from, saved_to);
|
if (JSON::look_up_object(obj, key)) return JSON::decode_error_q(I"duplicate key", tfp, T, saved_from, saved_to);
|
||||||
|
@ -412,9 +413,9 @@ JSON_value *JSON::decode_object_entry(JSON_value *obj, text_stream *T, int from,
|
||||||
@ Numbers are annoying to decode since they can be given either in a restricted
|
@ Numbers are annoying to decode since they can be given either in a restricted
|
||||||
floating-point syntax, or in decimal. ECMA is slippery on the question of exactly
|
floating-point syntax, or in decimal. ECMA is slippery on the question of exactly
|
||||||
what floating-point numbers can be represented, but it's common to consider
|
what floating-point numbers can be represented, but it's common to consider
|
||||||
them as being |double|, so we'll follow suit.
|
them as being [[double]], so we'll follow suit.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_position *tfp) {
|
JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_position *tfp) {
|
||||||
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
||||||
while ((to > from) && (Characters::is_whitespace(Str::get_at(T, to-1)))) to--;
|
while ((to > from) && (Characters::is_whitespace(Str::get_at(T, to-1)))) to--;
|
||||||
|
@ -426,8 +427,8 @@ JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_posi
|
||||||
for (int i=at; i<to; i++)
|
for (int i=at; i<to; i++)
|
||||||
if (Characters::isdigit(Str::get_at(T, i)))
|
if (Characters::isdigit(Str::get_at(T, i)))
|
||||||
PUT_TO(integer, Str::get_at(T, i));
|
PUT_TO(integer, Str::get_at(T, i));
|
||||||
else if ((Str::get_at(T, i) == 'E') || (Str::get_at(T, i) == 'e') ||
|
else if ((Str::get_at(T, i) == 'E') [[| (Str::get_at(T, i) == 'e') |]]
|
||||||
(Str::get_at(T, i) == '.') || (Str::get_at(T, i) == '+'))
|
(Str::get_at(T, i) == '.') [[]] (Str::get_at(T, i) == '+'))
|
||||||
double_me = TRUE;
|
double_me = TRUE;
|
||||||
else
|
else
|
||||||
return JSON::decode_error(I"number is not a decimal integer", tfp);
|
return JSON::decode_error(I"number is not a decimal integer", tfp);
|
||||||
|
@ -451,7 +452,7 @@ JSON_value *JSON::decode_number(text_stream *T, int from, int to, text_file_posi
|
||||||
@ Strings are easy except for escape characters. I have no idea why JSON wants
|
@ Strings are easy except for escape characters. I have no idea why JSON wants
|
||||||
to allow the escaping of forward slash, but the standard requires it.
|
to allow the escaping of forward slash, but the standard requires it.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_position *tfp) {
|
JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_position *tfp) {
|
||||||
TEMPORARY_TEXT(string)
|
TEMPORARY_TEXT(string)
|
||||||
for (int i=from; i<to; i++) {
|
for (int i=from; i<to; i++) {
|
||||||
|
@ -469,7 +470,7 @@ JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_posi
|
||||||
case '\\': break;
|
case '\\': break;
|
||||||
case '/': break;
|
case '/': break;
|
||||||
case '"': break;
|
case '"': break;
|
||||||
case 'u': @<Decode a hexadecimal Unicode escape@>; break;
|
case 'u': <<Decode a hexadecimal Unicode escape>>; break;
|
||||||
default: return JSON::decode_error(I"bad '\\' escape in string", tfp);
|
default: return JSON::decode_error(I"bad '\\' escape in string", tfp);
|
||||||
}
|
}
|
||||||
PUT_TO(string, c);
|
PUT_TO(string, c);
|
||||||
|
@ -484,9 +485,9 @@ JSON_value *JSON::decode_string(text_stream *T, int from, int to, text_file_posi
|
||||||
|
|
||||||
@ We don't quite fully implement ECMA here: the following is fine for code points
|
@ We don't quite fully implement ECMA here: the following is fine for code points
|
||||||
in the Basic Multilingual Plane, but we don't handle the curious UTF-16 surrogate pair
|
in the Basic Multilingual Plane, but we don't handle the curious UTF-16 surrogate pair
|
||||||
rule for code points between |0x10000| and |0x10fff|.
|
rule for code points between [[0x10000]] and [[0x10fff]].
|
||||||
|
|
||||||
@<Decode a hexadecimal Unicode escape@> =
|
<<Decode a hexadecimal Unicode escape>>=
|
||||||
if (i+4 >= to) return JSON::decode_error(I"incomplete '\\u' escape", tfp);
|
if (i+4 >= to) return JSON::decode_error(I"incomplete '\\u' escape", tfp);
|
||||||
int hex = 0;
|
int hex = 0;
|
||||||
for (int j=0; j<4; j++) {
|
for (int j=0; j<4; j++) {
|
||||||
|
@ -501,9 +502,9 @@ rule for code points between |0x10000| and |0x10fff|.
|
||||||
c = (wchar_t) hex;
|
c = (wchar_t) hex;
|
||||||
i += 4;
|
i += 4;
|
||||||
|
|
||||||
@h Encoding JSON.
|
@ \section{Encoding JSON.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void JSON::encode(OUTPUT_STREAM, JSON_value *J) {
|
void JSON::encode(OUTPUT_STREAM, JSON_value *J) {
|
||||||
if (J == NULL) internal_error("no JSON value supplied");
|
if (J == NULL) internal_error("no JSON value supplied");
|
||||||
switch (J->JSON_type) {
|
switch (J->JSON_type) {
|
||||||
|
@ -563,7 +564,7 @@ void JSON::encode(OUTPUT_STREAM, JSON_value *J) {
|
||||||
@ Note that we elect not to escape the slash character, or any Unicode code
|
@ Note that we elect not to escape the slash character, or any Unicode code
|
||||||
points above 32.
|
points above 32.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void JSON::encode_string(OUTPUT_STREAM, text_stream *T) {
|
void JSON::encode_string(OUTPUT_STREAM, text_stream *T) {
|
||||||
LOOP_THROUGH_TEXT(pos, T) {
|
LOOP_THROUGH_TEXT(pos, T) {
|
||||||
wchar_t c = Str::get(pos);
|
wchar_t c = Str::get(pos);
|
||||||
|
@ -582,7 +583,7 @@ void JSON::encode_string(OUTPUT_STREAM, text_stream *T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Requirements.
|
@ \section{Requirements.}
|
||||||
Of course, the trouble with JSON is that it's a soup of undifferentiated data.
|
Of course, the trouble with JSON is that it's a soup of undifferentiated data.
|
||||||
Just because you're expecting a pair of numbers, there's no reason to suppose
|
Just because you're expecting a pair of numbers, there's no reason to suppose
|
||||||
that's what you've been given.
|
that's what you've been given.
|
||||||
|
@ -591,9 +592,9 @@ A //JSON_requirement// is a sort of JSON schema: a specification for the structu
|
||||||
of a //JSON_value//. At the top level, it's a list of one or more equally
|
of a //JSON_value//. At the top level, it's a list of one or more equally
|
||||||
good alternative specifications. Note that the empty list is not allowed.
|
good alternative specifications. Note that the empty list is not allowed.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct JSON_requirement {
|
typedef struct JSON_requirement {
|
||||||
struct linked_list *alternatives; /* of |JSON_single_requirement| */
|
struct linked_list *alternatives; /* of [[JSON_single_requirement]] */
|
||||||
CLASS_DEFINITION
|
CLASS_DEFINITION
|
||||||
} JSON_requirement;
|
} JSON_requirement;
|
||||||
|
|
||||||
|
@ -615,7 +616,7 @@ JSON_requirement *JSON::add_alternative(JSON_requirement *so_far,
|
||||||
if Javascript actually had types. It can communicate something like "a number"
|
if Javascript actually had types. It can communicate something like "a number"
|
||||||
or "a list of strings"; but it can also say "the value has to be exactly this".
|
or "a list of strings"; but it can also say "the value has to be exactly this".
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct JSON_single_requirement {
|
typedef struct JSON_single_requirement {
|
||||||
struct JSON_requirement *this_requirement;
|
struct JSON_requirement *this_requirement;
|
||||||
struct JSON_value *this_value;
|
struct JSON_value *this_value;
|
||||||
|
@ -623,10 +624,10 @@ typedef struct JSON_single_requirement {
|
||||||
CLASS_DEFINITION
|
CLASS_DEFINITION
|
||||||
} JSON_single_requirement;
|
} JSON_single_requirement;
|
||||||
|
|
||||||
@ Exactly one of |this_requirement|, |this_value| and |this_type| should be
|
@ Exactly one of [[this_requirement]], [[this_value]] and [[this_type]] should be
|
||||||
non-|NULL|, so we have one constructor function for each case:
|
non-[[NULL]], so we have one constructor function for each case:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_single_requirement *JSON::require_requirement(JSON_requirement *req) {
|
JSON_single_requirement *JSON::require_requirement(JSON_requirement *req) {
|
||||||
JSON_single_requirement *sing = CREATE(JSON_single_requirement);
|
JSON_single_requirement *sing = CREATE(JSON_single_requirement);
|
||||||
sing->this_requirement = req;
|
sing->this_requirement = req;
|
||||||
|
@ -653,15 +654,15 @@ JSON_single_requirement *JSON::require_type(int t) {
|
||||||
|
|
||||||
@ JSON types, in our model, look very like //JSON_value//s.
|
@ JSON types, in our model, look very like //JSON_value//s.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct JSON_type {
|
typedef struct JSON_type {
|
||||||
int JSON_type;
|
int JSON_type;
|
||||||
|
|
||||||
struct linked_list *if_list; /* of |JSON_requirement| */
|
struct linked_list *if_list; /* of [[JSON_requirement]] */
|
||||||
struct JSON_requirement *all_if_list;
|
struct JSON_requirement *all_if_list;
|
||||||
|
|
||||||
struct dictionary *dictionary_if_object; /* to |JSON_pair_requirement| */
|
struct dictionary *dictionary_if_object; /* to [[JSON_pair_requirement]] */
|
||||||
struct linked_list *list_if_object; /* of |text_stream| */
|
struct linked_list *list_if_object; /* of [[text_stream]] */
|
||||||
|
|
||||||
struct text_stream *if_error;
|
struct text_stream *if_error;
|
||||||
CLASS_DEFINITION
|
CLASS_DEFINITION
|
||||||
|
@ -698,9 +699,9 @@ JSON_type *JSON::new_type_requirement(int t) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ A convenience for "the value must be an array of any number of entries, each
|
@ A convenience for "the value must be an array of any number of entries, each
|
||||||
of which meets the requirement |E_req|":
|
of which meets the requirement [[E_req]]":
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) {
|
JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) {
|
||||||
JSON_single_requirement *req = JSON::require_type(ARRAY_JSONTYPE);
|
JSON_single_requirement *req = JSON::require_type(ARRAY_JSONTYPE);
|
||||||
req->this_type->all_if_list = E_req;
|
req->this_type->all_if_list = E_req;
|
||||||
|
@ -708,22 +709,22 @@ JSON_single_requirement *JSON::require_array_of(JSON_requirement *E_req) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ If an array wants to be a tuple with a fixed number of entries, each with
|
@ If an array wants to be a tuple with a fixed number of entries, each with
|
||||||
its own requirement, then instead call |JSON::require_type(ARRAY_JSONTYPE)| and
|
its own requirement, then instead call [[JSON::require_type(ARRAY_JSONTYPE)]] and
|
||||||
then make a number of calls to the following in sequence:
|
then make a number of calls to the following in sequence:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void JSON::require_entry(JSON_single_requirement *array_sr, JSON_requirement *entry_sr) {
|
void JSON::require_entry(JSON_single_requirement *array_sr, JSON_requirement *entry_sr) {
|
||||||
if (array_sr == NULL) internal_error("no array");
|
if (array_sr == NULL) internal_error("no array");
|
||||||
if ((array_sr->this_type == NULL) ||
|
if ((array_sr->this_type == NULL) [[]]
|
||||||
(array_sr->this_type->JSON_type != ARRAY_JSONTYPE)) internal_error("not an array");
|
(array_sr->this_type->JSON_type != ARRAY_JSONTYPE)) internal_error("not an array");
|
||||||
if (entry_sr == NULL) internal_error("no new entry");
|
if (entry_sr == NULL) internal_error("no new entry");
|
||||||
ADD_TO_LINKED_LIST(entry_sr, JSON_requirement, array_sr->this_type->if_list);
|
ADD_TO_LINKED_LIST(entry_sr, JSON_requirement, array_sr->this_type->if_list);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Similarly, create an object requirement with |JSON::require_type(OBJECT_JSONTYPE)|
|
@ Similarly, create an object requirement with [[JSON::require_type(OBJECT_JSONTYPE)]]
|
||||||
and then either require or allow key-value pairs with:
|
and then either require or allow key-value pairs with:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void JSON::require_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_requirement *req) {
|
void JSON::require_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_requirement *req) {
|
||||||
JSON::require_pair_inner(obj_sr, key, req, FALSE);
|
JSON::require_pair_inner(obj_sr, key, req, FALSE);
|
||||||
}
|
}
|
||||||
|
@ -735,7 +736,7 @@ void JSON::allow_pair(JSON_single_requirement *obj_sr, text_stream *key, JSON_re
|
||||||
void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key,
|
void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key,
|
||||||
JSON_requirement *req, int opt) {
|
JSON_requirement *req, int opt) {
|
||||||
if (obj_sr == NULL) internal_error("no object");
|
if (obj_sr == NULL) internal_error("no object");
|
||||||
if ((obj_sr->this_type == NULL) ||
|
if ((obj_sr->this_type == NULL) [[]]
|
||||||
(obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object");
|
(obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object");
|
||||||
if (req == NULL) internal_error("no val req");
|
if (req == NULL) internal_error("no val req");
|
||||||
key = Str::duplicate(key);
|
key = Str::duplicate(key);
|
||||||
|
@ -747,13 +748,13 @@ void JSON::require_pair_inner(JSON_single_requirement *obj_sr, text_stream *key,
|
||||||
if (de) de->value = pr;
|
if (de) de->value = pr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ This then extracts the requirement on a given key, or returns |NULL| is if
|
@ This then extracts the requirement on a given key, or returns [[NULL]] is if
|
||||||
is not permitted:
|
is not permitted:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_stream *key) {
|
JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_stream *key) {
|
||||||
if (obj_sr == NULL) internal_error("no object");
|
if (obj_sr == NULL) internal_error("no object");
|
||||||
if ((obj_sr->this_type == NULL) ||
|
if ((obj_sr->this_type == NULL) [[]]
|
||||||
(obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object");
|
(obj_sr->this_type->JSON_type != OBJECT_JSONTYPE)) internal_error("not an object");
|
||||||
dict_entry *de = Dictionaries::find(obj_sr->this_type->dictionary_if_object, key);
|
dict_entry *de = Dictionaries::find(obj_sr->this_type->dictionary_if_object, key);
|
||||||
if (de == NULL) return NULL;
|
if (de == NULL) return NULL;
|
||||||
|
@ -763,25 +764,25 @@ JSON_pair_requirement *JSON::look_up_pair(JSON_single_requirement *obj_sr, text_
|
||||||
@ This is used when parsing textual requirements, to indicate a syntax error;
|
@ This is used when parsing textual requirements, to indicate a syntax error;
|
||||||
but it is not valid as a requirement itself.
|
but it is not valid as a requirement itself.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_single_requirement *JSON::error_sr(text_stream *msg) {
|
JSON_single_requirement *JSON::error_sr(text_stream *msg) {
|
||||||
JSON_single_requirement *req = JSON::require_type(ERROR_JSONTYPE);
|
JSON_single_requirement *req = JSON::require_type(ERROR_JSONTYPE);
|
||||||
req->this_type->if_error = Str::duplicate(msg);
|
req->this_type->if_error = Str::duplicate(msg);
|
||||||
return req;
|
return req;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Validation.
|
@ \section{Validation.}
|
||||||
To "validate" a JSON value is to determine that it meets some //JSON_requirement//.
|
To "validate" a JSON value is to determine that it meets some //JSON_requirement//.
|
||||||
|
|
||||||
The following returns |TRUE| if the value meets the requirement in full;
|
The following returns [[TRUE]] if the value meets the requirement in full;
|
||||||
if not, |FALSE|, and then if |errs| is not null, a list of error messages is
|
if not, [[FALSE]], and then if [[errs]] is not null, a list of error messages is
|
||||||
appended to the linked list |errs|.
|
appended to the linked list [[errs]].
|
||||||
|
|
||||||
The stack here is used to give better error messages by locating where the
|
The stack here is used to give better error messages by locating where the
|
||||||
problem was: e.g. |"object.coordinates[1]"| is the result of the stack
|
problem was: e.g. [["object.coordinates[1]"]] is the result of the stack
|
||||||
holding |"object" > ".cooordinates" > "[1]"|.
|
holding [["object" > ".cooordinates" > "[1]"]].
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int JSON::validate(JSON_value *val, JSON_requirement *req, linked_list *errs) {
|
int JSON::validate(JSON_value *val, JSON_requirement *req, linked_list *errs) {
|
||||||
lifo_stack *location = NEW_LIFO_STACK(text_stream);
|
lifo_stack *location = NEW_LIFO_STACK(text_stream);
|
||||||
if ((val) && (val->JSON_type == ARRAY_JSONTYPE)) {
|
if ((val) && (val->JSON_type == ARRAY_JSONTYPE)) {
|
||||||
|
@ -815,7 +816,7 @@ value must match one of the single requirements in the list. (We can stop as
|
||||||
soon as it has met one.) If it meets none of them, we produce error messages
|
soon as it has met one.) If it meets none of them, we produce error messages
|
||||||
for the reason it fails just the first.
|
for the reason it fails just the first.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs,
|
int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs,
|
||||||
lifo_stack *location) {
|
lifo_stack *location) {
|
||||||
if (val == NULL) internal_error("no value");
|
if (val == NULL) internal_error("no value");
|
||||||
|
@ -834,7 +835,7 @@ int JSON::validate_r(JSON_value *val, JSON_requirement *req, linked_list *errs,
|
||||||
|
|
||||||
@ Bad data always fails, and otherwise we split into the three cases.
|
@ Bad data always fails, and otherwise we split into the three cases.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
||||||
linked_list *errs, lifo_stack *location) {
|
linked_list *errs, lifo_stack *location) {
|
||||||
if (val->JSON_type == ERROR_JSONTYPE) {
|
if (val->JSON_type == ERROR_JSONTYPE) {
|
||||||
|
@ -842,16 +843,16 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
||||||
I"erroneous JSON value from parsing bad text", location);
|
I"erroneous JSON value from parsing bad text", location);
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
if (req->this_requirement) @<Validate against this requirement@>;
|
if (req->this_requirement) <<Validate against this requirement>>;
|
||||||
if (req->this_value) @<Validate against this value@>;
|
if (req->this_value) <<Validate against this value>>;
|
||||||
if (req->this_type) @<Validate against this type@>;
|
if (req->this_type) <<Validate against this type>>;
|
||||||
internal_error("bad single requirement");
|
internal_error("bad single requirement");
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Validate against this requirement@> =
|
<<Validate against this requirement>>=
|
||||||
return JSON::validate_r(val, req->this_requirement, errs, location);
|
return JSON::validate_r(val, req->this_requirement, errs, location);
|
||||||
|
|
||||||
@<Validate against this value@> =
|
<<Validate against this value>>=
|
||||||
if (JSON::eq(val, req->this_value) == FALSE) {
|
if (JSON::eq(val, req->this_value) == FALSE) {
|
||||||
TEMPORARY_TEXT(msg)
|
TEMPORARY_TEXT(msg)
|
||||||
WRITE_TO(msg, "value ");
|
WRITE_TO(msg, "value ");
|
||||||
|
@ -863,16 +864,16 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
||||||
}
|
}
|
||||||
return TRUE;
|
return TRUE;
|
||||||
|
|
||||||
@<Validate against this type@> =
|
<<Validate against this type>>=
|
||||||
@<Verify that the JSON type constructors match@>;
|
<<Verify that the JSON type constructors match>>;
|
||||||
int outcome = TRUE;
|
int outcome = TRUE;
|
||||||
if (val->JSON_type == ARRAY_JSONTYPE)
|
if (val->JSON_type == ARRAY_JSONTYPE)
|
||||||
@<Verify that the array entries meet requirements@>;
|
<<Verify that the array entries meet requirements>>;
|
||||||
if (val->JSON_type == OBJECT_JSONTYPE)
|
if (val->JSON_type == OBJECT_JSONTYPE)
|
||||||
@<Verify that the object members meet requirements@>;
|
<<Verify that the object members meet requirements>>;
|
||||||
return outcome;
|
return outcome;
|
||||||
|
|
||||||
@<Verify that the JSON type constructors match@> =
|
<<Verify that the JSON type constructors match>>=
|
||||||
if (val->JSON_type != req->this_type->JSON_type) {
|
if (val->JSON_type != req->this_type->JSON_type) {
|
||||||
if (errs) {
|
if (errs) {
|
||||||
TEMPORARY_TEXT(msg)
|
TEMPORARY_TEXT(msg)
|
||||||
|
@ -886,7 +887,7 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Verify that the array entries meet requirements@> =
|
<<Verify that the array entries meet requirements>>=
|
||||||
int count = 0;
|
int count = 0;
|
||||||
JSON_value *E;
|
JSON_value *E;
|
||||||
LOOP_OVER_LINKED_LIST(E, JSON_value, val->if_list) {
|
LOOP_OVER_LINKED_LIST(E, JSON_value, val->if_list) {
|
||||||
|
@ -912,19 +913,19 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Verify that the object members meet requirements@> =
|
<<Verify that the object members meet requirements>>=
|
||||||
text_stream *key;
|
text_stream *key;
|
||||||
LOOP_OVER_LINKED_LIST(key, text_stream, val->list_if_object)
|
LOOP_OVER_LINKED_LIST(key, text_stream, val->list_if_object)
|
||||||
@<Verify that the member with this key is allowed and contains the right data@>;
|
<<Verify that the member with this key is allowed and contains the right data>>;
|
||||||
LOOP_OVER_LINKED_LIST(key, text_stream, req->this_type->list_if_object) {
|
LOOP_OVER_LINKED_LIST(key, text_stream, req->this_type->list_if_object) {
|
||||||
JSON_pair_requirement *pr =
|
JSON_pair_requirement *pr =
|
||||||
Dictionaries::read_value(req->this_type->dictionary_if_object, key);
|
Dictionaries::read_value(req->this_type->dictionary_if_object, key);
|
||||||
if (pr == NULL) internal_error("broken JSON object requirement");
|
if (pr == NULL) internal_error("broken JSON object requirement");
|
||||||
if (pr->optional == FALSE)
|
if (pr->optional == FALSE)
|
||||||
@<Verify that the value object does provide this mandatory member@>;
|
<<Verify that the value object does provide this mandatory member>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Verify that the member with this key is allowed and contains the right data@> =
|
<<Verify that the member with this key is allowed and contains the right data>>=
|
||||||
JSON_value *E = Dictionaries::read_value(val->dictionary_if_object, key);
|
JSON_value *E = Dictionaries::read_value(val->dictionary_if_object, key);
|
||||||
if (E == NULL) internal_error("broken JSON object dictionary");
|
if (E == NULL) internal_error("broken JSON object dictionary");
|
||||||
JSON_pair_requirement *pr = JSON::look_up_pair(req, key);
|
JSON_pair_requirement *pr = JSON::look_up_pair(req, key);
|
||||||
|
@ -943,7 +944,7 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
||||||
POP_LIFO_STACK(text_stream, location);
|
POP_LIFO_STACK(text_stream, location);
|
||||||
DISCARD_TEXT(at)
|
DISCARD_TEXT(at)
|
||||||
|
|
||||||
@<Verify that the value object does provide this mandatory member@> =
|
<<Verify that the value object does provide this mandatory member>>=
|
||||||
JSON_value *E = JSON::look_up_object(val, key);
|
JSON_value *E = JSON::look_up_object(val, key);
|
||||||
if (E == NULL) {
|
if (E == NULL) {
|
||||||
TEMPORARY_TEXT(msg)
|
TEMPORARY_TEXT(msg)
|
||||||
|
@ -953,21 +954,20 @@ int JSON::validate_single_r(JSON_value *val, JSON_single_requirement *req,
|
||||||
outcome = FALSE;
|
outcome = FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Decoding JSON requirements.
|
@ \section{Decoding JSON requirements.}
|
||||||
It's convenient to be able to read and write these requirements to textual
|
It's convenient to be able to read and write these requirements to textual
|
||||||
form, exactly as we do with JSON itself, and here goes.
|
form, exactly as we do with JSON itself, and here goes.
|
||||||
|
|
||||||
This is an example of the syntax we parse. It's JSON except that
|
This is an example of the syntax we parse. It's JSON except that
|
||||||
(a) the type names |number|, |double|, |string|, |boolean| and |null| are
|
(a) the type names [[number]], [[double]], [[string]], [[boolean]] and [[null]] are
|
||||||
used in place of their respective values;
|
used in place of their respective values;
|
||||||
(b) a question mark |?| before the name of a key means that it is optional;
|
(b) a question mark [[?]] before the name of a key means that it is optional;
|
||||||
(c) if an array has one entry followed by an asterisk |*|, it means
|
(c) if an array has one entry followed by an asterisk [[*]], it means
|
||||||
"any number of entries, each of which must match this";
|
"any number of entries, each of which must match this";
|
||||||
(d) |<name>| refers to a requirement recorded in the |known_names| dictionary.
|
(d) [[<name>]] refers to a requirement recorded in the [[known_names]] dictionary.
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
= (text)
|
|
||||||
{
|
{
|
||||||
"coordinates": [ double, double, string ],
|
"coordinates": [ double, double, string ],
|
||||||
?"jurisdiction": string,
|
?"jurisdiction": string,
|
||||||
|
@ -976,28 +976,27 @@ For example:
|
||||||
"entry": string
|
"entry": string
|
||||||
}* ]
|
}* ]
|
||||||
}
|
}
|
||||||
=
|
|
||||||
|
|
||||||
This function is essentially the same as //JSON::decode//, but returning a
|
This function is essentially the same as //JSON::decode//, but returning a
|
||||||
requirement rather than a value.
|
requirement rather than a value.
|
||||||
|
|
||||||
Note that |known_names| can be |NULL| to have it not recognise any such names;
|
Note that [[known_names]] can be [[NULL]] to have it not recognise any such names;
|
||||||
there's no need to create an empty dictionary if this feature is unwanted.
|
there's no need to create an empty dictionary if this feature is unwanted.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_requirement *JSON::decode_req(text_stream *T, dictionary *known_names) {
|
JSON_requirement *JSON::decode_req(text_stream *T, dictionary *known_names) {
|
||||||
return JSON::decode_req_range(T, 0, Str::len(T), known_names);
|
return JSON::decode_req_range(T, 0, Str::len(T), known_names);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ This decodes the text in the character position range |[from, to)| as a
|
@ This decodes the text in the character position range [[[from, to)]] as a
|
||||||
JSON requirement.
|
JSON requirement.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_requirement *JSON::decode_req_range(text_stream *T, int from, int to,
|
JSON_requirement *JSON::decode_req_range(text_stream *T, int from, int to,
|
||||||
dictionary *known_names) {
|
dictionary *known_names) {
|
||||||
int first_nws = -1, last_nws = -1;
|
int first_nws = -1, last_nws = -1;
|
||||||
wchar_t first_c = 0, last_c = 0;
|
wchar_t first_c = 0, last_c = 0;
|
||||||
@<Find the first and last non-whitespace character in requirement@>;
|
<<Find the first and last non-whitespace character in requirement>>;
|
||||||
if (first_c == '(') {
|
if (first_c == '(') {
|
||||||
if (last_c != ')')
|
if (last_c != ')')
|
||||||
return JSON::single_choice(JSON::error_sr(I"mismatched '(' ... ')'"));
|
return JSON::single_choice(JSON::error_sr(I"mismatched '(' ... ')'"));
|
||||||
|
@ -1032,17 +1031,17 @@ JSON_requirement *JSON::decode_req_alternative(JSON_requirement *req, text_strea
|
||||||
return JSON::add_alternative(req, sing);
|
return JSON::add_alternative(req, sing);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Note that the keyword |null| is ambiguous in the grammar for JSON requirements:
|
@ Note that the keyword [[null]] is ambiguous in the grammar for JSON requirements:
|
||||||
does it mean "the value |null|", or does it mean "any value of the type |null|"?
|
does it mean "the value [[null]]", or does it mean "any value of the type [[null]]"?
|
||||||
This makes no difference because the type |null| admits only the value |null|, but
|
This makes no difference because the type [[null]] admits only the value [[null]], but
|
||||||
for what it's worth, we opt for the value.
|
for what it's worth, we opt for the value.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int to,
|
JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int to,
|
||||||
dictionary *known_names) {
|
dictionary *known_names) {
|
||||||
int first_nws = -1, last_nws = -1;
|
int first_nws = -1, last_nws = -1;
|
||||||
wchar_t first_c = 0, last_c = 0;
|
wchar_t first_c = 0, last_c = 0;
|
||||||
@<Find the first and last non-whitespace character in requirement@>;
|
<<Find the first and last non-whitespace character in requirement>>;
|
||||||
if (first_nws < 0) return JSON::error_sr(I"whitespace where requirement expected");
|
if (first_nws < 0) return JSON::error_sr(I"whitespace where requirement expected");
|
||||||
switch (first_c) {
|
switch (first_c) {
|
||||||
case '[':
|
case '[':
|
||||||
|
@ -1073,7 +1072,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t
|
||||||
|
|
||||||
int require_value = FALSE;
|
int require_value = FALSE;
|
||||||
|
|
||||||
if ((first_c == '"') || (first_c == '-') || (Characters::isdigit(first_c)))
|
if ((first_c == '"') [[| (first_c == '-') |]] (Characters::isdigit(first_c)))
|
||||||
require_value = TRUE;
|
require_value = TRUE;
|
||||||
if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3))
|
if ((Str::includes_at(T, first_nws, I"true")) && (last_nws - first_nws == 3))
|
||||||
require_value = TRUE;
|
require_value = TRUE;
|
||||||
|
@ -1110,7 +1109,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t
|
||||||
return JSON::error_sr(msg);
|
return JSON::error_sr(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Find the first and last non-whitespace character in requirement@> =
|
<<Find the first and last non-whitespace character in requirement>>=
|
||||||
for (int i=from; i<to; i++)
|
for (int i=from; i<to; i++)
|
||||||
if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) {
|
if (Characters::is_whitespace(Str::get_at(T, i)) == FALSE) {
|
||||||
first_nws = i; break;
|
first_nws = i; break;
|
||||||
|
@ -1124,7 +1123,7 @@ JSON_single_requirement *JSON::decode_sreq_range(text_stream *T, int from, int t
|
||||||
|
|
||||||
@ Array requirements:
|
@ Array requirements:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_single_requirement *JSON::decode_req_array(JSON_single_requirement *array_sr,
|
JSON_single_requirement *JSON::decode_req_array(JSON_single_requirement *array_sr,
|
||||||
text_stream *T, int from, int to, dictionary *known_names) {
|
text_stream *T, int from, int to, dictionary *known_names) {
|
||||||
int content = FALSE;
|
int content = FALSE;
|
||||||
|
@ -1167,7 +1166,7 @@ JSON_single_requirement *JSON::decode_req_array_entry(JSON_single_requirement *a
|
||||||
|
|
||||||
@ And similarly for objects.
|
@ And similarly for objects.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
JSON_single_requirement *JSON::decode_req_object(JSON_single_requirement *obj,
|
JSON_single_requirement *JSON::decode_req_object(JSON_single_requirement *obj,
|
||||||
text_stream *T, int from, int to, dictionary *known_names) {
|
text_stream *T, int from, int to, dictionary *known_names) {
|
||||||
int content = FALSE;
|
int content = FALSE;
|
||||||
|
@ -1218,7 +1217,7 @@ JSON_single_requirement *JSON::decode_req_object_entry(JSON_single_requirement *
|
||||||
}
|
}
|
||||||
if (ended == FALSE) return JSON::error_sr(I"key does not end with quotation mark");
|
if (ended == FALSE) return JSON::error_sr(I"key does not end with quotation mark");
|
||||||
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
while (Characters::is_whitespace(Str::get_at(T, from))) from++;
|
||||||
if ((from >= to) || (Str::get_at(T, from) != ':'))
|
if ((from >= to) [[]] (Str::get_at(T, from) != ':'))
|
||||||
return JSON::error_sr(I"key is not followed by ':'");
|
return JSON::error_sr(I"key is not followed by ':'");
|
||||||
from++;
|
from++;
|
||||||
if (JSON::look_up_pair(obj, key)) return JSON::error_sr(I"duplicate key");
|
if (JSON::look_up_pair(obj, key)) return JSON::error_sr(I"duplicate key");
|
||||||
|
@ -1229,21 +1228,21 @@ JSON_single_requirement *JSON::decode_req_object_entry(JSON_single_requirement *
|
||||||
return obj;
|
return obj;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Encoding JSON requirements.
|
@ \section{Encoding JSON requirements.}
|
||||||
This is now simple, with one caveat. It's possible to set up requirement trees
|
This is now simple, with one caveat. It's possible to set up requirement trees
|
||||||
so that they are not well-founded. For example:
|
so that they are not well-founded. For example:
|
||||||
|
|
||||||
= (text as InC)
|
|
||||||
JSON_single_requirement *set = JSON::require_type(ARRAY_JSONTYPE);
|
JSON_single_requirement *set = JSON::require_type(ARRAY_JSONTYPE);
|
||||||
set->all_if_list = JSON::single_choice(set);
|
set->all_if_list = JSON::single_choice(set);
|
||||||
=
|
|
||||||
|
|
||||||
This is not useless: it matches, say, |[]|, |[ [] ]| and |[ [], [ [] ] ]|
|
|
||||||
|
This is not useless: it matches, say, [[[]]], [[[ [] ]]] and [[[ [], [ [] ] ]]]
|
||||||
and other constructions giving amusement to set theorists. But it would cause
|
and other constructions giving amusement to set theorists. But it would cause
|
||||||
the following to hang. Note that requirements read in from files (see below)
|
the following to hang. Note that requirements read in from files (see below)
|
||||||
are always well-founded, and so do not have this issue.
|
are always well-founded, and so do not have this issue.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void JSON::encode_req(OUTPUT_STREAM, JSON_requirement *req) {
|
void JSON::encode_req(OUTPUT_STREAM, JSON_requirement *req) {
|
||||||
JSON::encode_req_r(OUT, req);
|
JSON::encode_req_r(OUT, req);
|
||||||
}
|
}
|
||||||
|
@ -1311,32 +1310,32 @@ void JSON::encode_type(OUTPUT_STREAM, JSON_type *type) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Reading requirements files.
|
@ \section{Reading requirements files.}
|
||||||
This convenient function reads in a set of requirements from a text file. Each
|
This convenient function reads in a set of requirements from a text file. Each
|
||||||
requirement should begin |<name> ::=|, and then continues until the next such
|
requirement should begin [[<name> ::=]], and then continues until the next such
|
||||||
header, or the end of the file. So for example:
|
header, or the end of the file. So for example:
|
||||||
= (text)
|
|
||||||
! My scheme for JSON files describing geographical locations
|
! My scheme for JSON files describing geographical locations
|
||||||
|
|
||||||
<optional-letter> ::= ( "alpha" | "beta" | null )
|
<optional-letter> ::= ( "alpha" [[ "beta" ]] null )
|
||||||
|
|
||||||
<position> ::= {
|
<position> ::= {
|
||||||
"category": <optional-letter>,
|
"category": <optional-letter>,
|
||||||
"latitude": double,
|
"latitude": double,
|
||||||
"longitude": double,
|
"longitude": double,
|
||||||
}
|
}
|
||||||
=
|
|
||||||
is a valid file declaring two requirements. Forward references are not allowed --
|
is a valid file declaring two requirements. Forward references are not allowed --
|
||||||
e.g., <position> can refer to <optional-letter> but not vice versa -- and
|
e.g., <position> can refer to <optional-letter> but not vice versa -- and
|
||||||
therefore the requirements read in will always be well-founded. Comments are
|
therefore the requirements read in will always be well-founded. Comments are
|
||||||
lines beginning with |!|; other than comments, only white space is permitted
|
lines beginning with [[!]]; other than comments, only white space is permitted
|
||||||
before the first requirement begins.
|
before the first requirement begins.
|
||||||
|
|
||||||
Note that the function //JSON::read_requirements_file// returns a dictionary
|
Note that the function //JSON::read_requirements_file// returns a dictionary
|
||||||
of the requirements it has read, by name (but without their angle-brackets):
|
of the requirements it has read, by name (but without their angle-brackets):
|
||||||
here, it would have two keys, |optional-letter| and |position|.
|
here, it would have two keys, [[optional-letter]] and [[position]].
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct JSON_rrf_state {
|
typedef struct JSON_rrf_state {
|
||||||
struct text_stream *name;
|
struct text_stream *name;
|
||||||
struct text_stream *defn;
|
struct text_stream *defn;
|
||||||
|
@ -1379,7 +1378,7 @@ void JSON::read_requirements_file_helper(text_stream *text, text_file_position *
|
||||||
@ This is called when the end of a definition is reached, either because another
|
@ This is called when the end of a definition is reached, either because another
|
||||||
is about to start, or because the end of the file has come:
|
is about to start, or because the end of the file has come:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void JSON::process_req_defn(JSON_rrf_state *state) {
|
void JSON::process_req_defn(JSON_rrf_state *state) {
|
||||||
if (Str::len(state->name) > 0) {
|
if (Str::len(state->name) > 0) {
|
||||||
JSON_requirement *req =
|
JSON_requirement *req =
|
|
@ -2,36 +2,36 @@
|
||||||
|
|
||||||
To provide a limited regular-expression parser.
|
To provide a limited regular-expression parser.
|
||||||
|
|
||||||
@h Character types.
|
@ \section{Character types.}
|
||||||
We will define white space as spaces and tabs only, since the various kinds
|
We will define white space as spaces and tabs only, since the various kinds
|
||||||
of line terminator will always be stripped out before this is applied.
|
of line terminator will always be stripped out before this is applied.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Regexp::white_space(int c) {
|
int Regexp::white_space(int c) {
|
||||||
if ((c == ' ') || (c == '\t')) return TRUE;
|
if ((c == ' ') [[]] (c == '\t')) return TRUE;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ The presence of |:| here is perhaps a bit surprising, since it's illegal in
|
@ The presence of [[:]] here is perhaps a bit surprising, since it's illegal in
|
||||||
C and has other meanings in other languages, but it's legal in C-for-Inform
|
C and has other meanings in other languages, but it's legal in C-for-Inform
|
||||||
identifiers.
|
identifiers.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Regexp::identifier_char(int c) {
|
int Regexp::identifier_char(int c) {
|
||||||
if ((c == '_') || (c == ':') ||
|
if ((c == '_') [[| (c == ':') |]]
|
||||||
((c >= 'A') && (c <= 'Z')) ||
|
((c >= 'A') && (c <= 'Z')) [[]]
|
||||||
((c >= 'a') && (c <= 'z')) ||
|
((c >= 'a') && (c <= 'z')) [[]]
|
||||||
((c >= '0') && (c <= '9'))) return TRUE;
|
((c >= '0') && (c <= '9'))) return TRUE;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Simple parsing.
|
@ \section{Simple parsing.}
|
||||||
The following finds the earliest minimal-length substring of a string,
|
The following finds the earliest minimal-length substring of a string,
|
||||||
delimited by two pairs of characters: for example, |<<| and |>>|. This could
|
delimited by two pairs of characters: for example, [[<<]] and [[>>]]. This could
|
||||||
easily be done as a regular expression using |Regexp::match|, but the routine
|
easily be done as a regular expression using [[Regexp::match]], but the routine
|
||||||
here is much quicker.
|
here is much quicker.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2,
|
int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2,
|
||||||
wchar_t off1, wchar_t off2, int *len) {
|
wchar_t off1, wchar_t off2, int *len) {
|
||||||
for (int i = 0; i < Str::len(text); i++)
|
for (int i = 0; i < Str::len(text); i++)
|
||||||
|
@ -47,7 +47,7 @@ int Regexp::find_expansion(text_stream *text, wchar_t on1, wchar_t on2,
|
||||||
|
|
||||||
@ Still more simply:
|
@ Still more simply:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Regexp::find_open_brace(text_stream *text) {
|
int Regexp::find_open_brace(text_stream *text) {
|
||||||
for (int i=0; i < Str::len(text); i++)
|
for (int i=0; i < Str::len(text); i++)
|
||||||
if (Str::get_at(text, i) == '{')
|
if (Str::get_at(text, i) == '{')
|
||||||
|
@ -56,9 +56,9 @@ int Regexp::find_open_brace(text_stream *text) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Note that we count the empty string as being white space. Again, this is
|
@ Note that we count the empty string as being white space. Again, this is
|
||||||
equivalent to |Regexp::match(p, " *")|, but much faster.
|
equivalent to [[Regexp::match(p, " *")]], but much faster.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Regexp::string_is_white_space(text_stream *text) {
|
int Regexp::string_is_white_space(text_stream *text) {
|
||||||
LOOP_THROUGH_TEXT(P, text)
|
LOOP_THROUGH_TEXT(P, text)
|
||||||
if (Regexp::white_space(Str::get(P)) == FALSE)
|
if (Regexp::white_space(Str::get(P)) == FALSE)
|
||||||
|
@ -66,46 +66,47 @@ int Regexp::string_is_white_space(text_stream *text) {
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h A Worse PCRE.
|
@ \section{A Worse PCRE.}
|
||||||
I originally wanted to call the function in this section |a_better_sscanf|, then
|
I originally wanted to call the function in this section [[a_better_sscanf]], then
|
||||||
thought perhaps |a_worse_PCRE| would be more true. (PCRE is Philip Hazel's superb
|
thought perhaps [[a_worse_PCRE]] would be more true. (PCRE is Philip Hazel's superb
|
||||||
C implementation of regular-expression parsing, but I didn't need its full strength,
|
C implementation of regular-expression parsing, but I didn't need its full strength,
|
||||||
and I didn't want to complicate the build process by linking to it.)
|
and I didn't want to complicate the build process by linking to it.)
|
||||||
|
|
||||||
This is a very minimal regular expression parser, simply for convenience of parsing
|
This is a very minimal regular expression parser, simply for convenience of parsing
|
||||||
short texts against particularly simple patterns. Here is an example of use:
|
short texts against particularly simple patterns. Here is an example of use:
|
||||||
= (text as code)
|
|
||||||
match_results mr = Regexp::create_mr();
|
match_results mr = Regexp::create_mr();
|
||||||
if (Regexp::match(&mr, text, L"fish (%d+) ([a-zA-Z_][a-zA-Z0-9_]*) *") {
|
if (Regexp::match(&mr, text, L"fish (%d+) ([a-zA-Z_][a-zA-Z0-9_]*) *") {
|
||||||
PRINT("Fish number: %S\n", mr.exp[0]);
|
PRINT("Fish number: %S\n", mr.exp[0]);
|
||||||
PRINT("Fish name: %S\n", mr.exp[1]);
|
PRINT("Fish name: %S\n", mr.exp[1]);
|
||||||
}
|
}
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
=
|
|
||||||
Note the |L| at the front of the regex itself: this is a wide string.
|
|
||||||
|
|
||||||
This tries to match the given |text| to see if it consists of the word fish,
|
Note the [[L]] at the front of the regex itself: this is a wide string.
|
||||||
|
|
||||||
|
This tries to match the given [[text]] to see if it consists of the word fish,
|
||||||
then any amount of whitespace, then a string of digits which are copied into
|
then any amount of whitespace, then a string of digits which are copied into
|
||||||
|mr->exp[0]|, then whitespace again, and then an alphanumeric identifier to be
|
[[mr->exp[0]]], then whitespace again, and then an alphanumeric identifier to be
|
||||||
copied into |mr->exp[1]|, and finally optional whitespace. (If no match is
|
copied into [[mr->exp[1]]], and finally optional whitespace. (If no match is
|
||||||
made, the contents of the found strings are undefined.)
|
made, the contents of the found strings are undefined.)
|
||||||
|
|
||||||
Note that this differs from, for example, Perl's regular expression matcher
|
Note that this differs from, for example, Perl's regular expression matcher
|
||||||
in several ways. The regular expression syntax is slightly different and in
|
in several ways. The regular expression syntax is slightly different and in
|
||||||
general simpler. A match has to be made from start to end, so it's as if there
|
general simpler. A match has to be made from start to end, so it's as if there
|
||||||
were an implicit |^| at the front and |$| at the back (in Perl terms). The
|
were an implicit [[^]] at the front and [[$]] at the back (in Perl terms). The
|
||||||
full match text is therefore always the entire text put in, so there's no
|
full match text is therefore always the entire text put in, so there's no
|
||||||
need to record this. In Perl, matching against |m/(.*) plus (.*)/| would
|
need to record this. In Perl, matching against [[m/(.*) plus (.*)/]] would
|
||||||
set three subexpressions: number 0 would be the whole text matched, number
|
set three subexpressions: number 0 would be the whole text matched, number
|
||||||
1 would be the first bracketed part, number 2 the second. Here, though, the
|
1 would be the first bracketed part, number 2 the second. Here, though, the
|
||||||
corresponding regex would be written |L"(%c*) plus (%c*)"|, and the bracketed
|
corresponding regex would be written [[L"(%c*) plus (%c*)"]], and the bracketed
|
||||||
terms would be subexpressions 0 and 1.
|
terms would be subexpressions 0 and 1.
|
||||||
|
|
||||||
@d MAX_BRACKETED_SUBEXPRESSIONS 5 /* this many bracketed subexpressions can be extracted */
|
<<*>>=
|
||||||
|
#define MAX_BRACKETED_SUBEXPRESSIONS 5 /* this many bracketed subexpressions can be extracted */
|
||||||
|
|
||||||
@ The internal state of the matcher is stored as follows:
|
@ The internal state of the matcher is stored as follows:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct match_position {
|
typedef struct match_position {
|
||||||
int tpos; /* position within text being matched */
|
int tpos; /* position within text being matched */
|
||||||
int ppos; /* position within pattern */
|
int ppos; /* position within pattern */
|
||||||
|
@ -120,9 +121,10 @@ typedef struct match_position {
|
||||||
@ It may appear that match texts are limited to 64 characters here, but they
|
@ It may appear that match texts are limited to 64 characters here, but they
|
||||||
are not. They are simply a little faster to access if short.
|
are not. They are simply a little faster to access if short.
|
||||||
|
|
||||||
@d MATCH_TEXT_INITIAL_ALLOCATION 64
|
<<*>>=
|
||||||
|
#define MATCH_TEXT_INITIAL_ALLOCATION 64
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct match_result {
|
typedef struct match_result {
|
||||||
wchar_t match_text_storage[MATCH_TEXT_INITIAL_ALLOCATION];
|
wchar_t match_text_storage[MATCH_TEXT_INITIAL_ALLOCATION];
|
||||||
struct text_stream match_text_struct;
|
struct text_stream match_text_struct;
|
||||||
|
@ -140,7 +142,7 @@ explicitly. Note that the storage required is on the C stack (unless some
|
||||||
result strings grow very large), so that it's very quick to allocate and
|
result strings grow very large), so that it's very quick to allocate and
|
||||||
deallocate.
|
deallocate.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
match_results Regexp::create_mr(void) {
|
match_results Regexp::create_mr(void) {
|
||||||
match_results mr;
|
match_results mr;
|
||||||
mr.no_matched_texts = 0;
|
mr.no_matched_texts = 0;
|
||||||
|
@ -164,7 +166,7 @@ void Regexp::dispose_of(match_results *mr) {
|
||||||
|
|
||||||
@ So, then: the matcher itself.
|
@ So, then: the matcher itself.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Regexp::match(match_results *mr, text_stream *text, wchar_t *pattern) {
|
int Regexp::match(match_results *mr, text_stream *text, wchar_t *pattern) {
|
||||||
if (mr) Regexp::prepare(mr);
|
if (mr) Regexp::prepare(mr);
|
||||||
int rv = (Regexp::match_r(mr, text, pattern, NULL, FALSE) >= 0)?TRUE:FALSE;
|
int rv = (Regexp::match_r(mr, text, pattern, NULL, FALSE) >= 0)?TRUE:FALSE;
|
||||||
|
@ -203,41 +205,41 @@ void Regexp::prepare(match_results *mr) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ =
|
<<*>>=
|
||||||
int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern,
|
int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern,
|
||||||
match_position *scan_from, int allow_partial) {
|
match_position *scan_from, int allow_partial) {
|
||||||
match_position at;
|
match_position at;
|
||||||
if (scan_from) at = *scan_from;
|
if (scan_from) at = *scan_from;
|
||||||
else { at.tpos = 0; at.ppos = 0; at.bc = 0; at.bl = 0; }
|
else { at.tpos = 0; at.ppos = 0; at.bc = 0; at.bl = 0; }
|
||||||
while ((Str::get_at(text, at.tpos)) || (pattern[at.ppos])) {
|
while ((Str::get_at(text, at.tpos)) [[]] (pattern[at.ppos])) {
|
||||||
if ((allow_partial) && (pattern[at.ppos] == 0)) break;
|
if ((allow_partial) && (pattern[at.ppos] == 0)) break;
|
||||||
@<Parentheses in the match pattern set up substrings to extract@>;
|
<<Parentheses in the match pattern set up substrings to extract>>;
|
||||||
|
|
||||||
int chcl, /* what class of characters to match: a |*_CHARCLASS| value */
|
int chcl, /* what class of characters to match: a [[*_CHARCLASS]] value */
|
||||||
range_from, range_to, /* for |LITERAL_CHARCLASS| only */
|
range_from, range_to, /* for [[LITERAL_CHARCLASS]] only */
|
||||||
reverse = FALSE; /* require a non-match rather than a match */
|
reverse = FALSE; /* require a non-match rather than a match */
|
||||||
@<Extract the character class to match from the pattern@>;
|
<<Extract the character class to match from the pattern>>;
|
||||||
|
|
||||||
int rep_from = 1, rep_to = 1; /* minimum and maximum number of repetitions */
|
int rep_from = 1, rep_to = 1; /* minimum and maximum number of repetitions */
|
||||||
int greedy = TRUE; /* go for a maximal-length match if possible */
|
int greedy = TRUE; /* go for a maximal-length match if possible */
|
||||||
@<Extract repetition markers from the pattern@>;
|
<<Extract repetition markers from the pattern>>;
|
||||||
|
|
||||||
int reps = 0;
|
int reps = 0;
|
||||||
@<Count how many repetitions can be made here@>;
|
<<Count how many repetitions can be made here>>;
|
||||||
if (reps < rep_from) return -1;
|
if (reps < rep_from) return -1;
|
||||||
|
|
||||||
/* we can now accept anything from |rep_from| to |reps| repetitions */
|
/* we can now accept anything from [[rep_from| to |reps]] repetitions */
|
||||||
if (rep_from == reps) { at.tpos += reps; continue; }
|
if (rep_from == reps) { at.tpos += reps; continue; }
|
||||||
@<Try all possible match lengths until we find a match@>;
|
<<Try all possible match lengths until we find a match>>;
|
||||||
|
|
||||||
/* no match length worked, so no match */
|
/* no match length worked, so no match */
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@<Copy the bracketed texts found into the global strings@>;
|
<<Copy the bracketed texts found into the global strings>>;
|
||||||
return at.tpos;
|
return at.tpos;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Parentheses in the match pattern set up substrings to extract@> =
|
<<Parentheses in the match pattern set up substrings to extract>>=
|
||||||
if (pattern[at.ppos] == '(') {
|
if (pattern[at.ppos] == '(') {
|
||||||
if (at.bl < MAX_BRACKETED_SUBEXPRESSIONS) at.bracket_nesting[at.bl] = -1;
|
if (at.bl < MAX_BRACKETED_SUBEXPRESSIONS) at.bracket_nesting[at.bl] = -1;
|
||||||
if (at.bc < MAX_BRACKETED_SUBEXPRESSIONS) {
|
if (at.bc < MAX_BRACKETED_SUBEXPRESSIONS) {
|
||||||
|
@ -255,7 +257,7 @@ int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Extract the character class to match from the pattern@> =
|
<<Extract the character class to match from the pattern>>=
|
||||||
if (pattern[at.ppos] == 0) return -1;
|
if (pattern[at.ppos] == 0) return -1;
|
||||||
int len = 0;
|
int len = 0;
|
||||||
chcl = Regexp::get_cclass(pattern, at.ppos, &len, &range_from, &range_to, &reverse);
|
chcl = Regexp::get_cclass(pattern, at.ppos, &len, &range_from, &range_to, &reverse);
|
||||||
|
@ -265,7 +267,7 @@ int Regexp::match_r(match_results *mr, text_stream *text, wchar_t *pattern,
|
||||||
@ This is standard regular-expression notation, except that I haven't bothered
|
@ This is standard regular-expression notation, except that I haven't bothered
|
||||||
to implement numeric repetition counts, which we won't need:
|
to implement numeric repetition counts, which we won't need:
|
||||||
|
|
||||||
@<Extract repetition markers from the pattern@> =
|
<<Extract repetition markers from the pattern>>=
|
||||||
if (chcl == WHITESPACE_CHARCLASS) {
|
if (chcl == WHITESPACE_CHARCLASS) {
|
||||||
rep_from = 1; rep_to = Str::len(text)-at.tpos;
|
rep_from = 1; rep_to = Str::len(text)-at.tpos;
|
||||||
}
|
}
|
||||||
|
@ -276,13 +278,13 @@ to implement numeric repetition counts, which we won't need:
|
||||||
}
|
}
|
||||||
if (pattern[at.ppos] == '?') { greedy = FALSE; at.ppos++; }
|
if (pattern[at.ppos] == '?') { greedy = FALSE; at.ppos++; }
|
||||||
|
|
||||||
@<Count how many repetitions can be made here@> =
|
<<Count how many repetitions can be made here>>=
|
||||||
for (reps = 0; ((Str::get_at(text, at.tpos+reps)) && (reps < rep_to)); reps++)
|
for (reps = 0; ((Str::get_at(text, at.tpos+reps)) && (reps < rep_to)); reps++)
|
||||||
if (Regexp::test_cclass(Str::get_at(text, at.tpos+reps), chcl,
|
if (Regexp::test_cclass(Str::get_at(text, at.tpos+reps), chcl,
|
||||||
range_from, range_to, pattern, reverse) == FALSE)
|
range_from, range_to, pattern, reverse) == FALSE)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@<Try all possible match lengths until we find a match@> =
|
<<Try all possible match lengths until we find a match>>=
|
||||||
int from = rep_from, to = reps, dj = 1, from_tpos = at.tpos;
|
int from = rep_from, to = reps, dj = 1, from_tpos = at.tpos;
|
||||||
if (greedy) { from = reps; to = rep_from; dj = -1; }
|
if (greedy) { from = reps; to = rep_from; dj = -1; }
|
||||||
for (int j = from; j != to+dj; j += dj) {
|
for (int j = from; j != to+dj; j += dj) {
|
||||||
|
@ -291,7 +293,7 @@ to implement numeric repetition counts, which we won't need:
|
||||||
if (try >= 0) return try;
|
if (try >= 0) return try;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Copy the bracketed texts found into the global strings@> =
|
<<Copy the bracketed texts found into the global strings>>=
|
||||||
if (mr) {
|
if (mr) {
|
||||||
for (int i=0; i<at.bc; i++) {
|
for (int i=0; i<at.bc; i++) {
|
||||||
Str::clear(mr->exp[i]);
|
Str::clear(mr->exp[i]);
|
||||||
|
@ -303,38 +305,39 @@ to implement numeric repetition counts, which we won't need:
|
||||||
}
|
}
|
||||||
|
|
||||||
@ So then: most characters in the pattern are taken literally (if the pattern
|
@ So then: most characters in the pattern are taken literally (if the pattern
|
||||||
says |q|, the only match is with a lower-case letter "q"), except that:
|
says [[q]], the only match is with a lower-case letter "q"), except that:
|
||||||
|
|
||||||
(a) a space means "one or more characters of white space";
|
(a) a space means "one or more characters of white space";
|
||||||
(b) |%d| means any decimal digit;
|
(b) [[%d]] means any decimal digit;
|
||||||
(c) |%c| means any character at all;
|
(c) [[%c]] means any character at all;
|
||||||
(d) |%C| means any character which isn't white space;
|
(d) [[%C]] means any character which isn't white space;
|
||||||
(e) |%i| means any character from the identifier class (see above);
|
(e) [[%i]] means any character from the identifier class (see above);
|
||||||
(f) |%p| means any character which can be used in the name of a Preform
|
(f) [[%p]] means any character which can be used in the name of a Preform
|
||||||
nonterminal, which is to say, an identifier character or a hyphen;
|
nonterminal, which is to say, an identifier character or a hyphen;
|
||||||
(g) |%P| means the same or else a colon;
|
(g) [[%P]] means the same or else a colon;
|
||||||
(h) |%t| means a tab;
|
(h) [[%t]] means a tab;
|
||||||
(i) |%q| means a double-quote.
|
(i) [[%q]] means a double-quote.
|
||||||
|
|
||||||
|%| otherwise makes a literal escape; a space means any whitespace character;
|
[[%]] otherwise makes a literal escape; a space means any whitespace character;
|
||||||
square brackets enclose literal alternatives, and note as usual with grep
|
square brackets enclose literal alternatives, and note as usual with grep
|
||||||
engines that |[]xyz]| is legal and makes a set of four possibilities, the
|
engines that [[[]xyz]]] is legal and makes a set of four possibilities, the
|
||||||
first of which is a literal close square; within a set, a hyphen makes a
|
first of which is a literal close square; within a set, a hyphen makes a
|
||||||
character range; an initial |^| negates the result; and otherwise everything
|
character range; an initial [[^]] negates the result; and otherwise everything
|
||||||
is literal.
|
is literal.
|
||||||
|
|
||||||
@d ANY_CHARCLASS 1
|
<<*>>=
|
||||||
@d DIGIT_CHARCLASS 2
|
#define ANY_CHARCLASS 1
|
||||||
@d WHITESPACE_CHARCLASS 3
|
#define DIGIT_CHARCLASS 2
|
||||||
@d NONWHITESPACE_CHARCLASS 4
|
#define WHITESPACE_CHARCLASS 3
|
||||||
@d IDENTIFIER_CHARCLASS 5
|
#define NONWHITESPACE_CHARCLASS 4
|
||||||
@d PREFORM_CHARCLASS 6
|
#define IDENTIFIER_CHARCLASS 5
|
||||||
@d PREFORMC_CHARCLASS 7
|
#define PREFORM_CHARCLASS 6
|
||||||
@d LITERAL_CHARCLASS 8
|
#define PREFORMC_CHARCLASS 7
|
||||||
@d TAB_CHARCLASS 9
|
#define LITERAL_CHARCLASS 8
|
||||||
@d QUOTE_CHARCLASS 10
|
#define TAB_CHARCLASS 9
|
||||||
|
#define QUOTE_CHARCLASS 10
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to, int *reverse) {
|
int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to, int *reverse) {
|
||||||
if (pattern[ppos] == '^') { ppos++; *reverse = TRUE; } else { *reverse = FALSE; }
|
if (pattern[ppos] == '^') { ppos++; *reverse = TRUE; } else { *reverse = FALSE; }
|
||||||
switch (pattern[ppos]) {
|
switch (pattern[ppos]) {
|
||||||
|
@ -364,7 +367,7 @@ int Regexp::get_cclass(wchar_t *pattern, int ppos, int *len, int *from, int *to,
|
||||||
*len = 1; *from = ppos; *to = ppos; return LITERAL_CHARCLASS;
|
*len = 1; *from = ppos; *to = ppos; return LITERAL_CHARCLASS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ =
|
<<*>>=
|
||||||
int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *drawn_from, int reverse) {
|
int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *drawn_from, int reverse) {
|
||||||
int match = FALSE;
|
int match = FALSE;
|
||||||
switch (chcl) {
|
switch (chcl) {
|
||||||
|
@ -375,11 +378,11 @@ int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *
|
||||||
case NONWHITESPACE_CHARCLASS: if (!(Characters::is_whitespace(c))) match = TRUE; break;
|
case NONWHITESPACE_CHARCLASS: if (!(Characters::is_whitespace(c))) match = TRUE; break;
|
||||||
case QUOTE_CHARCLASS: if (c != '\"') match = TRUE; break;
|
case QUOTE_CHARCLASS: if (c != '\"') match = TRUE; break;
|
||||||
case IDENTIFIER_CHARCLASS: if (Regexp::identifier_char(c)) match = TRUE; break;
|
case IDENTIFIER_CHARCLASS: if (Regexp::identifier_char(c)) match = TRUE; break;
|
||||||
case PREFORM_CHARCLASS: if ((c == '-') || (c == '_') ||
|
case PREFORM_CHARCLASS: if ((c == '-') [[| (c == '_') |]]
|
||||||
((c >= 'a') && (c <= 'z')) ||
|
((c >= 'a') && (c <= 'z')) [[]]
|
||||||
((c >= '0') && (c <= '9'))) match = TRUE; break;
|
((c >= '0') && (c <= '9'))) match = TRUE; break;
|
||||||
case PREFORMC_CHARCLASS: if ((c == '-') || (c == '_') || (c == ':') ||
|
case PREFORMC_CHARCLASS: if ((c == '-') [[| (c == '_') || (c == ':') |]]
|
||||||
((c >= 'a') && (c <= 'z')) ||
|
((c >= 'a') && (c <= 'z')) [[]]
|
||||||
((c >= '0') && (c <= '9'))) match = TRUE; break;
|
((c >= '0') && (c <= '9'))) match = TRUE; break;
|
||||||
case LITERAL_CHARCLASS:
|
case LITERAL_CHARCLASS:
|
||||||
if ((range_to > range_from) && (drawn_from[range_from] == '^')) {
|
if ((range_to > range_from) && (drawn_from[range_from] == '^')) {
|
||||||
|
@ -398,20 +401,21 @@ int Regexp::test_cclass(int c, int chcl, int range_from, int range_to, wchar_t *
|
||||||
return match;
|
return match;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Replacement.
|
@ \section{Replacement.}
|
||||||
And this routine conveniently handles searching and replacing. This time we
|
And this routine conveniently handles searching and replacing. This time we
|
||||||
can match at substrings of the |text| (i.e., we are not forced to match
|
can match at substrings of the [[text]] (i.e., we are not forced to match
|
||||||
from the start right to the end), and multiple replacements can be made.
|
from the start right to the end), and multiple replacements can be made.
|
||||||
For example,
|
For example,
|
||||||
= (text as code)
|
|
||||||
Regexp::replace(text, L"[aeiou]", L"!", REP_REPEATING);
|
Regexp::replace(text, L"[aeiou]", L"!", REP_REPEATING);
|
||||||
=
|
|
||||||
will turn the |text| "goose eggs" into "g!!s! !ggs".
|
|
||||||
|
|
||||||
@d REP_REPEATING 1
|
will turn the [[text]] "goose eggs" into "g!!s! !ggs".
|
||||||
@d REP_ATSTART 2
|
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
|
#define REP_REPEATING 1
|
||||||
|
#define REP_ATSTART 2
|
||||||
|
|
||||||
|
<<*>>=
|
||||||
int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, int options) {
|
int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, int options) {
|
||||||
TEMPORARY_TEXT(altered)
|
TEMPORARY_TEXT(altered)
|
||||||
match_results mr = Regexp::create_mr();
|
match_results mr = Regexp::create_mr();
|
||||||
|
@ -439,10 +443,10 @@ int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, i
|
||||||
changes++;
|
changes++;
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
L = Str::len(text); i = L-left-1;
|
L = Str::len(text); i = L-left-1;
|
||||||
if ((options & REP_REPEATING) == 0) { @<Add the rest@>; break; }
|
if ((options & REP_REPEATING) == 0) { <<Add the rest>>; break; }
|
||||||
continue;
|
continue;
|
||||||
} else PUT_TO(altered, Str::get_at(text, i));
|
} else PUT_TO(altered, Str::get_at(text, i));
|
||||||
if (options & REP_ATSTART) { @<Add the rest@>; break; }
|
if (options & REP_ATSTART) { <<Add the rest>>; break; }
|
||||||
}
|
}
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
if (changes > 0) Str::copy(text, altered);
|
if (changes > 0) Str::copy(text, altered);
|
||||||
|
@ -450,6 +454,6 @@ int Regexp::replace(text_stream *text, wchar_t *pattern, wchar_t *replacement, i
|
||||||
return changes;
|
return changes;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Add the rest@> =
|
<<Add the rest>>=
|
||||||
for (i++; i<L; i++)
|
for (i++; i<L; i++)
|
||||||
PUT_TO(altered, Str::get_at(text, i));
|
PUT_TO(altered, Str::get_at(text, i));
|
|
@ -3,28 +3,29 @@
|
||||||
A simple, general-purpose preprocessor for text files, expanding macros and
|
A simple, general-purpose preprocessor for text files, expanding macros and
|
||||||
performing repetitions.
|
performing repetitions.
|
||||||
|
|
||||||
@h Scanner.
|
@ \section{Scanner.}
|
||||||
Writing a general-purpose preprocessor really is coding like it's 1974, but
|
Writing a general-purpose preprocessor really is coding like it's 1974, but
|
||||||
it turns out to be useful for multiple applications in the Inform project, and
|
it turns out to be useful for multiple applications in the Inform project, and
|
||||||
saves us having to have dependencies on behemoths like the mighty |m4|.
|
saves us having to have dependencies on behemoths like the mighty [[m4]].
|
||||||
|
|
||||||
For documentation on the markup notation, see //inweb: Webs, Tangling and Weaving//.
|
For documentation on the markup notation, see //inweb: Webs, Tangling and Weaving//.
|
||||||
|
|
||||||
To use the preprocessor, call:
|
To use the preprocessor, call:
|
||||||
= (text as InC)
|
|
||||||
Preprocessor::preprocess(from, to, header, special_macros, specifics, encoding)
|
Preprocessor::preprocess(from, to, header, special_macros, specifics, encoding)
|
||||||
=
|
|
||||||
where |from| and |to| are filenames, |header| is text to place at the top of
|
|
||||||
the file (if any), |special_macros| is a |linked_list| of |preprocessor_macro|s
|
|
||||||
set up with special meanings to the situation, and |specifics| is a general
|
|
||||||
pointer to any data those special meanings need to use. |encoding| should be
|
|
||||||
one of |UTF8_ENC| or |ISO_ENC|.
|
|
||||||
|
|
||||||
@d PROTECTED_OPEN_BRACE_PPCHAR 0x25A0
|
where [[from]] and [[to]] are filenames, [[header]] is text to place at the top of
|
||||||
@d PROTECTED_CLOSE_BRACE_PPCHAR 0x25A1
|
the file (if any), [[special_macros]] is a [[linked_list]] of [[preprocessor_macro]]s
|
||||||
@d PROTECTED_BLANK_PPCHAR 0x25A2
|
set up with special meanings to the situation, and [[specifics]] is a general
|
||||||
|
pointer to any data those special meanings need to use. [[encoding]] should be
|
||||||
|
one of [[UTF8_ENC]] or [[ISO_ENC]].
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
|
#define PROTECTED_OPEN_BRACE_PPCHAR 0x25A0
|
||||||
|
#define PROTECTED_CLOSE_BRACE_PPCHAR 0x25A1
|
||||||
|
#define PROTECTED_BLANK_PPCHAR 0x25A2
|
||||||
|
|
||||||
|
<<*>>=
|
||||||
void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *header,
|
void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *header,
|
||||||
linked_list *special_macros, general_pointer specifics, wchar_t comment_char,
|
linked_list *special_macros, general_pointer specifics, wchar_t comment_char,
|
||||||
int encoding) {
|
int encoding) {
|
||||||
|
@ -35,7 +36,7 @@ void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *hea
|
||||||
WRITE("%S", header);
|
WRITE("%S", header);
|
||||||
|
|
||||||
preprocessor_state PPS;
|
preprocessor_state PPS;
|
||||||
@<Initialise the preprocessor state@>;
|
<<Initialise the preprocessor state>>;
|
||||||
TextFiles::read(prototype, FALSE, "can't open prototype file",
|
TextFiles::read(prototype, FALSE, "can't open prototype file",
|
||||||
TRUE, Preprocessor::scan_line, NULL, &PPS);
|
TRUE, Preprocessor::scan_line, NULL, &PPS);
|
||||||
for (int i=0; i<Str::len(PPS.dest); i++) {
|
for (int i=0; i<Str::len(PPS.dest); i++) {
|
||||||
|
@ -50,9 +51,10 @@ void Preprocessor::preprocess(filename *prototype, filename *F, text_stream *hea
|
||||||
@ The following imposing-looking set of state data is used as we work through
|
@ The following imposing-looking set of state data is used as we work through
|
||||||
the prototype file line-by-line:
|
the prototype file line-by-line:
|
||||||
|
|
||||||
@d MAX_PREPROCESSOR_LOOP_DEPTH 8
|
<<*>>=
|
||||||
|
#define MAX_PREPROCESSOR_LOOP_DEPTH 8
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct preprocessor_state {
|
typedef struct preprocessor_state {
|
||||||
struct text_stream *dest;
|
struct text_stream *dest;
|
||||||
struct preprocessor_macro *defining; /* a "define" body being scanned */
|
struct preprocessor_macro *defining; /* a "define" body being scanned */
|
||||||
|
@ -63,19 +65,19 @@ typedef struct preprocessor_state {
|
||||||
int last_line_was_blank; /* used to suppress runs of multiple blank lines */
|
int last_line_was_blank; /* used to suppress runs of multiple blank lines */
|
||||||
struct preprocessor_variable_set *global_variables;
|
struct preprocessor_variable_set *global_variables;
|
||||||
struct preprocessor_variable_set *stack_frame;
|
struct preprocessor_variable_set *stack_frame;
|
||||||
struct linked_list *known_macros; /* of |preprocessor_macro| */
|
struct linked_list *known_macros; /* of [[preprocessor_macro]] */
|
||||||
struct general_pointer specifics;
|
struct general_pointer specifics;
|
||||||
wchar_t comment_character;
|
wchar_t comment_character;
|
||||||
} preprocessor_state;
|
} preprocessor_state;
|
||||||
|
|
||||||
typedef struct preprocessor_loop {
|
typedef struct preprocessor_loop {
|
||||||
struct text_stream *loop_var_name;
|
struct text_stream *loop_var_name;
|
||||||
struct linked_list *iterations; /* of |text_stream| */
|
struct linked_list *iterations; /* of [[text_stream]] */
|
||||||
int repeat_is_block;
|
int repeat_is_block;
|
||||||
struct text_stream *repeat_saved_dest;
|
struct text_stream *repeat_saved_dest;
|
||||||
} preprocessor_loop;
|
} preprocessor_loop;
|
||||||
|
|
||||||
@<Initialise the preprocessor state@> =
|
<<Initialise the preprocessor state>>=
|
||||||
PPS.dest = Str::new();
|
PPS.dest = Str::new();
|
||||||
PPS.suppress_newline = FALSE;
|
PPS.suppress_newline = FALSE;
|
||||||
PPS.last_line_was_blank = TRUE;
|
PPS.last_line_was_blank = TRUE;
|
||||||
|
@ -91,7 +93,7 @@ typedef struct preprocessor_loop {
|
||||||
@ Conceptually, each loop runs a variable with a given name through a series
|
@ Conceptually, each loop runs a variable with a given name through a series
|
||||||
of textual values in sequence, and we store that data here:
|
of textual values in sequence, and we store that data here:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::set_loop_var_name(preprocessor_loop *loop, text_stream *name) {
|
void Preprocessor::set_loop_var_name(preprocessor_loop *loop, text_stream *name) {
|
||||||
loop->loop_var_name = Str::duplicate(name);
|
loop->loop_var_name = Str::duplicate(name);
|
||||||
}
|
}
|
||||||
|
@ -102,31 +104,31 @@ void Preprocessor::add_loop_iteration(preprocessor_loop *loop, text_stream *valu
|
||||||
@ Lines from the prototype (or sometimes from files spliced in) are read, one
|
@ Lines from the prototype (or sometimes from files spliced in) are read, one
|
||||||
at a time, by the following.
|
at a time, by the following.
|
||||||
|
|
||||||
Note that |define| and |end-define| are not themselves macros, and are handled
|
Note that [[define]] and [[end-define]] are not themselves macros, and are handled
|
||||||
directly here. So you cannot use repeat loops to define multiple macros with
|
directly here. So you cannot use repeat loops to define multiple macros with
|
||||||
parametrised names: but then, nor should you.
|
parametrised names: but then, nor should you.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::scan_line(text_stream *line, text_file_position *tfp, void *X) {
|
void Preprocessor::scan_line(text_stream *line, text_file_position *tfp, void *X) {
|
||||||
preprocessor_state *PPS = (preprocessor_state *) X;
|
preprocessor_state *PPS = (preprocessor_state *) X;
|
||||||
@<Skip comments@>;
|
<<Skip comments>>;
|
||||||
@<Make backslash literals safe@>;
|
<<Make backslash literals safe>>;
|
||||||
@<Deal with textual definitions of new macros@>;
|
<<Deal with textual definitions of new macros>>;
|
||||||
Preprocessor::expand(line, tfp, PPS);
|
Preprocessor::expand(line, tfp, PPS);
|
||||||
@<Sometimes, but only sometimes, output a newline@>;
|
<<Sometimes, but only sometimes, output a newline>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ A line is a comment to the preprocessor if its first non-whitespace character
|
@ A line is a comment to the preprocessor if its first non-whitespace character
|
||||||
is the special comment character: often |#|, but not necessarily.
|
is the special comment character: often [[#]], but not necessarily.
|
||||||
|
|
||||||
@<Skip comments@> =
|
<<Skip comments>>=
|
||||||
LOOP_THROUGH_TEXT(pos, line) {
|
LOOP_THROUGH_TEXT(pos, line) {
|
||||||
wchar_t c = Str::get(pos);
|
wchar_t c = Str::get(pos);
|
||||||
if (c == PPS->comment_character) return;
|
if (c == PPS->comment_character) return;
|
||||||
if (Characters::is_whitespace(c) == FALSE) break;
|
if (Characters::is_whitespace(c) == FALSE) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Make backslash literals safe@> =
|
<<Make backslash literals safe>>=
|
||||||
for (int i = 0; i < Str::len(line); i++) {
|
for (int i = 0; i < Str::len(line); i++) {
|
||||||
wchar_t c = Str::get_at(line, i);
|
wchar_t c = Str::get_at(line, i);
|
||||||
if (c == '\\') {
|
if (c == '\\') {
|
||||||
|
@ -151,15 +153,15 @@ is the special comment character: often |#|, but not necessarily.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Deal with textual definitions of new macros@> =
|
<<Deal with textual definitions of new macros>>=
|
||||||
match_results mr = Regexp::create_mr();
|
match_results mr = Regexp::create_mr();
|
||||||
if (Regexp::match(&mr, line, L" *{define: *(%C+) *} *")) @<Begin a bare definition@>;
|
if (Regexp::match(&mr, line, L" *{define: *(%C+) *} *")) <<Begin a bare definition>>;
|
||||||
if (Regexp::match(&mr, line, L" *{define: *(%C+) (%c*)} *")) @<Begin a definition@>;
|
if (Regexp::match(&mr, line, L" *{define: *(%C+) (%c*)} *")) <<Begin a definition>>;
|
||||||
if (Regexp::match(&mr, line, L" *{end-define} *")) @<End a definition@>;
|
if (Regexp::match(&mr, line, L" *{end-define} *")) <<End a definition>>;
|
||||||
if (PPS->defining) @<Continue a definition@>;
|
if (PPS->defining) <<Continue a definition>>;
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
|
|
||||||
@<Begin a bare definition@> =
|
<<Begin a bare definition>>=
|
||||||
if (PPS->defining)
|
if (PPS->defining)
|
||||||
Errors::in_text_file("nested definitions are not allowed", tfp);
|
Errors::in_text_file("nested definitions are not allowed", tfp);
|
||||||
text_stream *name = mr.exp[0];
|
text_stream *name = mr.exp[0];
|
||||||
|
@ -169,7 +171,7 @@ is the special comment character: often |#|, but not necessarily.
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@<Begin a definition@> =
|
<<Begin a definition>>=
|
||||||
if (PPS->defining)
|
if (PPS->defining)
|
||||||
Errors::in_text_file("nested definitions are not allowed", tfp);
|
Errors::in_text_file("nested definitions are not allowed", tfp);
|
||||||
text_stream *name = mr.exp[0];
|
text_stream *name = mr.exp[0];
|
||||||
|
@ -179,19 +181,19 @@ is the special comment character: often |#|, but not necessarily.
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@<Continue a definition@> =
|
<<Continue a definition>>=
|
||||||
Preprocessor::add_line_to_macro(PPS->defining, line, tfp);
|
Preprocessor::add_line_to_macro(PPS->defining, line, tfp);
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@<End a definition@> =
|
<<End a definition>>=
|
||||||
if (PPS->defining == NULL)
|
if (PPS->defining == NULL)
|
||||||
Errors::in_text_file("{end-define} without {define: ...}", tfp);
|
Errors::in_text_file("{end-define} without {define: ...}", tfp);
|
||||||
PPS->defining = NULL;
|
PPS->defining = NULL;
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@<Sometimes, but only sometimes, output a newline@> =
|
<<Sometimes, but only sometimes, output a newline>>=
|
||||||
if (PPS->suppress_newline == FALSE) {
|
if (PPS->suppress_newline == FALSE) {
|
||||||
text_stream *OUT = PPS->dest;
|
text_stream *OUT = PPS->dest;
|
||||||
if (Str::len(line) == 0) {
|
if (Str::len(line) == 0) {
|
||||||
|
@ -208,13 +210,13 @@ is the special comment character: often |#|, but not necessarily.
|
||||||
|
|
||||||
(i) Does not contain any newlines;
|
(i) Does not contain any newlines;
|
||||||
|
|
||||||
(ii) Contains braces |{ ... }| used in nested pairs (unless there is a syntax
|
(ii) Contains braces [[{ ... }]] used in nested pairs (unless there is a syntax
|
||||||
error in the prototype, in which case we must complain).
|
error in the prototype, in which case we must complain).
|
||||||
|
|
||||||
The idea is to pass everything straight through except any braced matter,
|
The idea is to pass everything straight through except any braced matter,
|
||||||
which needs special attention.
|
which needs special attention.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocessor_state *PPS) {
|
void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocessor_state *PPS) {
|
||||||
TEMPORARY_TEXT(before_matter)
|
TEMPORARY_TEXT(before_matter)
|
||||||
TEMPORARY_TEXT(braced_matter)
|
TEMPORARY_TEXT(braced_matter)
|
||||||
|
@ -238,7 +240,7 @@ void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocess
|
||||||
}
|
}
|
||||||
if (bl > 0) Errors::in_text_file("too many '{'s", tfp);
|
if (bl > 0) Errors::in_text_file("too many '{'s", tfp);
|
||||||
if (after_times) {
|
if (after_times) {
|
||||||
@<Expand braced matter@>;
|
<<Expand braced matter>>;
|
||||||
} else {
|
} else {
|
||||||
WRITE_TO(PPS->dest, "%S", text);
|
WRITE_TO(PPS->dest, "%S", text);
|
||||||
}
|
}
|
||||||
|
@ -247,13 +249,13 @@ void Preprocessor::expand(text_stream *text, text_file_position *tfp, preprocess
|
||||||
DISCARD_TEXT(after_matter)
|
DISCARD_TEXT(after_matter)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Suppose we are expanding the text |this {ADJECTIVE} ocean {BEHAVIOUR}|: then
|
@ Suppose we are expanding the text [[this {ADJECTIVE} ocean {BEHAVIOUR}]]: then
|
||||||
the |before_matter| will be |this |, the |braced_matter| will be |ADJECTIVE|,
|
the [[before_matter]] will be [[this ]], the [[braced_matter]] will be [[ADJECTIVE]],
|
||||||
and the |after_matter| will be | ocean {BEHAVIOUR}|.
|
and the [[after_matter]] will be [[ ocean {BEHAVIOUR}]].
|
||||||
|
|
||||||
@<Expand braced matter@> =
|
<<Expand braced matter>>=
|
||||||
if (Preprocessor::acceptable_variable_name(braced_matter)) {
|
if (Preprocessor::acceptable_variable_name(braced_matter)) {
|
||||||
@<Expand a variable name@>;
|
<<Expand a variable name>>;
|
||||||
} else {
|
} else {
|
||||||
text_stream *identifier = braced_matter;
|
text_stream *identifier = braced_matter;
|
||||||
text_stream *parameter_settings = NULL;
|
text_stream *parameter_settings = NULL;
|
||||||
|
@ -262,7 +264,7 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|.
|
||||||
identifier = mr.exp[0];
|
identifier = mr.exp[0];
|
||||||
parameter_settings = mr.exp[1];
|
parameter_settings = mr.exp[1];
|
||||||
}
|
}
|
||||||
@<Work out which macro identifier is meant by a loop name@>;
|
<<Work out which macro identifier is meant by a loop name>>;
|
||||||
|
|
||||||
preprocessor_macro *mm = Preprocessor::find_macro(PPS->known_macros, identifier);
|
preprocessor_macro *mm = Preprocessor::find_macro(PPS->known_macros, identifier);
|
||||||
if (mm == NULL) {
|
if (mm == NULL) {
|
||||||
|
@ -271,15 +273,15 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|.
|
||||||
Errors::in_text_file_S(erm, tfp);
|
Errors::in_text_file_S(erm, tfp);
|
||||||
DISCARD_TEXT(erm)
|
DISCARD_TEXT(erm)
|
||||||
} else {
|
} else {
|
||||||
@<Expand a macro@>;
|
<<Expand a macro>>;
|
||||||
}
|
}
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ So, for example, the identifier |repeat| would be changed here either to
|
@ So, for example, the identifier [[repeat]] would be changed here either to
|
||||||
|repeat-block| or |repeat-span|: see above for an explanation.
|
[[repeat-block]] or [[repeat-span]]: see above for an explanation.
|
||||||
|
|
||||||
@<Work out which macro identifier is meant by a loop name@> =
|
<<Work out which macro identifier is meant by a loop name>>=
|
||||||
preprocessor_macro *loop_mm;
|
preprocessor_macro *loop_mm;
|
||||||
LOOP_OVER_LINKED_LIST(loop_mm, preprocessor_macro, PPS->known_macros)
|
LOOP_OVER_LINKED_LIST(loop_mm, preprocessor_macro, PPS->known_macros)
|
||||||
if (Str::len(loop_mm->loop_name) > 0) {
|
if (Str::len(loop_mm->loop_name) > 0) {
|
||||||
|
@ -308,27 +310,27 @@ and the |after_matter| will be | ocean {BEHAVIOUR}|.
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Note that if we are inside a loop, we do not perform expansion on the variable
|
@ Note that if we are inside a loop, we do not perform expansion on the variable
|
||||||
name, and instead pass it through unchanged -- still as, say, |{NAME}|. This
|
name, and instead pass it through unchanged -- still as, say, [[{NAME}]]. This
|
||||||
is because it won't be expanded until later, when the expander reaches the
|
is because it won't be expanded until later, when the expander reaches the
|
||||||
end of the loop body.
|
end of the loop body.
|
||||||
|
|
||||||
@<Expand a variable name@> =
|
<<Expand a variable name>>=
|
||||||
Preprocessor::expand(before_matter, tfp, PPS);
|
Preprocessor::expand(before_matter, tfp, PPS);
|
||||||
if (PPS->repeat_sp > 0) {
|
if (PPS->repeat_sp > 0) {
|
||||||
WRITE_TO(PPS->dest, "{%S}", braced_matter);
|
WRITE_TO(PPS->dest, "{%S}", braced_matter);
|
||||||
} else {
|
} else {
|
||||||
@<Definitely expand a variable name@>;
|
<<Definitely expand a variable name>>;
|
||||||
}
|
}
|
||||||
Preprocessor::expand(after_matter, tfp, PPS);
|
Preprocessor::expand(after_matter, tfp, PPS);
|
||||||
|
|
||||||
@ Similarly, we don't expand macros inside the body of a loop, except that we
|
@ Similarly, we don't expand macros inside the body of a loop, except that we
|
||||||
need to expand the |{end-repeat-block}| (or similar) which closes that loop
|
need to expand the [[{end-repeat-block}]] (or similar) which closes that loop
|
||||||
body, so that we can escape back into normal mode. Because loop constructs
|
body, so that we can escape back into normal mode. Because loop constructs
|
||||||
may be nested, we need to react to (but not expand) loop openings, too.
|
may be nested, we need to react to (but not expand) loop openings, too.
|
||||||
The "shadow stack pointer" shows how deep we are inside these shadowy,
|
The "shadow stack pointer" shows how deep we are inside these shadowy,
|
||||||
not-yet-acted-on, loops.
|
not-yet-acted-on, loops.
|
||||||
|
|
||||||
@<Expand a macro@> =
|
<<Expand a macro>>=
|
||||||
if (mm->suppress_whitespace_when_expanding) {
|
if (mm->suppress_whitespace_when_expanding) {
|
||||||
while (Characters::is_whitespace(Str::get_last_char(before_matter)))
|
while (Characters::is_whitespace(Str::get_last_char(before_matter)))
|
||||||
Str::delete_last_character(before_matter);
|
Str::delete_last_character(before_matter);
|
||||||
|
@ -348,16 +350,16 @@ not-yet-acted-on, loops.
|
||||||
if ((divert_if_repeating) && (PPS->repeat_sp > 0)) {
|
if ((divert_if_repeating) && (PPS->repeat_sp > 0)) {
|
||||||
WRITE_TO(PPS->dest, "{%S}", braced_matter);
|
WRITE_TO(PPS->dest, "{%S}", braced_matter);
|
||||||
} else {
|
} else {
|
||||||
@<Definitely expand a macro@>;
|
<<Definitely expand a macro>>;
|
||||||
if (mm->suppress_newline_after_expanding) PPS->suppress_newline = TRUE;
|
if (mm->suppress_newline_after_expanding) PPS->suppress_newline = TRUE;
|
||||||
}
|
}
|
||||||
Preprocessor::expand(after_matter, tfp, PPS);
|
Preprocessor::expand(after_matter, tfp, PPS);
|
||||||
|
|
||||||
@ We can now forget about the |before_matter|, the |after_matter|, or whether
|
@ We can now forget about the [[before_matter]], the [[after_matter]], or whether
|
||||||
we ought not to expand after all: that's all taken care of. A variable expands
|
we ought not to expand after all: that's all taken care of. A variable expands
|
||||||
to its value:
|
to its value:
|
||||||
|
|
||||||
@<Definitely expand a variable name@> =
|
<<Definitely expand a variable name>>=
|
||||||
preprocessor_variable *var =
|
preprocessor_variable *var =
|
||||||
Preprocessor::find_variable(braced_matter, PPS->stack_frame);
|
Preprocessor::find_variable(braced_matter, PPS->stack_frame);
|
||||||
if (var) {
|
if (var) {
|
||||||
|
@ -370,24 +372,24 @@ to its value:
|
||||||
}
|
}
|
||||||
|
|
||||||
@ This looks fussy, but really it delegates the work by calling a function
|
@ This looks fussy, but really it delegates the work by calling a function
|
||||||
attached to the macro, the |expander|.
|
attached to the macro, the [[expander]].
|
||||||
|
|
||||||
@<Definitely expand a macro@> =
|
<<Definitely expand a macro>>=
|
||||||
text_stream *parameter_values[MAX_PP_MACRO_PARAMETERS];
|
text_stream *parameter_values[MAX_PP_MACRO_PARAMETERS];
|
||||||
for (int i=0; i<MAX_PP_MACRO_PARAMETERS; i++) parameter_values[i] = NULL;
|
for (int i=0; i<MAX_PP_MACRO_PARAMETERS; i++) parameter_values[i] = NULL;
|
||||||
@<Parse the parameters supplied@>;
|
<<Parse the parameters supplied>>;
|
||||||
@<Check that all compulsory parameters have been supplied@>;
|
<<Check that all compulsory parameters have been supplied>>;
|
||||||
|
|
||||||
preprocessor_loop *loop = NULL;
|
preprocessor_loop *loop = NULL;
|
||||||
if (mm->begins_loop) @<Initialise repetition data for the loop@>;
|
if (mm->begins_loop) <<Initialise repetition data for the loop>>;
|
||||||
|
|
||||||
(*(mm->expander))(mm, PPS, parameter_values, loop, tfp);
|
(*(mm->expander))(mm, PPS, parameter_values, loop, tfp);
|
||||||
|
|
||||||
@ Note that textual values of the parameters are themselves expanded before
|
@ Note that textual values of the parameters are themselves expanded before
|
||||||
use: they might contain variables, or even macros. Parameter names are not.
|
use: they might contain variables, or even macros. Parameter names are not.
|
||||||
So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|.
|
So you can have [[in: {WHATEVER}]] but not [[{WHATEVER}: this]].
|
||||||
|
|
||||||
@<Parse the parameters supplied@> =
|
<<Parse the parameters supplied>>=
|
||||||
match_results mr = Regexp::create_mr();
|
match_results mr = Regexp::create_mr();
|
||||||
while (Regexp::match(&mr, parameter_settings, L" *(%C+): *(%c*)")) {
|
while (Regexp::match(&mr, parameter_settings, L" *(%C+): *(%c*)")) {
|
||||||
text_stream *setting = mr.exp[0];
|
text_stream *setting = mr.exp[0];
|
||||||
|
@ -422,7 +424,7 @@ So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|.
|
||||||
if (Str::is_whitespace(parameter_settings) == FALSE)
|
if (Str::is_whitespace(parameter_settings) == FALSE)
|
||||||
Errors::in_text_file("parameter list is malformed", tfp);
|
Errors::in_text_file("parameter list is malformed", tfp);
|
||||||
|
|
||||||
@<Check that all compulsory parameters have been supplied@> =
|
<<Check that all compulsory parameters have been supplied>>=
|
||||||
for (int i=0; i<mm->no_parameters; i++)
|
for (int i=0; i<mm->no_parameters; i++)
|
||||||
if (parameter_values[i] == NULL)
|
if (parameter_values[i] == NULL)
|
||||||
if (mm->parameters[i]->optional == FALSE) {
|
if (mm->parameters[i]->optional == FALSE) {
|
||||||
|
@ -432,11 +434,11 @@ So you can have |in: {WHATEVER}| but not |{WHATEVER}: this|.
|
||||||
DISCARD_TEXT(erm)
|
DISCARD_TEXT(erm)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ The following code is a little misleading. At present, |PPS->repeat_sp| is
|
@ The following code is a little misleading. At present, [[PPS->repeat_sp]] is
|
||||||
always either 0 or 1, no matter how deep loop nesting is: but that's just an
|
always either 0 or 1, no matter how deep loop nesting is: but that's just an
|
||||||
artefact of the current scanning algorithm, which might some day change.
|
artefact of the current scanning algorithm, which might some day change.
|
||||||
|
|
||||||
@<Initialise repetition data for the loop@> =
|
<<Initialise repetition data for the loop>>=
|
||||||
if (PPS->repeat_sp >= MAX_PREPROCESSOR_LOOP_DEPTH) {
|
if (PPS->repeat_sp >= MAX_PREPROCESSOR_LOOP_DEPTH) {
|
||||||
Errors::in_text_file("repetition too deep", tfp);
|
Errors::in_text_file("repetition too deep", tfp);
|
||||||
} else {
|
} else {
|
||||||
|
@ -450,10 +452,10 @@ artefact of the current scanning algorithm, which might some day change.
|
||||||
PPS->dest = Str::new();
|
PPS->dest = Str::new();
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Variables.
|
@ \section{Variables.}
|
||||||
Names of variables should conform to:
|
Names of variables should conform to:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Preprocessor::acceptable_variable_name(text_stream *name) {
|
int Preprocessor::acceptable_variable_name(text_stream *name) {
|
||||||
LOOP_THROUGH_TEXT(pos, name) {
|
LOOP_THROUGH_TEXT(pos, name) {
|
||||||
wchar_t c = Str::get(pos);
|
wchar_t c = Str::get(pos);
|
||||||
|
@ -467,7 +469,7 @@ int Preprocessor::acceptable_variable_name(text_stream *name) {
|
||||||
|
|
||||||
@ Variables are all textual:
|
@ Variables are all textual:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct preprocessor_variable {
|
typedef struct preprocessor_variable {
|
||||||
struct text_stream *name;
|
struct text_stream *name;
|
||||||
struct text_stream *value;
|
struct text_stream *value;
|
||||||
|
@ -483,13 +485,13 @@ void Preprocessor::write_variable(preprocessor_variable *var, text_stream *val)
|
||||||
var->value = Str::duplicate(val);
|
var->value = Str::duplicate(val);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Each variable belongs to a single "set". If |EXAMPLE| has one meaning outside a
|
@ Each variable belongs to a single "set". If [[EXAMPLE]] has one meaning outside a
|
||||||
definition and another insider, that's two variables with a common name, not
|
definition and another insider, that's two variables with a common name, not
|
||||||
one variable belonging to two sets at once.
|
one variable belonging to two sets at once.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct preprocessor_variable_set {
|
typedef struct preprocessor_variable_set {
|
||||||
struct linked_list *variables; /* of |preprocessor_variable| */
|
struct linked_list *variables; /* of [[preprocessor_variable]] */
|
||||||
struct preprocessor_variable_set *outer;
|
struct preprocessor_variable_set *outer;
|
||||||
CLASS_DEFINITION
|
CLASS_DEFINITION
|
||||||
} preprocessor_variable_set;
|
} preprocessor_variable_set;
|
||||||
|
@ -524,7 +526,7 @@ preprocessor_variable *Preprocessor::find_variable(text_stream *name,
|
||||||
@ This creates a variable if it doesn't already exist in the given set. (If
|
@ This creates a variable if it doesn't already exist in the given set. (If
|
||||||
it exists in some outer set, that doesn't count.)
|
it exists in some outer set, that doesn't count.)
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
preprocessor_variable *Preprocessor::ensure_variable(text_stream *name,
|
preprocessor_variable *Preprocessor::ensure_variable(text_stream *name,
|
||||||
preprocessor_variable_set *in_set) {
|
preprocessor_variable_set *in_set) {
|
||||||
if (in_set == NULL) internal_error("variable without set");
|
if (in_set == NULL) internal_error("variable without set");
|
||||||
|
@ -538,29 +540,30 @@ preprocessor_variable *Preprocessor::ensure_variable(text_stream *name,
|
||||||
return var;
|
return var;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Macros.
|
@ \section{Macros.}
|
||||||
For the most part, each macro seen by users corresponds to a single
|
For the most part, each macro seen by users corresponds to a single
|
||||||
//preprocessor_macro//, but loop constructs are an exception. When the user
|
//preprocessor_macro//, but loop constructs are an exception. When the user
|
||||||
types |{repeat ...}|, this is a reference to |repeat-block| if the body of
|
types [[{repeat ...}]], this is a reference to [[repeat-block]] if the body of
|
||||||
what to repeat occupies multiple lines, but to |repeat-span| if only one.
|
what to repeat occupies multiple lines, but to [[repeat-span]] if only one.
|
||||||
|
|
||||||
|
For example, the first [[repeat]] loop here uses the macros [[repeat-block]] and
|
||||||
|
[[end-repeat-block]], and the second uses [[repeat-span]] and [[end-repeat-span]].
|
||||||
|
|
||||||
For example, the first |repeat| loop here uses the macros |repeat-block| and
|
|
||||||
|end-repeat-block|, and the second uses |repeat-span| and |end-repeat-span|.
|
|
||||||
= (text)
|
|
||||||
{repeat with SEA in Black, Caspian}
|
{repeat with SEA in Black, Caspian}
|
||||||
Welcome to the SEA Sea.
|
Welcome to the SEA Sea.
|
||||||
{end-repeat}
|
{end-repeat}
|
||||||
...
|
...
|
||||||
Seas available:{repeat with SEA in Sargasso, Libyan} {SEA} Sea;{end-repeat}
|
Seas available:{repeat with SEA in Sargasso, Libyan} {SEA} Sea;{end-repeat}
|
||||||
=
|
|
||||||
|
|
||||||
@ There are (for now, anyway) hard but harmlessly large limits on the number of
|
@ There are (for now, anyway) hard but harmlessly large limits on the number of
|
||||||
parameters and the length of a macro:
|
parameters and the length of a macro:
|
||||||
|
|
||||||
@d MAX_PP_MACRO_PARAMETERS 8
|
<<*>>=
|
||||||
@d MAX_PP_MACRO_LINES 128
|
#define MAX_PP_MACRO_PARAMETERS 8
|
||||||
|
#define MAX_PP_MACRO_LINES 128
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct preprocessor_macro {
|
typedef struct preprocessor_macro {
|
||||||
/* syntax */
|
/* syntax */
|
||||||
struct text_stream *identifier;
|
struct text_stream *identifier;
|
||||||
|
@ -573,10 +576,10 @@ typedef struct preprocessor_macro {
|
||||||
void (*expander)(struct preprocessor_macro *, struct preprocessor_state *, struct text_stream **, struct preprocessor_loop *, struct text_file_position *);
|
void (*expander)(struct preprocessor_macro *, struct preprocessor_state *, struct text_stream **, struct preprocessor_loop *, struct text_file_position *);
|
||||||
|
|
||||||
/* loop construct if any */
|
/* loop construct if any */
|
||||||
int begins_loop; /* |TRUE| for e.g. |repeat-block| or |repeat-span| */
|
int begins_loop; /* [[TRUE| for e.g. |repeat-block| or |repeat-span]] */
|
||||||
int ends_loop; /* |TRUE| for e.g. |end-repeat-block| */
|
int ends_loop; /* [[TRUE| for e.g. |end-repeat-block]] */
|
||||||
struct text_stream *loop_name; /* e.g. |repeat| */
|
struct text_stream *loop_name; /* e.g. [[repeat]] */
|
||||||
int span; /* |TRUE| for e.g. |end-repeat-span| or |repeat-span| */
|
int span; /* [[TRUE| for e.g. |end-repeat-span| or |repeat-span]] */
|
||||||
|
|
||||||
/* textual behaviour */
|
/* textual behaviour */
|
||||||
int suppress_newline_after_expanding;
|
int suppress_newline_after_expanding;
|
||||||
|
@ -592,16 +595,16 @@ typedef struct preprocessor_macro_parameter {
|
||||||
CLASS_DEFINITION
|
CLASS_DEFINITION
|
||||||
} preprocessor_macro_parameter;
|
} preprocessor_macro_parameter;
|
||||||
|
|
||||||
@ The following creates a new macro and adds it to the list |L|. By default, it
|
@ The following creates a new macro and adds it to the list [[L]]. By default, it
|
||||||
has an empty definition (i.e., no lines), but may have a meaning provided by its
|
has an empty definition (i.e., no lines), but may have a meaning provided by its
|
||||||
|expander| function regardless. The |parameter_specification| is as in the
|
[[expander]] function regardless. The [[parameter_specification]] is as in the
|
||||||
textual declaration: for example, |in: IN ?towards: WAY| would be valid, with
|
textual declaration: for example, [[in: IN ?towards: WAY]] would be valid, with
|
||||||
|in| being compulsory and |towards| optional when the macro is used.
|
[[in]] being compulsory and [[towards]] optional when the macro is used.
|
||||||
|
|
||||||
If we expected 10000 macros, a dictionary would be better than a list. But in
|
If we expected 10000 macros, a dictionary would be better than a list. But in
|
||||||
fact we expect more like 10.
|
fact we expect more like 10.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
|
preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
|
||||||
text_stream *parameter_specification,
|
text_stream *parameter_specification,
|
||||||
void (*expander)(preprocessor_macro *, preprocessor_state *, text_stream **, preprocessor_loop *, text_file_position *),
|
void (*expander)(preprocessor_macro *, preprocessor_state *, text_stream **, preprocessor_loop *, text_file_position *),
|
||||||
|
@ -609,13 +612,13 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
|
||||||
if (Preprocessor::find_macro(L, name))
|
if (Preprocessor::find_macro(L, name))
|
||||||
Errors::in_text_file("a macro with this name already exists", tfp);
|
Errors::in_text_file("a macro with this name already exists", tfp);
|
||||||
preprocessor_macro *new_macro = CREATE(preprocessor_macro);
|
preprocessor_macro *new_macro = CREATE(preprocessor_macro);
|
||||||
@<Initialise the macro@>;
|
<<Initialise the macro>>;
|
||||||
@<Parse the parameter list@>;
|
<<Parse the parameter list>>;
|
||||||
ADD_TO_LINKED_LIST(new_macro, preprocessor_macro, L);
|
ADD_TO_LINKED_LIST(new_macro, preprocessor_macro, L);
|
||||||
return new_macro;
|
return new_macro;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Initialise the macro@> =
|
<<Initialise the macro>>=
|
||||||
new_macro->identifier = Str::duplicate(name);
|
new_macro->identifier = Str::duplicate(name);
|
||||||
new_macro->no_parameters = 0;
|
new_macro->no_parameters = 0;
|
||||||
|
|
||||||
|
@ -629,7 +632,7 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
|
||||||
new_macro->suppress_newline_after_expanding = TRUE;
|
new_macro->suppress_newline_after_expanding = TRUE;
|
||||||
new_macro->suppress_whitespace_when_expanding = TRUE;
|
new_macro->suppress_whitespace_when_expanding = TRUE;
|
||||||
|
|
||||||
@<Parse the parameter list@> =
|
<<Parse the parameter list>>=
|
||||||
text_stream *spec = Str::duplicate(parameter_specification);
|
text_stream *spec = Str::duplicate(parameter_specification);
|
||||||
match_results mr = Regexp::create_mr();
|
match_results mr = Regexp::create_mr();
|
||||||
while (Regexp::match(&mr, spec, L" *(%C+): *(%C+) *(%c*)")) {
|
while (Regexp::match(&mr, spec, L" *(%C+): *(%C+) *(%c*)")) {
|
||||||
|
@ -640,14 +643,14 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
|
||||||
if (new_macro->no_parameters >= MAX_PP_MACRO_PARAMETERS) {
|
if (new_macro->no_parameters >= MAX_PP_MACRO_PARAMETERS) {
|
||||||
Errors::in_text_file("too many parameters in this definition", tfp);
|
Errors::in_text_file("too many parameters in this definition", tfp);
|
||||||
} else {
|
} else {
|
||||||
@<Add parameter to macro@>;
|
<<Add parameter to macro>>;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Regexp::dispose_of(&mr);
|
Regexp::dispose_of(&mr);
|
||||||
if (Str::is_whitespace(spec) == FALSE)
|
if (Str::is_whitespace(spec) == FALSE)
|
||||||
Errors::in_text_file("parameter list for this definition is malformed", tfp);
|
Errors::in_text_file("parameter list for this definition is malformed", tfp);
|
||||||
|
|
||||||
@<Add parameter to macro@> =
|
<<Add parameter to macro>>=
|
||||||
preprocessor_macro_parameter *new_parameter = CREATE(preprocessor_macro_parameter);
|
preprocessor_macro_parameter *new_parameter = CREATE(preprocessor_macro_parameter);
|
||||||
new_parameter->name = Str::duplicate(par_name);
|
new_parameter->name = Str::duplicate(par_name);
|
||||||
new_parameter->definition_token = Str::duplicate(token_name);
|
new_parameter->definition_token = Str::duplicate(token_name);
|
||||||
|
@ -661,7 +664,7 @@ preprocessor_macro *Preprocessor::new_macro(linked_list *L, text_stream *name,
|
||||||
@ We can then add lines to a macro (though this will only have an effect if its
|
@ We can then add lines to a macro (though this will only have an effect if its
|
||||||
expander function is //Preprocessor::default_expander//).
|
expander function is //Preprocessor::default_expander//).
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line,
|
void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line,
|
||||||
text_file_position *tfp) {
|
text_file_position *tfp) {
|
||||||
if (mm->no_lines >= MAX_PP_MACRO_LINES) {
|
if (mm->no_lines >= MAX_PP_MACRO_LINES) {
|
||||||
|
@ -671,15 +674,15 @@ void Preprocessor::add_line_to_macro(preprocessor_macro *mm, text_stream *line,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Reserved macros.
|
@ \section{Reserved macros.}
|
||||||
A few macros are "reserved", that is, have built-in meanings, and use expander
|
A few macros are "reserved", that is, have built-in meanings, and use expander
|
||||||
functions other than //Preprocessor::default_expander//.
|
functions other than //Preprocessor::default_expander//.
|
||||||
|
|
||||||
Some of these, the |special_macros|, are supplied by the code calling the
|
Some of these, the [[special_macros]], are supplied by the code calling the
|
||||||
preprocessor. Those will provide domain-specific functionality. But a few are
|
preprocessor. Those will provide domain-specific functionality. But a few are
|
||||||
built in here and therefore work in every domain:
|
built in here and therefore work in every domain:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
linked_list *Preprocessor::list_of_reserved_macros(linked_list *special_macros) {
|
linked_list *Preprocessor::list_of_reserved_macros(linked_list *special_macros) {
|
||||||
linked_list *L = NEW_LINKED_LIST(preprocessor_macro);
|
linked_list *L = NEW_LINKED_LIST(preprocessor_macro);
|
||||||
Preprocessor::new_loop_macro(L, I"repeat", I"with: WITH in: IN",
|
Preprocessor::new_loop_macro(L, I"repeat", I"with: WITH in: IN",
|
||||||
|
@ -736,7 +739,7 @@ void Preprocessor::new_loop_macro(linked_list *L, text_stream *name,
|
||||||
|
|
||||||
@ Finding a macro in a list:
|
@ Finding a macro in a list:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name) {
|
preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name) {
|
||||||
preprocessor_macro *mm;
|
preprocessor_macro *mm;
|
||||||
LOOP_OVER_LINKED_LIST(mm, preprocessor_macro, L)
|
LOOP_OVER_LINKED_LIST(mm, preprocessor_macro, L)
|
||||||
|
@ -745,13 +748,13 @@ preprocessor_macro *Preprocessor::find_macro(linked_list *L, text_stream *name)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h The expander for user-defined macros.
|
@ \section{The expander for user-defined macros.}
|
||||||
All macros created by |{define: ...}| are expanded by the following function.
|
All macros created by [[{define: ...}]] are expanded by the following function.
|
||||||
It creates a local "stack frame" making the parameters available as variables,
|
It creates a local "stack frame" making the parameters available as variables,
|
||||||
then runs the definition lines through the scanner, then dismantles the stack
|
then runs the definition lines through the scanner, then dismantles the stack
|
||||||
frame again.
|
frame again.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
||||||
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
||||||
PPS->stack_frame = Preprocessor::new_variable_set(PPS->stack_frame);
|
PPS->stack_frame = Preprocessor::new_variable_set(PPS->stack_frame);
|
||||||
|
@ -765,10 +768,10 @@ void Preprocessor::default_expander(preprocessor_macro *mm, preprocessor_state *
|
||||||
PPS->stack_frame = PPS->stack_frame->outer;
|
PPS->stack_frame = PPS->stack_frame->outer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h The set expander.
|
@ \section{The set expander.}
|
||||||
An easy one.
|
An easy one.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
||||||
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
||||||
text_stream *name = parameter_values[0];
|
text_stream *name = parameter_values[0];
|
||||||
|
@ -781,9 +784,9 @@ void Preprocessor::set_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
||||||
Preprocessor::write_variable(var, value);
|
Preprocessor::write_variable(var, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h The repeat expander.
|
@ \section{The repeat expander.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
||||||
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
||||||
text_stream *with = parameter_values[0];
|
text_stream *with = parameter_values[0];
|
||||||
|
@ -803,7 +806,7 @@ void Preprocessor::repeat_expander(preprocessor_macro *mm, preprocessor_state *P
|
||||||
Preprocessor::add_loop_iteration(loop, value);
|
Preprocessor::add_loop_iteration(loop, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h The expander used for all loop ends.
|
@ \section{The expander used for all loop ends.}
|
||||||
The macros which open a loop just store up the name of the variable and the
|
The macros which open a loop just store up the name of the variable and the
|
||||||
range of its values: otherwise, they do nothing. It's only when the end of a
|
range of its values: otherwise, they do nothing. It's only when the end of a
|
||||||
loop is reached that any expansion happens, and this is where.
|
loop is reached that any expansion happens, and this is where.
|
||||||
|
@ -812,7 +815,7 @@ We create a new stack frame inside the current one, and put the loop variable
|
||||||
into it. Then we run through the iteration values, setting the variable to
|
into it. Then we run through the iteration values, setting the variable to
|
||||||
each in turn, and expand the material.
|
each in turn, and expand the material.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state *PPS,
|
||||||
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
text_stream **parameter_values, preprocessor_loop *loop, text_file_position *tfp) {
|
||||||
PPS->shadow_sp = 0;
|
PPS->shadow_sp = 0;
|
||||||
|
@ -826,12 +829,12 @@ void Preprocessor::end_loop_expander(preprocessor_macro *mm, preprocessor_state
|
||||||
Preprocessor::ensure_variable(loop->loop_var_name, PPS->stack_frame);
|
Preprocessor::ensure_variable(loop->loop_var_name, PPS->stack_frame);
|
||||||
text_stream *value;
|
text_stream *value;
|
||||||
LOOP_OVER_LINKED_LIST(value, text_stream, loop->iterations)
|
LOOP_OVER_LINKED_LIST(value, text_stream, loop->iterations)
|
||||||
@<Iterate with this value@>;
|
<<Iterate with this value>>;
|
||||||
PPS->stack_frame = PPS->stack_frame->outer;
|
PPS->stack_frame = PPS->stack_frame->outer;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Iterate with this value@> =
|
<<Iterate with this value>>=
|
||||||
Preprocessor::write_variable(loop_var, value);
|
Preprocessor::write_variable(loop_var, value);
|
||||||
if (mm->span) {
|
if (mm->span) {
|
||||||
Preprocessor::expand(matter, tfp, PPS);
|
Preprocessor::expand(matter, tfp, PPS);
|
|
@ -2,10 +2,10 @@
|
||||||
|
|
||||||
Convenient routines for manipulating strings of text.
|
Convenient routines for manipulating strings of text.
|
||||||
|
|
||||||
@h Strings are streams.
|
@ \section{Strings are streams.}
|
||||||
Although Foundation provides limited facilities for handling standard or
|
Although Foundation provides limited facilities for handling standard or
|
||||||
wide C-style strings -- that is, null-terminated arrays of |char| or
|
wide C-style strings -- that is, null-terminated arrays of [[char]] or
|
||||||
|wchar_t| -- these are not encouraged.
|
[[wchar_t]] -- these are not encouraged.
|
||||||
|
|
||||||
Instead, a standard string for a program using Foundation is nothing more than
|
Instead, a standard string for a program using Foundation is nothing more than
|
||||||
a text stream (see Chapter 2). These are unbounded in size, with memory
|
a text stream (see Chapter 2). These are unbounded in size, with memory
|
||||||
|
@ -22,19 +22,19 @@ against buffer overruns.
|
||||||
The present section of code provides convenient routines for creating,
|
The present section of code provides convenient routines for creating,
|
||||||
duplicating, modifying and examining such strings.
|
duplicating, modifying and examining such strings.
|
||||||
|
|
||||||
@h New strings.
|
@ \section{New strings.}
|
||||||
Sometimes we want to make a new string in the sense of allocating more
|
Sometimes we want to make a new string in the sense of allocating more
|
||||||
memory to hold it. These objects won't automatically be destroyed, so we
|
memory to hold it. These objects won't automatically be destroyed, so we
|
||||||
shouldn't call these routines too casually. If we need a string just for
|
shouldn't call these routines too casually. If we need a string just for
|
||||||
some space to play with for a short while, it's better to create one
|
some space to play with for a short while, it's better to create one
|
||||||
with |TEMPORARY_TEXT| and then get rid of it with |DISCARD_TEXT|, macros
|
with [[TEMPORARY_TEXT]] and then get rid of it with [[DISCARD_TEXT]], macros
|
||||||
defined in Chapter 2.
|
defined in Chapter 2.
|
||||||
|
|
||||||
The capacity of these strings is unlimited in principle, and the number
|
The capacity of these strings is unlimited in principle, and the number
|
||||||
here is just the size of the initial memory block, which is fastest to
|
here is just the size of the initial memory block, which is fastest to
|
||||||
access.
|
access.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
text_stream *Str::new(void) {
|
text_stream *Str::new(void) {
|
||||||
return Str::new_with_capacity(32);
|
return Str::new_with_capacity(32);
|
||||||
}
|
}
|
||||||
|
@ -50,10 +50,10 @@ void Str::dispose_of(text_stream *text) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ Duplication of an existing string is complicated only by the issue that
|
@ Duplication of an existing string is complicated only by the issue that
|
||||||
we want the duplicate always to be writeable, so that |NULL| can't be
|
we want the duplicate always to be writeable, so that [[NULL]] can't be
|
||||||
duplicated as |NULL|.
|
duplicated as [[NULL]].
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
text_stream *Str::duplicate(text_stream *E) {
|
text_stream *Str::duplicate(text_stream *E) {
|
||||||
if (E == NULL) return Str::new();
|
if (E == NULL) return Str::new();
|
||||||
text_stream *S = CREATE(text_stream);
|
text_stream *S = CREATE(text_stream);
|
||||||
|
@ -64,12 +64,12 @@ text_stream *Str::duplicate(text_stream *E) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Converting from C strings.
|
@ \section{Converting from C strings.}
|
||||||
Here we open text streams initially equal to the given C strings, and
|
Here we open text streams initially equal to the given C strings, and
|
||||||
with the capacity of the initial block large enough to hold the whole
|
with the capacity of the initial block large enough to hold the whole
|
||||||
thing plus a little extra, for efficiency's sake.
|
thing plus a little extra, for efficiency's sake.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
text_stream *Str::new_from_wide_string(const wchar_t *C_string) {
|
text_stream *Str::new_from_wide_string(const wchar_t *C_string) {
|
||||||
text_stream *S = CREATE(text_stream);
|
text_stream *S = CREATE(text_stream);
|
||||||
if (Streams::open_from_wide_string(S, C_string)) return S;
|
if (Streams::open_from_wide_string(S, C_string)) return S;
|
||||||
|
@ -96,7 +96,7 @@ text_stream *Str::new_from_locale_string(const char *C_string) {
|
||||||
|
|
||||||
@ And sometimes we want to use an existing stream object:
|
@ And sometimes we want to use an existing stream object:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
text_stream *Str::from_wide_string(text_stream *S, wchar_t *c_string) {
|
text_stream *Str::from_wide_string(text_stream *S, wchar_t *c_string) {
|
||||||
if (Streams::open_from_wide_string(S, c_string) == FALSE) return NULL;
|
if (Streams::open_from_wide_string(S, c_string) == FALSE) return NULL;
|
||||||
return S;
|
return S;
|
||||||
|
@ -107,9 +107,9 @@ text_stream *Str::from_locale_string(text_stream *S, char *c_string) {
|
||||||
return S;
|
return S;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Converting to C strings.
|
@ \section{Converting to C strings.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Str::copy_to_ISO_string(char *C_string, text_stream *S, int buffer_size) {
|
void Str::copy_to_ISO_string(char *C_string, text_stream *S, int buffer_size) {
|
||||||
Streams::write_as_ISO_string(C_string, S, buffer_size);
|
Streams::write_as_ISO_string(C_string, S, buffer_size);
|
||||||
}
|
}
|
||||||
|
@ -126,9 +126,9 @@ void Str::copy_to_locale_string(char *C_string, text_stream *S, int buffer_size)
|
||||||
Streams::write_as_locale_string(C_string, S, buffer_size);
|
Streams::write_as_locale_string(C_string, S, buffer_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Converting to integers.
|
@ \section{Converting to integers.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Str::atoi(text_stream *S, int index) {
|
int Str::atoi(text_stream *S, int index) {
|
||||||
char buffer[32];
|
char buffer[32];
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
@ -139,15 +139,15 @@ int Str::atoi(text_stream *S, int index) {
|
||||||
return atoi(buffer);
|
return atoi(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Length.
|
@ \section{Length.}
|
||||||
A puritan would return a |size_t| here, but I am not a puritan.
|
A puritan would return a [[size_t]] here, but I am not a puritan.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Str::len(text_stream *S) {
|
int Str::len(text_stream *S) {
|
||||||
return Streams::get_position(S);
|
return Streams::get_position(S);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Position markers.
|
@ \section{Position markers.}
|
||||||
A position marker is a lightweight way to refer to a particular position
|
A position marker is a lightweight way to refer to a particular position
|
||||||
in a given string. Position 0 is before the first character; if, for
|
in a given string. Position 0 is before the first character; if, for
|
||||||
example, the string contains the word "gazpacho", then position 8 represents
|
example, the string contains the word "gazpacho", then position 8 represents
|
||||||
|
@ -155,7 +155,7 @@ the end of the string, after the "o". Negative positions are not allowed,
|
||||||
but positive ones well past the end of the string are legal. (Doing things
|
but positive ones well past the end of the string are legal. (Doing things
|
||||||
at those positions may well not be, of course.)
|
at those positions may well not be, of course.)
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct string_position {
|
typedef struct string_position {
|
||||||
struct text_stream *S;
|
struct text_stream *S;
|
||||||
int index;
|
int index;
|
||||||
|
@ -163,7 +163,7 @@ typedef struct string_position {
|
||||||
|
|
||||||
@ You can then find a position in a given string thus:
|
@ You can then find a position in a given string thus:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
string_position Str::start(text_stream *S) {
|
string_position Str::start(text_stream *S) {
|
||||||
string_position P; P.S = S; P.index = 0; return P;
|
string_position P; P.S = S; P.index = 0; return P;
|
||||||
}
|
}
|
||||||
|
@ -180,7 +180,7 @@ string_position Str::end(text_stream *S) {
|
||||||
|
|
||||||
@ And you can step forwards or backwards:
|
@ And you can step forwards or backwards:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
string_position Str::back(string_position P) {
|
string_position Str::back(string_position P) {
|
||||||
if (P.index > 0) P.index--; return P;
|
if (P.index > 0) P.index--; return P;
|
||||||
}
|
}
|
||||||
|
@ -209,18 +209,19 @@ int Str::index(string_position P) {
|
||||||
|
|
||||||
@ This leads to the following convenient loop macros:
|
@ This leads to the following convenient loop macros:
|
||||||
|
|
||||||
@d LOOP_THROUGH_TEXT(P, ST)
|
<<*>>=
|
||||||
|
#define LOOP_THROUGH_TEXT(P, ST)
|
||||||
for (string_position P = Str::start(ST); P.index < Str::len(P.S); P.index++)
|
for (string_position P = Str::start(ST); P.index < Str::len(P.S); P.index++)
|
||||||
|
|
||||||
@d LOOP_BACKWARDS_THROUGH_TEXT(P, ST)
|
#define LOOP_BACKWARDS_THROUGH_TEXT(P, ST)
|
||||||
for (string_position P = Str::back(Str::end(ST)); P.index >= 0; P.index--)
|
for (string_position P = Str::back(Str::end(ST)); P.index >= 0; P.index--)
|
||||||
|
|
||||||
@h Character operations.
|
@ \section{Character operations.}
|
||||||
How to get at individual characters, then, now that we can refer to positions:
|
How to get at individual characters, then, now that we can refer to positions:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
wchar_t Str::get(string_position P) {
|
wchar_t Str::get(string_position P) {
|
||||||
if ((P.S == NULL) || (P.index < 0)) return 0;
|
if ((P.S == NULL) [[]] (P.index < 0)) return 0;
|
||||||
return Streams::get_char_at_index(P.S, P.index);
|
return Streams::get_char_at_index(P.S, P.index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -239,7 +240,7 @@ wchar_t Str::get_last_char(text_stream *S) {
|
||||||
return Str::get(Str::at(S, L-1));
|
return Str::get(Str::at(S, L-1));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ =
|
<<*>>=
|
||||||
void Str::put(string_position P, wchar_t C) {
|
void Str::put(string_position P, wchar_t C) {
|
||||||
if (P.index < 0) internal_error("wrote before start of string");
|
if (P.index < 0) internal_error("wrote before start of string");
|
||||||
if (P.S == NULL) internal_error("wrote to null stream");
|
if (P.S == NULL) internal_error("wrote to null stream");
|
||||||
|
@ -256,9 +257,9 @@ void Str::put_at(text_stream *S, int index, wchar_t C) {
|
||||||
Str::put(Str::at(S, index), C);
|
Str::put(Str::at(S, index), C);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Truncation.
|
@ \section{Truncation.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Str::clear(text_stream *S) {
|
void Str::clear(text_stream *S) {
|
||||||
Str::truncate(S, 0);
|
Str::truncate(S, 0);
|
||||||
}
|
}
|
||||||
|
@ -268,9 +269,9 @@ void Str::truncate(text_stream *S, int len) {
|
||||||
if (len < Str::len(S)) Str::put(Str::at(S, len), 0);
|
if (len < Str::len(S)) Str::put(Str::at(S, len), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Indentation.
|
@ \section{Indentation.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Str::remove_indentation(text_stream *S, int spaces_per_tab) {
|
int Str::remove_indentation(text_stream *S, int spaces_per_tab) {
|
||||||
int spaces_in = 0, tab_stops_of_indentation = 0;
|
int spaces_in = 0, tab_stops_of_indentation = 0;
|
||||||
while (Characters::is_space_or_tab(Str::get_first_char(S))) {
|
while (Characters::is_space_or_tab(Str::get_first_char(S))) {
|
||||||
|
@ -307,9 +308,9 @@ void Str::rectify_indentation(text_stream *S, int spaces_per_tab) {
|
||||||
DISCARD_TEXT(tail)
|
DISCARD_TEXT(tail)
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Copying.
|
@ \section{Copying.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Str::concatenate(text_stream *S1, text_stream *S2) {
|
void Str::concatenate(text_stream *S1, text_stream *S2) {
|
||||||
Streams::copy(S1, S2);
|
Streams::copy(S1, S2);
|
||||||
}
|
}
|
||||||
|
@ -330,7 +331,7 @@ void Str::copy_tail(text_stream *S1, text_stream *S2, int from) {
|
||||||
|
|
||||||
@ A subtly different operation is to set a string equal to a given C string:
|
@ A subtly different operation is to set a string equal to a given C string:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Str::copy_ISO_string(text_stream *S, char *C_string) {
|
void Str::copy_ISO_string(text_stream *S, char *C_string) {
|
||||||
Str::clear(S);
|
Str::clear(S);
|
||||||
Streams::write_ISO_string(S, C_string);
|
Streams::write_ISO_string(S, C_string);
|
||||||
|
@ -346,10 +347,10 @@ void Str::copy_wide_string(text_stream *S, wchar_t *C_string) {
|
||||||
Streams::write_wide_string(S, C_string);
|
Streams::write_wide_string(S, C_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Comparisons.
|
@ \section{Comparisons.}
|
||||||
We provide both case sensitive and insensitive versions.
|
We provide both case sensitive and insensitive versions.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Str::eq(text_stream *S1, text_stream *S2) {
|
int Str::eq(text_stream *S1, text_stream *S2) {
|
||||||
if (Str::cmp(S1, S2) == 0) return TRUE;
|
if (Str::cmp(S1, S2) == 0) return TRUE;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
@ -371,10 +372,10 @@ int Str::ne_insensitive(text_stream *S1, text_stream *S2) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ These two routines produce a numerical string difference suitable for
|
@ These two routines produce a numerical string difference suitable for
|
||||||
alphabetic sorting, like |strlen| in the C standard library.
|
alphabetic sorting, like [[strlen]] in the C standard library.
|
||||||
|
|
||||||
This would be a more elegant implementation:
|
This would be a more elegant implementation:
|
||||||
= (text as InC)
|
|
||||||
for (string_position P = Str::start(S1), Q = Str::start(S2);
|
for (string_position P = Str::start(S1), Q = Str::start(S2);
|
||||||
(P.index < Str::len(S1)) && (Q.index < Str::len(S2));
|
(P.index < Str::len(S1)) && (Q.index < Str::len(S2));
|
||||||
P = Str::forward(P), Q = Str::forward(Q)) {
|
P = Str::forward(P), Q = Str::forward(Q)) {
|
||||||
|
@ -382,11 +383,11 @@ This would be a more elegant implementation:
|
||||||
if (d != 0) return d;
|
if (d != 0) return d;
|
||||||
}
|
}
|
||||||
return Str::len(S1) - Str::len(S2);
|
return Str::len(S1) - Str::len(S2);
|
||||||
=
|
|
||||||
But profiling shows that the following speeds up the Inform 7 compiler by
|
But profiling shows that the following speeds up the Inform 7 compiler by
|
||||||
around 1%.
|
around 1%.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Str::cmp(text_stream *S1, text_stream *S2) {
|
int Str::cmp(text_stream *S1, text_stream *S2) {
|
||||||
int L1 = Str::len(S1), L2 = Str::len(S2), M = L1;
|
int L1 = Str::len(S1), L2 = Str::len(S2), M = L1;
|
||||||
if (L2 < M) M = L2;
|
if (L2 < M) M = L2;
|
||||||
|
@ -408,16 +409,16 @@ int Str::cmp_insensitive(text_stream *S1, text_stream *S2) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ It's sometimes useful to see whether two strings agree on their last
|
@ It's sometimes useful to see whether two strings agree on their last
|
||||||
|N| characters, or their first |N|. For example,
|
[[N]] characters, or their first [[N]]. For example,
|
||||||
= (text as code)
|
|
||||||
Str::suffix_eq(I"wayzgoose", I"snow goose", N)
|
Str::suffix_eq(I"wayzgoose", I"snow goose", N)
|
||||||
=
|
|
||||||
will return |TRUE| for |N| equal to 0 to 5, and |FALSE| thereafter.
|
will return [[TRUE]] for [[N]] equal to 0 to 5, and [[FALSE]] thereafter.
|
||||||
|
|
||||||
(The Oxford English Dictionary defines a "wayzgoose" as a holiday outing
|
(The Oxford English Dictionary defines a "wayzgoose" as a holiday outing
|
||||||
for the staff of a publishing house.)
|
for the staff of a publishing house.)
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Str::prefix_eq(text_stream *S1, text_stream *S2, int N) {
|
int Str::prefix_eq(text_stream *S1, text_stream *S2, int N) {
|
||||||
int L1 = Str::len(S1), L2 = Str::len(S2);
|
int L1 = Str::len(S1), L2 = Str::len(S2);
|
||||||
if ((N > L1) || (N > L2)) return FALSE;
|
if ((N > L1) || (N > L2)) return FALSE;
|
||||||
|
@ -454,7 +455,7 @@ int Str::ends_with_wide_string(text_stream *S, wchar_t *suffix) {
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ =
|
<<*>>=
|
||||||
int Str::eq_wide_string(text_stream *S1, wchar_t *S2) {
|
int Str::eq_wide_string(text_stream *S1, wchar_t *S2) {
|
||||||
if (S2 == NULL) return (Str::len(S1) == 0)?TRUE:FALSE;
|
if (S2 == NULL) return (Str::len(S1) == 0)?TRUE:FALSE;
|
||||||
if (Str::len(S1) == (int) wcslen(S2)) {
|
if (Str::len(S1) == (int) wcslen(S2)) {
|
||||||
|
@ -481,9 +482,9 @@ int Str::ne_wide_string(text_stream *S1, wchar_t *S2) {
|
||||||
return (Str::eq_wide_string(S1, S2)?FALSE:TRUE);
|
return (Str::eq_wide_string(S1, S2)?FALSE:TRUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@h White space.
|
@ \section{White space.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Str::is_whitespace(text_stream *S) {
|
int Str::is_whitespace(text_stream *S) {
|
||||||
LOOP_THROUGH_TEXT(pos, S)
|
LOOP_THROUGH_TEXT(pos, S)
|
||||||
if (Characters::is_space_or_tab(Str::get(pos)) == FALSE)
|
if (Characters::is_space_or_tab(Str::get(pos)) == FALSE)
|
||||||
|
@ -493,7 +494,7 @@ int Str::is_whitespace(text_stream *S) {
|
||||||
|
|
||||||
@ This removes spaces and tabs from both ends:
|
@ This removes spaces and tabs from both ends:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Str::trim_white_space(text_stream *S) {
|
void Str::trim_white_space(text_stream *S) {
|
||||||
int len = Str::len(S), i = 0, j = 0;
|
int len = Str::len(S), i = 0, j = 0;
|
||||||
string_position F = Str::start(S);
|
string_position F = Str::start(S);
|
||||||
|
@ -545,9 +546,9 @@ int Str::trim_all_white_space_at_end(text_stream *S) {
|
||||||
return shortened;
|
return shortened;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Deleting characters.
|
@ \section{Deleting characters.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Str::delete_first_character(text_stream *S) {
|
void Str::delete_first_character(text_stream *S) {
|
||||||
Str::delete_nth_character(S, 0);
|
Str::delete_nth_character(S, 0);
|
||||||
}
|
}
|
||||||
|
@ -572,9 +573,9 @@ void Str::delete_n_characters(text_stream *S, int n) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Substrings.
|
@ \section{Substrings.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Str::substr(OUTPUT_STREAM, string_position from, string_position to) {
|
void Str::substr(OUTPUT_STREAM, string_position from, string_position to) {
|
||||||
if (from.S != to.S) internal_error("substr on two different strings");
|
if (from.S != to.S) internal_error("substr on two different strings");
|
||||||
for (int i = from.index; i < to.index; i++)
|
for (int i = from.index; i < to.index; i++)
|
||||||
|
@ -632,29 +633,29 @@ int Str::includes_at(text_stream *line, int i, text_stream *pattern) {
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Shim for literal storage.
|
@ \section{Shim for literal storage.}
|
||||||
This is where all of those I-literals created by Inweb are stored at run-time.
|
This is where all of those I-literals created by Inweb are stored at run-time.
|
||||||
Note that every instance of, say, |I"fish"| would return the same string,
|
Note that every instance of, say, [[I"fish"]] would return the same string,
|
||||||
that is, the same |text_stream *| value. To prevent nasty accidents, this
|
that is, the same [[text_stream *]] value. To prevent nasty accidents, this
|
||||||
is marked so that the stream value, "fish", cannot be modified at run-time.
|
is marked so that the stream value, "fish", cannot be modified at run-time.
|
||||||
|
|
||||||
The dictionary look-up here would not be thread-safe, so it's protected by
|
The dictionary look-up here would not be thread-safe, so it's protected by
|
||||||
a mutex. There's no real performance concern because the following routine
|
a mutex. There's no real performance concern because the following routine
|
||||||
is run just once per I-literal in the source code, when the program starts up.
|
is run just once per I-literal in the source code, when the program starts up.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
dictionary *string_literals_dictionary = NULL;
|
dictionary *string_literals_dictionary = NULL;
|
||||||
|
|
||||||
text_stream *Str::literal(wchar_t *wide_C_string) {
|
text_stream *Str::literal(wchar_t *wide_C_string) {
|
||||||
text_stream *answer = NULL;
|
text_stream *answer = NULL;
|
||||||
CREATE_MUTEX(mutex);
|
CREATE_MUTEX(mutex);
|
||||||
LOCK_MUTEX(mutex);
|
LOCK_MUTEX(mutex);
|
||||||
@<Look in dictionary of string literals@>;
|
<<Look in dictionary of string literals>>;
|
||||||
UNLOCK_MUTEX(mutex);
|
UNLOCK_MUTEX(mutex);
|
||||||
return answer;
|
return answer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Look in dictionary of string literals@> =
|
<<Look in dictionary of string literals>>=
|
||||||
if (string_literals_dictionary == NULL)
|
if (string_literals_dictionary == NULL)
|
||||||
string_literals_dictionary = Dictionaries::new(100, TRUE);
|
string_literals_dictionary = Dictionaries::new(100, TRUE);
|
||||||
answer = Dictionaries::get_text_literal(string_literals_dictionary, wide_C_string);
|
answer = Dictionaries::get_text_literal(string_literals_dictionary, wide_C_string);
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
To read text files of whatever flavour, one line at a time.
|
To read text files of whatever flavour, one line at a time.
|
||||||
|
|
||||||
@h Text files.
|
@ \section{Text files.}
|
||||||
Foundation was written mainly to support command-line tools which, of their
|
Foundation was written mainly to support command-line tools which, of their
|
||||||
nature, deal with a lot of text files: source code of programs, configuration
|
nature, deal with a lot of text files: source code of programs, configuration
|
||||||
files, HTML, XML and so on. The main aim of this section is to provide a
|
files, HTML, XML and so on. The main aim of this section is to provide a
|
||||||
|
@ -10,13 +10,13 @@ standard way to read in and iterate through lines of a text file.
|
||||||
|
|
||||||
First, though, here is a perhaps clumsy but effective way to test if a
|
First, though, here is a perhaps clumsy but effective way to test if a
|
||||||
file actually exists on disc at a given filename. Note that under the C standard,
|
file actually exists on disc at a given filename. Note that under the C standard,
|
||||||
it's entirely legal for |fopen| to behave more or less as it likes if asked to
|
it's entirely legal for [[fopen]] to behave more or less as it likes if asked to
|
||||||
open a directory as a file; and on MacOS, it sometimes opens a directory exactly
|
open a directory as a file; and on MacOS, it sometimes opens a directory exactly
|
||||||
as if it were an empty text file. The safest way to ensure that a directory is
|
as if it were an empty text file. The safest way to ensure that a directory is
|
||||||
never confused with a file seems to be to try |opendir| on it, and the following
|
never confused with a file seems to be to try [[opendir]] on it, and the following
|
||||||
does essentially that.
|
does essentially that.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int TextFiles::exists(filename *F) {
|
int TextFiles::exists(filename *F) {
|
||||||
TEMPORARY_TEXT(pn)
|
TEMPORARY_TEXT(pn)
|
||||||
WRITE_TO(pn, "%f", F);
|
WRITE_TO(pn, "%f", F);
|
||||||
|
@ -32,10 +32,10 @@ int TextFiles::exists(filename *F) {
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Text file positions.
|
@ \section{Text file positions.}
|
||||||
Here's how we record a position in a text file:
|
Here's how we record a position in a text file:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct text_file_position {
|
typedef struct text_file_position {
|
||||||
struct filename *text_file_filename;
|
struct filename *text_file_filename;
|
||||||
FILE *handle_when_open;
|
FILE *handle_when_open;
|
||||||
|
@ -48,7 +48,7 @@ typedef struct text_file_position {
|
||||||
|
|
||||||
@ For access:
|
@ For access:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int TextFiles::get_line_count(text_file_position *tfp) {
|
int TextFiles::get_line_count(text_file_position *tfp) {
|
||||||
if (tfp == NULL) return 0;
|
if (tfp == NULL) return 0;
|
||||||
return tfp->line_count;
|
return tfp->line_count;
|
||||||
|
@ -56,7 +56,7 @@ int TextFiles::get_line_count(text_file_position *tfp) {
|
||||||
|
|
||||||
@ And this is for a real nowhere man:
|
@ And this is for a real nowhere man:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
text_file_position TextFiles::nowhere(void) {
|
text_file_position TextFiles::nowhere(void) {
|
||||||
text_file_position tfp;
|
text_file_position tfp;
|
||||||
tfp.text_file_filename = NULL;
|
tfp.text_file_filename = NULL;
|
||||||
|
@ -74,26 +74,26 @@ text_file_position TextFiles::at(filename *F, int line) {
|
||||||
return tfp;
|
return tfp;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Text file scanner.
|
@ \section{Text file scanner.}
|
||||||
We read lines in, delimited by any of the standard line-ending characters,
|
We read lines in, delimited by any of the standard line-ending characters,
|
||||||
and send them one at a time to a function called |iterator|. Throughout,
|
and send them one at a time to a function called [[iterator]]. Throughout,
|
||||||
we preserve a pointer called |state| to some object being used by the
|
we preserve a pointer called [[state]] to some object being used by the
|
||||||
client.
|
client.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int TextFiles::read(filename *F, int escape_oddities, char *message, int serious,
|
int TextFiles::read(filename *F, int escape_oddities, char *message, int serious,
|
||||||
void (iterator)(text_stream *, text_file_position *, void *),
|
void (iterator)(text_stream *, text_file_position *, void *),
|
||||||
text_file_position *start_at, void *state) {
|
text_file_position *start_at, void *state) {
|
||||||
text_file_position tfp;
|
text_file_position tfp;
|
||||||
tfp.ufb = TextFiles::create_ufb();
|
tfp.ufb = TextFiles::create_ufb();
|
||||||
@<Open the text file@>;
|
<<Open the text file>>;
|
||||||
@<Set the initial position, seeking it in the file if need be@>;
|
<<Set the initial position, seeking it in the file if need be>>;
|
||||||
@<Read in lines and send them one by one to the iterator@>;
|
<<Read in lines and send them one by one to the iterator>>;
|
||||||
fclose(tfp.handle_when_open);
|
fclose(tfp.handle_when_open);
|
||||||
return tfp.line_count;
|
return tfp.line_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Open the text file@> =
|
<<Open the text file>>=
|
||||||
tfp.handle_when_open = Filenames::fopen(F, "rb");
|
tfp.handle_when_open = Filenames::fopen(F, "rb");
|
||||||
if (tfp.handle_when_open == NULL) {
|
if (tfp.handle_when_open == NULL) {
|
||||||
if (message == NULL) return 0;
|
if (message == NULL) return 0;
|
||||||
|
@ -101,12 +101,12 @@ int TextFiles::read(filename *F, int escape_oddities, char *message, int serious
|
||||||
else { Errors::with_file(message, F); return 0; }
|
else { Errors::with_file(message, F); return 0; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ The ANSI definition of |ftell| and |fseek| says that, with text files, the
|
@ The ANSI definition of [[ftell]] and [[fseek]] says that, with text files, the
|
||||||
only definite position value is 0 -- meaning the beginning of the file -- and
|
only definite position value is 0 -- meaning the beginning of the file -- and
|
||||||
this is what we initialise |line_position| to. We must otherwise only write
|
this is what we initialise [[line_position]] to. We must otherwise only write
|
||||||
values returned by |ftell| into this field.
|
values returned by [[ftell]] into this field.
|
||||||
|
|
||||||
@<Set the initial position, seeking it in the file if need be@> =
|
<<Set the initial position, seeking it in the file if need be>>=
|
||||||
if (start_at == NULL) {
|
if (start_at == NULL) {
|
||||||
tfp.line_count = 1;
|
tfp.line_count = 1;
|
||||||
tfp.line_position = 0;
|
tfp.line_position = 0;
|
||||||
|
@ -122,48 +122,48 @@ values returned by |ftell| into this field.
|
||||||
tfp.actively_scanning = TRUE;
|
tfp.actively_scanning = TRUE;
|
||||||
tfp.text_file_filename = F;
|
tfp.text_file_filename = F;
|
||||||
|
|
||||||
@ We aim to get this right whether the lines are terminated by |0A|, |0D|,
|
@ We aim to get this right whether the lines are terminated by [[0A]], [[0D]],
|
||||||
|0A 0D| or |0D 0A|. The final line is not required to be terminated.
|
[[0A 0D]] or [[0D 0A]]. The final line is not required to be terminated.
|
||||||
|
|
||||||
@<Read in lines and send them one by one to the iterator@> =
|
<<Read in lines and send them one by one to the iterator>>=
|
||||||
TEMPORARY_TEXT(line)
|
TEMPORARY_TEXT(line)
|
||||||
int i = 0, c = ' ';
|
int i = 0, c = ' ';
|
||||||
while ((c != EOF) && (tfp.actively_scanning)) {
|
while ((c != EOF) && (tfp.actively_scanning)) {
|
||||||
c = TextFiles::utf8_fgetc(tfp.handle_when_open, NULL, escape_oddities, &tfp.ufb);
|
c = TextFiles::utf8_fgetc(tfp.handle_when_open, NULL, escape_oddities, &tfp.ufb);
|
||||||
if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) {
|
if ((c == EOF) [[| (c == '\x0a') |]] (c == '\x0d')) {
|
||||||
Str::put_at(line, i, 0);
|
Str::put_at(line, i, 0);
|
||||||
if ((i > 0) || (c != tfp.skip_terminator)) {
|
if ((i > 0) [[]] (c != tfp.skip_terminator)) {
|
||||||
@<Feed the completed line to the iterator routine@>;
|
<<Feed the completed line to the iterator routine>>;
|
||||||
if (c == '\x0a') tfp.skip_terminator = '\x0d';
|
if (c == '\x0a') tfp.skip_terminator = '\x0d';
|
||||||
if (c == '\x0d') tfp.skip_terminator = '\x0a';
|
if (c == '\x0d') tfp.skip_terminator = '\x0a';
|
||||||
} else tfp.skip_terminator = 'X';
|
} else tfp.skip_terminator = 'X';
|
||||||
@<Update the text file position@>;
|
<<Update the text file position>>;
|
||||||
i = 0;
|
i = 0;
|
||||||
} else {
|
} else {
|
||||||
Str::put_at(line, i++, (wchar_t) c);
|
Str::put_at(line, i++, (wchar_t) c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((i > 0) && (tfp.actively_scanning))
|
if ((i > 0) && (tfp.actively_scanning))
|
||||||
@<Feed the completed line to the iterator routine@>;
|
<<Feed the completed line to the iterator routine>>;
|
||||||
DISCARD_TEXT(line)
|
DISCARD_TEXT(line)
|
||||||
|
|
||||||
@ We update the line counter only when a line is actually sent:
|
@ We update the line counter only when a line is actually sent:
|
||||||
|
|
||||||
@<Feed the completed line to the iterator routine@> =
|
<<Feed the completed line to the iterator routine>>=
|
||||||
iterator(line, &tfp, state);
|
iterator(line, &tfp, state);
|
||||||
tfp.line_count++;
|
tfp.line_count++;
|
||||||
|
|
||||||
@ But we update the text file position after every apparent line terminator.
|
@ But we update the text file position after every apparent line terminator.
|
||||||
This is because we might otherwise, on a Windows text file, end up with an
|
This is because we might otherwise, on a Windows text file, end up with an
|
||||||
|ftell| position in between the |CR| and the |LF|; if we resume at that point,
|
[[ftell]] position in between the [[CR]] and the [[LF]]; if we resume at that point,
|
||||||
later on, we'll then have an off-by-one error in the line numbering in the
|
later on, we'll then have an off-by-one error in the line numbering in the
|
||||||
resumption as compared to during the original pass.
|
resumption as compared to during the original pass.
|
||||||
|
|
||||||
Properly speaking, |ftell| returns a long |int|, not an |int|, but on a
|
Properly speaking, [[ftell]] returns a long [[int]], not an [[int]], but on a
|
||||||
32-bit-or-more integer machine, this gives us room for files to run to 2GB.
|
32-bit-or-more integer machine, this gives us room for files to run to 2GB.
|
||||||
Text files seldom come that large.
|
Text files seldom come that large.
|
||||||
|
|
||||||
@<Update the text file position@> =
|
<<Update the text file position>>=
|
||||||
tfp.line_position = (int) (ftell(tfp.handle_when_open));
|
tfp.line_position = (int) (ftell(tfp.handle_when_open));
|
||||||
if (tfp.line_position == -1) {
|
if (tfp.line_position == -1) {
|
||||||
if (serious)
|
if (serious)
|
||||||
|
@ -172,15 +172,15 @@ Text files seldom come that large.
|
||||||
Errors::with_file("unable to determine position in file", F);
|
Errors::with_file("unable to determine position in file", F);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ =
|
<<*>>=
|
||||||
void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position *tfp) {
|
void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position *tfp) {
|
||||||
Str::clear(OUT);
|
Str::clear(OUT);
|
||||||
int i = 0, c = ' ';
|
int i = 0, c = ' ';
|
||||||
while ((c != EOF) && (tfp->actively_scanning)) {
|
while ((c != EOF) && (tfp->actively_scanning)) {
|
||||||
c = TextFiles::utf8_fgetc(tfp->handle_when_open, NULL, escape_oddities, &tfp->ufb);
|
c = TextFiles::utf8_fgetc(tfp->handle_when_open, NULL, escape_oddities, &tfp->ufb);
|
||||||
if ((c == EOF) || (c == '\x0a') || (c == '\x0d')) {
|
if ((c == EOF) [[| (c == '\x0a') |]] (c == '\x0d')) {
|
||||||
Str::put_at(OUT, i, 0);
|
Str::put_at(OUT, i, 0);
|
||||||
if ((i > 0) || (c != tfp->skip_terminator)) {
|
if ((i > 0) [[]] (c != tfp->skip_terminator)) {
|
||||||
if (c == '\x0a') tfp->skip_terminator = '\x0d';
|
if (c == '\x0a') tfp->skip_terminator = '\x0d';
|
||||||
if (c == '\x0d') tfp->skip_terminator = '\x0a';
|
if (c == '\x0d') tfp->skip_terminator = '\x0a';
|
||||||
} else tfp->skip_terminator = 'X';
|
} else tfp->skip_terminator = 'X';
|
||||||
|
@ -196,35 +196,35 @@ void TextFiles::read_line(OUTPUT_STREAM, int escape_oddities, text_file_position
|
||||||
@ The routine being iterated can indicate that it has had enough by
|
@ The routine being iterated can indicate that it has had enough by
|
||||||
calling the following:
|
calling the following:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void TextFiles::lose_interest(text_file_position *tfp) {
|
void TextFiles::lose_interest(text_file_position *tfp) {
|
||||||
tfp->actively_scanning = FALSE;
|
tfp->actively_scanning = FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Reading UTF-8 files.
|
@ \section{Reading UTF-8 files.}
|
||||||
The following routine reads a sequence of Unicode characters from a UTF-8
|
The following routine reads a sequence of Unicode characters from a UTF-8
|
||||||
encoded file, but returns them as a sequence of ISO Latin-1 characters, a
|
encoded file, but returns them as a sequence of ISO Latin-1 characters, a
|
||||||
trick it can only pull off by escaping non-ISO characters. This is done by
|
trick it can only pull off by escaping non-ISO characters. This is done by
|
||||||
taking character number |N| and feeding it out, one character at a time, as
|
taking character number [[N]] and feeding it out, one character at a time, as
|
||||||
the text |[unicode N]|, writing the number in decimal. Only one UTF-8
|
the text [[[unicode N]]], writing the number in decimal. Only one UTF-8
|
||||||
file like this will be being read at a time, and the routine will be
|
file like this will be being read at a time, and the routine will be
|
||||||
repeatedly called until |EOF| or a line division.
|
repeatedly called until [[EOF]] or a line division.
|
||||||
|
|
||||||
Strictly speaking, we transmit not as ISO Latin-1 but as that subset of ISO
|
Strictly speaking, we transmit not as ISO Latin-1 but as that subset of ISO
|
||||||
which have corresponding (different) codes in the ZSCII character set. This
|
which have corresponding (different) codes in the ZSCII character set. This
|
||||||
excludes some typewriter symbols and a handful of letterforms, as we shall
|
excludes some typewriter symbols and a handful of letterforms, as we shall
|
||||||
see.
|
see.
|
||||||
|
|
||||||
There are two exceptions: |TextFiles::utf8_fgetc| can also return the usual C
|
There are two exceptions: [[TextFiles::utf8_fgetc]] can also return the usual C
|
||||||
end-of-file pseudo-character |EOF|, and it can also return the Unicode BOM
|
end-of-file pseudo-character [[EOF]], and it can also return the Unicode BOM
|
||||||
(byte-ordering marker) pseudo-character, which is legal at the start of a
|
(byte-ordering marker) pseudo-character, which is legal at the start of a
|
||||||
file and which is automatically prepended by some text editors and
|
file and which is automatically prepended by some text editors and
|
||||||
word-processors when they save a UTF-8 file (though in fact it is not
|
word-processors when they save a UTF-8 file (though in fact it is not
|
||||||
required by the UTF-8 specification). Anyone calling |TextFiles::utf8_fgetc| must
|
required by the UTF-8 specification). Anyone calling [[TextFiles::utf8_fgetc]] must
|
||||||
check the return value for |EOF| every time, and for |0xFEFF| every time we
|
check the return value for [[EOF]] every time, and for [[0xFEFF]] every time we
|
||||||
might be at the start of the file being read.
|
might be at the start of the file being read.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct unicode_file_buffer {
|
typedef struct unicode_file_buffer {
|
||||||
char unicode_feed_buffer[32]; /* holds a single escape such as "[unicode 3106]" */
|
char unicode_feed_buffer[32]; /* holds a single escape such as "[unicode 3106]" */
|
||||||
int ufb_counter; /* position in the unicode feed buffer */
|
int ufb_counter; /* position in the unicode feed buffer */
|
||||||
|
@ -247,9 +247,9 @@ int TextFiles::utf8_fgetc(FILE *from, const char **or_from, int escape_oddities,
|
||||||
if (c == EOF) return c; /* ruling out EOF leaves a genuine byte from the file */
|
if (c == EOF) return c; /* ruling out EOF leaves a genuine byte from the file */
|
||||||
if (c<0x80) return c; /* in all other cases, a UTF-8 continuation sequence begins */
|
if (c<0x80) return c; /* in all other cases, a UTF-8 continuation sequence begins */
|
||||||
|
|
||||||
@<Unpack one to five continuation bytes to obtain the Unicode character code@>;
|
<<Unpack one to five continuation bytes to obtain the Unicode character code>>;
|
||||||
@<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals@>;
|
<<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals>>;
|
||||||
if (escape_oddities) @<Return Unicode fancy equivalents as simpler literals@>;
|
if (escape_oddities) <<Return Unicode fancy equivalents as simpler literals>>;
|
||||||
|
|
||||||
if (c == 0xFEFF) return c; /* the Unicode BOM non-character */
|
if (c == 0xFEFF) return c; /* the Unicode BOM non-character */
|
||||||
|
|
||||||
|
@ -268,7 +268,7 @@ fatal error (which is pretty well the only alternative here). The user
|
||||||
is likely to see problem messages later on which arise from the question
|
is likely to see problem messages later on which arise from the question
|
||||||
marks, and that will have to do.
|
marks, and that will have to do.
|
||||||
|
|
||||||
@<Unpack one to five continuation bytes to obtain the Unicode character code@> =
|
<<Unpack one to five continuation bytes to obtain the Unicode character code>>=
|
||||||
if (c<0xC0) return '?'; /* malformed UTF-8 */
|
if (c<0xC0) return '?'; /* malformed UTF-8 */
|
||||||
if (c<0xE0) { c = c & 0x1f; conts = 1; }
|
if (c<0xE0) { c = c & 0x1f; conts = 1; }
|
||||||
else if (c<0xF0) { c = c & 0xf; conts = 2; }
|
else if (c<0xF0) { c = c & 0xf; conts = 2; }
|
||||||
|
@ -293,12 +293,12 @@ fiction offerings. Had they been collaborating with J. R. R. Tolkien
|
||||||
rather than Douglas Adams, they might have filled this gap. As it was,
|
rather than Douglas Adams, they might have filled this gap. As it was,
|
||||||
"eth" never occurred in any of their works.)
|
"eth" never occurred in any of their works.)
|
||||||
|
|
||||||
We let the multiplication sign |0xd7| through even though ZSCII doesn't
|
We let the multiplication sign [[0xd7]] through even though ZSCII doesn't
|
||||||
support it, but convert it to an "x": this is so that we can parse numbers
|
support it, but convert it to an "x": this is so that we can parse numbers
|
||||||
in scientific notation.
|
in scientific notation.
|
||||||
|
|
||||||
@<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals@> =
|
<<Return non-ASCII codes in the intersection of ISO Latin-1 and ZSCII as literals>>=
|
||||||
if ((c == 0xa1) || (c == 0xa3) || (c == 0xbf)) return c; /* pound sign, inverted ! and ? */
|
if ((c == 0xa1) [[| (c == 0xa3) |]] (c == 0xbf)) return c; /* pound sign, inverted ! and ? */
|
||||||
if (c == 0xd7) return 'x'; /* convert multiplication sign to lower case "x" */
|
if (c == 0xd7) return 'x'; /* convert multiplication sign to lower case "x" */
|
||||||
if ((c >= 0xc0) && (c <= 0xff)) { /* accented West European letters, but... */
|
if ((c >= 0xc0) && (c <= 0xff)) { /* accented West European letters, but... */
|
||||||
if ((c != 0xd0) && (c != 0xf0) && /* not Icelandic eths */
|
if ((c != 0xd0) && (c != 0xf0) && /* not Icelandic eths */
|
||||||
|
@ -312,7 +312,7 @@ where we would normally expect hyphens and ordinary spaces: this is intended
|
||||||
for the benefit of users with helpful word-processors which autocorrect
|
for the benefit of users with helpful word-processors which autocorrect
|
||||||
hyphens into em-rules when they are flanked by spaces, and so on.
|
hyphens into em-rules when they are flanked by spaces, and so on.
|
||||||
|
|
||||||
@<Return Unicode fancy equivalents as simpler literals@> =
|
<<Return Unicode fancy equivalents as simpler literals>>=
|
||||||
if (c == 0x85) return '\x0d'; /* NEL, or "next line" */
|
if (c == 0x85) return '\x0d'; /* NEL, or "next line" */
|
||||||
if (c == 0xa0) return ' '; /* non-breaking space */
|
if (c == 0xa0) return ' '; /* non-breaking space */
|
||||||
if ((c >= 0x2000) && (c <= 0x200a)) return ' '; /* space variants */
|
if ((c >= 0x2000) && (c <= 0x200a)) return ' '; /* space variants */
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
To examine heads and tails of text, to see how it may inflect.
|
To examine heads and tails of text, to see how it may inflect.
|
||||||
|
|
||||||
@h Tries.
|
@ \section{Tries.}
|
||||||
The standard data structure for searches through possible prefixes or
|
The standard data structure for searches through possible prefixes or
|
||||||
suffixes is a "trie". The term goes back to Edward Fredkin in 1961;
|
suffixes is a "trie". The term goes back to Edward Fredkin in 1961;
|
||||||
some pronounce it "try" and some "tree", and either would be a fair
|
some pronounce it "try" and some "tree", and either would be a fair
|
||||||
|
@ -17,28 +17,29 @@ front of a text, whereas an end head represents matching from the back.
|
||||||
|
|
||||||
(b) "Choices". A choice node has a given match character, say an "f", and
|
(b) "Choices". A choice node has a given match character, say an "f", and
|
||||||
represents which node to go to next if this is the current character in the
|
represents which node to go to next if this is the current character in the
|
||||||
text. It must either be a valid Unicode character or |TRIE_ANYTHING|, which
|
text. It must either be a valid Unicode character or [[TRIE_ANYTHING]], which
|
||||||
is a wildcard representing "any text of any length here". Since a choice
|
is a wildcard representing "any text of any length here". Since a choice
|
||||||
must always lead somewhere, |on_success| must point to another node.
|
must always lead somewhere, [[on_success]] must point to another node.
|
||||||
There can be any number of choices at a given position, so choice nodes
|
There can be any number of choices at a given position, so choice nodes
|
||||||
are always organised in linked lists joined by |next|.
|
are always organised in linked lists joined by [[next]].
|
||||||
|
|
||||||
(c) "Terminals", always leaves, which have match character set to the
|
(c) "Terminals", always leaves, which have match character set to the
|
||||||
impossible value |TRIE_STOP|, and for which |match_outcome| is non-null; thus,
|
impossible value [[TRIE_STOP]], and for which [[match_outcome]] is non-null; thus,
|
||||||
different terminal nodes can result in different outcomes if they are ever
|
different terminal nodes can result in different outcomes if they are ever
|
||||||
reached at the end of a successful scan. A terminal node is always the only item
|
reached at the end of a successful scan. A terminal node is always the only item
|
||||||
in a list.
|
in a list.
|
||||||
|
|
||||||
@d TRIE_START -1 /* head: the root of a trie parsing forwards from the start */
|
<<*>>=
|
||||||
@d TRIE_END -2 /* head: the root of a trie parsing backwards from the end */
|
#define TRIE_START -1 /* head: the root of a trie parsing forwards from the start */
|
||||||
@d TRIE_ANYTHING 10003 /* choice: match any text here */
|
#define TRIE_END -2 /* head: the root of a trie parsing backwards from the end */
|
||||||
@d TRIE_ANY_GROUP 10001 /* choice: match any character from this group */
|
#define TRIE_ANYTHING 10003 /* choice: match any text here */
|
||||||
@d TRIE_NOT_GROUP 10002 /* choice: match any character not in this group */
|
#define TRIE_ANY_GROUP 10001 /* choice: match any character from this group */
|
||||||
@d TRIE_STOP -3 /* terminal: here's the outcome */
|
#define TRIE_NOT_GROUP 10002 /* choice: match any character not in this group */
|
||||||
|
#define TRIE_STOP -3 /* terminal: here's the outcome */
|
||||||
|
|
||||||
@d MAX_TRIE_GROUP_SIZE 26 /* size of the allowable groups of characters */
|
#define MAX_TRIE_GROUP_SIZE 26 /* size of the allowable groups of characters */
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct match_trie {
|
typedef struct match_trie {
|
||||||
int match_character; /* or one of the special cases above */
|
int match_character; /* or one of the special cases above */
|
||||||
wchar_t group_characters[MAX_TRIE_GROUP_SIZE+1];
|
wchar_t group_characters[MAX_TRIE_GROUP_SIZE+1];
|
||||||
|
@ -48,40 +49,41 @@ typedef struct match_trie {
|
||||||
} match_trie;
|
} match_trie;
|
||||||
|
|
||||||
@ We have just one routine for extending and scanning the trie: it either
|
@ We have just one routine for extending and scanning the trie: it either
|
||||||
tries to find whether a text |p| leads to any outcome in the existing trie,
|
tries to find whether a text [[p]] leads to any outcome in the existing trie,
|
||||||
or else forcibly extends the existing trie to ensure that it does.
|
or else forcibly extends the existing trie to ensure that it does.
|
||||||
|
|
||||||
It might look as if calling |Tries::search| always returns |add_outcome| when
|
It might look as if calling [[Tries::search]] always returns [[add_outcome]] when
|
||||||
this is set, but this isn't true: if the trie already contains a node
|
this is set, but this isn't true: if the trie already contains a node
|
||||||
representing how to deal with |p|, we get whatever outcome is already
|
representing how to deal with [[p]], we get whatever outcome is already
|
||||||
established.
|
established.
|
||||||
|
|
||||||
There are two motions to keep track of: our progress through the text |p|
|
There are two motions to keep track of: our progress through the text [[p]]
|
||||||
being scanned, and our progress through the trie which tells us how to scan it.
|
being scanned, and our progress through the trie which tells us how to scan it.
|
||||||
|
|
||||||
We scan the text either forwards or backwards, starting with the first or
|
We scan the text either forwards or backwards, starting with the first or
|
||||||
last character and then working through, finishing with a 0 terminator.
|
last character and then working through, finishing with a 0 terminator.
|
||||||
(This is true even if working backwards: we pretend the character stored
|
(This is true even if working backwards: we pretend the character stored
|
||||||
before the text began is 0.) |i| represents the index of our current position
|
before the text began is 0.) [[i]] represents the index of our current position
|
||||||
in |p|, and runs either from 0 up to |N| or from |N-1| down to |-1|,
|
in [[p]], and runs either from 0 up to [[N]] or from [[N-1]] down to [[-1]],
|
||||||
where |N| is the number of characters in |p|.
|
where [[N]] is the number of characters in [[p]].
|
||||||
|
|
||||||
We scan the trie using a pair of pointers. |prev| is the last node we
|
We scan the trie using a pair of pointers. [[prev]] is the last node we
|
||||||
successfully left, and |pos| is one we are currently at, which can be
|
successfully left, and [[pos]] is one we are currently at, which can be
|
||||||
either a terminal node or a choice node (in which case it's the head of
|
either a terminal node or a choice node (in which case it's the head of
|
||||||
a linked list of such nodes).
|
a linked list of such nodes).
|
||||||
|
|
||||||
@d MAX_TRIE_REWIND 10 /* that should be far, far more rewinding than necessary */
|
<<*>>=
|
||||||
|
#define MAX_TRIE_REWIND 10 /* that should be far, far more rewinding than necessary */
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
|
wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
|
||||||
if (T == NULL) internal_error("no trie to search");
|
if (T == NULL) internal_error("no trie to search");
|
||||||
|
|
||||||
int start, endpoint, delta;
|
int start, endpoint, delta;
|
||||||
@<Look at the root node of the trie, setting up the scan accordingly@>;
|
<<Look at the root node of the trie, setting up the scan accordingly>>;
|
||||||
|
|
||||||
match_trie *prev = NULL, *pos = T;
|
match_trie *prev = NULL, *pos = T;
|
||||||
@<Accept the current node of the trie@>;
|
<<Accept the current node of the trie>>;
|
||||||
|
|
||||||
int rewind_sp = 0;
|
int rewind_sp = 0;
|
||||||
int rewind_points[MAX_TRIE_REWIND];
|
int rewind_points[MAX_TRIE_REWIND];
|
||||||
|
@ -113,7 +115,7 @@ wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
|
||||||
if (c == '*') endpoint -= delta;
|
if (c == '*') endpoint -= delta;
|
||||||
|
|
||||||
RewindHere:
|
RewindHere:
|
||||||
@<Look through the possible exits from this position and move on if any match@>;
|
<<Look through the possible exits from this position and move on if any match>>;
|
||||||
if (add_outcome == NULL) {
|
if (add_outcome == NULL) {
|
||||||
if (rewind_sp > 0) {
|
if (rewind_sp > 0) {
|
||||||
i = rewind_points[rewind_sp-1];
|
i = rewind_points[rewind_sp-1];
|
||||||
|
@ -124,19 +126,19 @@ wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
|
||||||
}
|
}
|
||||||
return NULL; /* failure! */
|
return NULL; /* failure! */
|
||||||
}
|
}
|
||||||
@<We have run out of trie and must create a new exit to continue@>;
|
<<We have run out of trie and must create a new exit to continue>>;
|
||||||
}
|
}
|
||||||
if ((pos) && (pos->match_character == TRIE_ANYTHING)) @<Accept the current node of the trie@>;
|
if ((pos) && (pos->match_character == TRIE_ANYTHING)) <<Accept the current node of the trie>>;
|
||||||
if ((pos) && (pos->match_outcome)) return pos->match_outcome; /* success! */
|
if ((pos) && (pos->match_outcome)) return pos->match_outcome; /* success! */
|
||||||
if (add_outcome == NULL) return NULL; /* failure! */
|
if (add_outcome == NULL) return NULL; /* failure! */
|
||||||
|
|
||||||
if (pos == NULL)
|
if (pos == NULL)
|
||||||
@<We failed by running out of trie, so we must add a terminal node to make this string acceptable@>
|
<<We failed by running out of trie, so we must add a terminal node to make this string acceptable>>
|
||||||
else
|
else
|
||||||
@<We failed by finishing at a non-terminal node, so we must add an outcome@>;
|
<<We failed by finishing at a non-terminal node, so we must add an outcome>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<Look at the root node of the trie, setting up the scan accordingly@> =
|
<<Look at the root node of the trie, setting up the scan accordingly>>=
|
||||||
start = 0; endpoint = Str::len(p); delta = 1;
|
start = 0; endpoint = Str::len(p); delta = 1;
|
||||||
if (T->match_character == TRIE_END) { start = Str::len(p)-1; endpoint = -1; delta = -1; }
|
if (T->match_character == TRIE_END) { start = Str::len(p)-1; endpoint = -1; delta = -1; }
|
||||||
|
|
||||||
|
@ -146,10 +148,10 @@ this tends to make commonly used exits migrate upwards and rarities downwards.
|
||||||
But we aren't going to search these tries anything like intensively enough
|
But we aren't going to search these tries anything like intensively enough
|
||||||
to make it worth the trouble.
|
to make it worth the trouble.
|
||||||
|
|
||||||
(The following cannot be a |while| loop since C does not allow us to |break|
|
(The following cannot be a [[while]] loop since C does not allow us to [[break]]
|
||||||
or |continue| out of an outer loop from an inner one.)
|
or [[continue]] out of an outer loop from an inner one.)
|
||||||
|
|
||||||
@<Look through the possible exits from this position and move on if any match@> =
|
<<Look through the possible exits from this position and move on if any match>>=
|
||||||
int ambig = 0, unambig = 0;
|
int ambig = 0, unambig = 0;
|
||||||
match_trie *point;
|
match_trie *point;
|
||||||
for (point = pos; point; point = point->next)
|
for (point = pos; point; point = point->next)
|
||||||
|
@ -158,7 +160,7 @@ or |continue| out of an outer loop from an inner one.)
|
||||||
|
|
||||||
FauxWhileLoop:
|
FauxWhileLoop:
|
||||||
if (pos) {
|
if (pos) {
|
||||||
if ((add_outcome == NULL) || (Tries::is_ambiguous(pos) == FALSE))
|
if ((add_outcome == NULL) [[]] (Tries::is_ambiguous(pos) == FALSE))
|
||||||
if (Tries::matches(pos, c)) {
|
if (Tries::matches(pos, c)) {
|
||||||
if (pos->match_character == TRIE_ANYTHING) break;
|
if (pos->match_character == TRIE_ANYTHING) break;
|
||||||
if ((add_outcome == NULL) && (ambig > 0) && (ambig+unambig > 1)
|
if ((add_outcome == NULL) && (ambig > 0) && (ambig+unambig > 1)
|
||||||
|
@ -168,14 +170,14 @@ or |continue| out of an outer loop from an inner one.)
|
||||||
rewind_prev_positions[rewind_sp] = prev;
|
rewind_prev_positions[rewind_sp] = prev;
|
||||||
rewind_sp++;
|
rewind_sp++;
|
||||||
}
|
}
|
||||||
@<Accept the current node of the trie@>;
|
<<Accept the current node of the trie>>;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
pos = pos->next;
|
pos = pos->next;
|
||||||
goto FauxWhileLoop;
|
goto FauxWhileLoop;
|
||||||
}
|
}
|
||||||
|
|
||||||
@<We have run out of trie and must create a new exit to continue@> =
|
<<We have run out of trie and must create a new exit to continue>>=
|
||||||
match_trie *new_pos = NULL;
|
match_trie *new_pos = NULL;
|
||||||
if (g > 0) {
|
if (g > 0) {
|
||||||
int nt = TRIE_ANY_GROUP;
|
int nt = TRIE_ANY_GROUP;
|
||||||
|
@ -211,29 +213,29 @@ or |continue| out of an outer loop from an inner one.)
|
||||||
}
|
}
|
||||||
|
|
||||||
pos = new_pos;
|
pos = new_pos;
|
||||||
@<Accept the current node of the trie@>; continue;
|
<<Accept the current node of the trie>>; continue;
|
||||||
|
|
||||||
@<Accept the current node of the trie@> =
|
<<Accept the current node of the trie>>=
|
||||||
if (pos == NULL) internal_error("trie invariant broken");
|
if (pos == NULL) internal_error("trie invariant broken");
|
||||||
prev = pos; pos = prev->on_success;
|
prev = pos; pos = prev->on_success;
|
||||||
|
|
||||||
@ If |pos| is |NULL| then it follows that |prev->on_success| is |NULL|, since
|
@ If [[pos]] is [[NULL]] then it follows that [[prev->on_success]] is [[NULL]], since
|
||||||
this is how |pos| was calculated; so to add a new terminal node we simply add
|
this is how [[pos]] was calculated; so to add a new terminal node we simply add
|
||||||
it there.
|
it there.
|
||||||
|
|
||||||
@<We failed by running out of trie, so we must add a terminal node to make this string acceptable@> =
|
<<We failed by running out of trie, so we must add a terminal node to make this string acceptable>>=
|
||||||
prev->on_success = Tries::new(TRIE_STOP);
|
prev->on_success = Tries::new(TRIE_STOP);
|
||||||
prev->on_success->match_outcome = add_outcome;
|
prev->on_success->match_outcome = add_outcome;
|
||||||
return add_outcome;
|
return add_outcome;
|
||||||
|
|
||||||
@<We failed by finishing at a non-terminal node, so we must add an outcome@> =
|
<<We failed by finishing at a non-terminal node, so we must add an outcome>>=
|
||||||
prev->on_success = Tries::new(TRIE_STOP);
|
prev->on_success = Tries::new(TRIE_STOP);
|
||||||
prev->on_success->match_outcome = add_outcome;
|
prev->on_success->match_outcome = add_outcome;
|
||||||
return add_outcome;
|
return add_outcome;
|
||||||
|
|
||||||
@ Single nodes are matched thus:
|
@ Single nodes are matched thus:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Tries::matches(match_trie *pos, int c) {
|
int Tries::matches(match_trie *pos, int c) {
|
||||||
if (pos->match_character == TRIE_ANYTHING) return TRUE;
|
if (pos->match_character == TRIE_ANYTHING) return TRUE;
|
||||||
if (pos->match_character == TRIE_ANY_GROUP) {
|
if (pos->match_character == TRIE_ANY_GROUP) {
|
||||||
|
@ -263,7 +265,7 @@ int Tries::is_ambiguous(match_trie *pos) {
|
||||||
|
|
||||||
@ Where:
|
@ Where:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
match_trie *Tries::new(int mc) {
|
match_trie *Tries::new(int mc) {
|
||||||
match_trie *T = CREATE(match_trie);
|
match_trie *T = CREATE(match_trie);
|
||||||
T->match_character = mc;
|
T->match_character = mc;
|
||||||
|
@ -273,14 +275,14 @@ match_trie *Tries::new(int mc) {
|
||||||
return T;
|
return T;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Avinues.
|
@ \section{Avinues.}
|
||||||
A trie is only a limited form of finite state machine. We're not going to need
|
A trie is only a limited form of finite state machine. We're not going to need
|
||||||
the whole power of these, but we do find it useful to chain a series of tries
|
the whole power of these, but we do find it useful to chain a series of tries
|
||||||
together. The idea is to scan against one trie, then, if there's no result,
|
together. The idea is to scan against one trie, then, if there's no result,
|
||||||
start again with the next, and so on. Inform therefore often matches text
|
start again with the next, and so on. Inform therefore often matches text
|
||||||
against a linked list of tries: we'll call that an "avinue".
|
against a linked list of tries: we'll call that an "avinue".
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
typedef struct match_avinue {
|
typedef struct match_avinue {
|
||||||
struct match_trie *the_trie;
|
struct match_trie *the_trie;
|
||||||
struct match_avinue *next;
|
struct match_avinue *next;
|
||||||
|
@ -289,7 +291,7 @@ typedef struct match_avinue {
|
||||||
@ An avinue starts out with a single trie, which itself has just a single
|
@ An avinue starts out with a single trie, which itself has just a single
|
||||||
head node (of either sort).
|
head node (of either sort).
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
match_avinue *Tries::new_avinue(int from_start) {
|
match_avinue *Tries::new_avinue(int from_start) {
|
||||||
match_avinue *A = CREATE(match_avinue);
|
match_avinue *A = CREATE(match_avinue);
|
||||||
A->next = NULL;
|
A->next = NULL;
|
||||||
|
@ -298,14 +300,14 @@ match_avinue *Tries::new_avinue(int from_start) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Tries::add_to_avinue(match_avinue *mt, text_stream *from, wchar_t *to) {
|
void Tries::add_to_avinue(match_avinue *mt, text_stream *from, wchar_t *to) {
|
||||||
if ((mt == NULL) || (mt->the_trie == NULL)) internal_error("null trie");
|
if ((mt == NULL) [[]] (mt->the_trie == NULL)) internal_error("null trie");
|
||||||
Tries::search(mt->the_trie, from, to);
|
Tries::search(mt->the_trie, from, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ The following duplicates an avinue, pointing to the same sequence of
|
@ The following duplicates an avinue, pointing to the same sequence of
|
||||||
tries.
|
tries.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
match_avinue *Tries::duplicate_avinue(match_avinue *A) {
|
match_avinue *Tries::duplicate_avinue(match_avinue *A) {
|
||||||
match_avinue *F = NULL, *FL = NULL;
|
match_avinue *F = NULL, *FL = NULL;
|
||||||
while (A) {
|
while (A) {
|
||||||
|
@ -323,7 +325,7 @@ match_avinue *Tries::duplicate_avinue(match_avinue *A) {
|
||||||
@ As noted above, searching an avinue is a matter of searching with each
|
@ As noted above, searching an avinue is a matter of searching with each
|
||||||
trie in turn until one matches (if it does).
|
trie in turn until one matches (if it does).
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
|
wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
|
||||||
wchar_t *result = NULL;
|
wchar_t *result = NULL;
|
||||||
while ((T) && (result == NULL)) {
|
while ((T) && (result == NULL)) {
|
||||||
|
@ -333,9 +335,9 @@ wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@h Logging.
|
@ \section{Logging.}
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
void Tries::log_avinue(OUTPUT_STREAM, void *vA) {
|
void Tries::log_avinue(OUTPUT_STREAM, void *vA) {
|
||||||
match_avinue *A = (match_avinue *) vA;
|
match_avinue *A = (match_avinue *) vA;
|
||||||
WRITE("Avinue:\n"); INDENT;
|
WRITE("Avinue:\n"); INDENT;
|
|
@ -2,27 +2,27 @@
|
||||||
|
|
||||||
A minimal library for handling wide C strings.
|
A minimal library for handling wide C strings.
|
||||||
|
|
||||||
@ By "wide string", we mean an array of |wchar_t|. A pointer to this type
|
@ By "wide string", we mean an array of [[wchar_t]]. A pointer to this type
|
||||||
is what is returned by an L-literal in ANSI C, such as |L"look, I'm wide"|.
|
is what is returned by an L-literal in ANSI C, such as [[L"look, I'm wide"]].
|
||||||
A wide string is essentially a C string but with characters stored in full
|
A wide string is essentially a C string but with characters stored in full
|
||||||
words instead of bytes. The character values should be Unicode code points.
|
words instead of bytes. The character values should be Unicode code points.
|
||||||
|
|
||||||
We will do as little as possible with wide strings, and the following
|
We will do as little as possible with wide strings, and the following
|
||||||
wrappers simply abstract the standard C library's handling.
|
wrappers simply abstract the standard C library's handling.
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Wide::len(wchar_t *p) {
|
int Wide::len(wchar_t *p) {
|
||||||
return (int) wcslen(p);
|
return (int) wcslen(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ On the rare occasions when we need to sort alphabetically we'll also call:
|
@ On the rare occasions when we need to sort alphabetically we'll also call:
|
||||||
|
|
||||||
=
|
<<*>>=
|
||||||
int Wide::cmp(wchar_t *A, wchar_t *B) {
|
int Wide::cmp(wchar_t *A, wchar_t *B) {
|
||||||
return wcscmp(A, B);
|
return wcscmp(A, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ =
|
<<*>>=
|
||||||
int Wide::atoi(wchar_t *p) {
|
int Wide::atoi(wchar_t *p) {
|
||||||
return (int) wcstol(p, NULL, 10);
|
return (int) wcstol(p, NULL, 10);
|
||||||
}
|
}
|
Loading…
Reference in a new issue