2019-02-04 22:26:45 +00:00
|
|
|
[Tries::] Tries and Avinues.
|
|
|
|
|
|
|
|
To examine heads and tails of text, to see how it may inflect.
|
|
|
|
|
|
|
|
@h Tries.
|
|
|
|
The standard data structure for searches through possible prefixes or
|
|
|
|
suffixes is a "trie". The term goes back to Edward Fredkin in 1961;
|
|
|
|
some pronounce it "try" and some "tree", and either would be a fair
|
|
|
|
description. Like hash tables, tries are a means of minimising string
|
|
|
|
comparisons when sorting through possible outcomes based on a text.
|
|
|
|
|
|
|
|
The trie is a tree with three kinds of node:
|
|
|
|
|
|
|
|
(a) "Heads". Every trie has exactly one such node, and it's always the root.
|
|
|
|
There are two versions of this: a start head represents matching from the
|
|
|
|
front of a text, whereas an end head represents matching from the back.
|
|
|
|
|
|
|
|
(b) "Choices". A choice node has a given match character, say an "f", and
|
|
|
|
represents which node to go to next if this is the current character in the
|
|
|
|
text. It must either be a valid Unicode character or |TRIE_ANYTHING|, which
|
|
|
|
is a wildcard representing "any text of any length here". Since a choice
|
|
|
|
must always lead somewhere, |on_success| must point to another node.
|
|
|
|
There can be any number of choices at a given position, so choice nodes
|
|
|
|
are always organised in linked lists joined by |next|.
|
|
|
|
|
|
|
|
(c) "Terminals", always leaves, which have match character set to the
|
|
|
|
impossible value |TRIE_STOP|, and for which |match_outcome| is non-null; thus,
|
|
|
|
different terminal nodes can result in different outcomes if they are ever
|
|
|
|
reached at the end of a successful scan. A terminal node is always the only item
|
|
|
|
in a list.
|
|
|
|
|
|
|
|
@d TRIE_START -1 /* head: the root of a trie parsing forwards from the start */
|
|
|
|
@d TRIE_END -2 /* head: the root of a trie parsing backwards from the end */
|
|
|
|
@d TRIE_ANYTHING 10003 /* choice: match any text here */
|
|
|
|
@d TRIE_ANY_GROUP 10001 /* choice: match any character from this group */
|
|
|
|
@d TRIE_NOT_GROUP 10002 /* choice: match any character not in this group */
|
|
|
|
@d TRIE_STOP -3 /* terminal: here's the outcome */
|
|
|
|
|
|
|
|
@d MAX_TRIE_GROUP_SIZE 26 /* size of the allowable groups of characters */
|
|
|
|
|
|
|
|
=
|
|
|
|
typedef struct match_trie {
|
|
|
|
int match_character; /* or one of the special cases above */
|
|
|
|
wchar_t group_characters[MAX_TRIE_GROUP_SIZE+1];
|
|
|
|
wchar_t *match_outcome;
|
|
|
|
struct match_trie *on_success;
|
|
|
|
struct match_trie *next;
|
|
|
|
} match_trie;
|
|
|
|
|
|
|
|
@ We have just one routine for extending and scanning the trie: it either
|
|
|
|
tries to find whether a text |p| leads to any outcome in the existing trie,
|
|
|
|
or else forcibly extends the existing trie to ensure that it does.
|
|
|
|
|
|
|
|
It might look as if calling |Tries::search| always returns |add_outcome| when
|
|
|
|
this is set, but this isn't true: if the trie already contains a node
|
|
|
|
representing how to deal with |p|, we get whatever outcome is already
|
|
|
|
established.
|
|
|
|
|
|
|
|
There are two motions to keep track of: our progress through the text |p|
|
|
|
|
being scanned, and our progress through the trie which tells us how to scan it.
|
|
|
|
|
|
|
|
We scan the text either forwards or backwards, starting with the first or
|
|
|
|
last character and then working through, finishing with a 0 terminator.
|
|
|
|
(This is true even if working backwards: we pretend the character stored
|
|
|
|
before the text began is 0.) |i| represents the index of our current position
|
|
|
|
in |p|, and runs either from 0 up to |N| or from |N-1| down to |-1|,
|
|
|
|
where |N| is the number of characters in |p|.
|
|
|
|
|
|
|
|
We scan the trie using a pair of pointers. |prev| is the last node we
|
|
|
|
successfully left, and |pos| is one we are currently at, which can be
|
|
|
|
either a terminal node or a choice node (in which case it's the head of
|
|
|
|
a linked list of such nodes).
|
|
|
|
|
|
|
|
@d MAX_TRIE_REWIND 10 /* that should be far, far more rewinding than necessary */
|
|
|
|
|
|
|
|
=
|
|
|
|
wchar_t *Tries::search(match_trie *T, text_stream *p, wchar_t *add_outcome) {
|
|
|
|
if (T == NULL) internal_error("no trie to search");
|
|
|
|
|
|
|
|
int start, endpoint, delta;
|
|
|
|
@<Look at the root node of the trie, setting up the scan accordingly@>;
|
|
|
|
|
|
|
|
match_trie *prev = NULL, *pos = T;
|
|
|
|
@<Accept the current node of the trie@>;
|
|
|
|
|
|
|
|
int rewind_sp = 0;
|
|
|
|
int rewind_points[MAX_TRIE_REWIND];
|
|
|
|
match_trie *rewind_positions[MAX_TRIE_REWIND];
|
|
|
|
match_trie *rewind_prev_positions[MAX_TRIE_REWIND];
|
|
|
|
|
|
|
|
for (int i = start; i != endpoint+delta; i += delta) {
|
|
|
|
wchar_t group[MAX_TRIE_GROUP_SIZE+1];
|
|
|
|
int g = 0; /* size of group */
|
2020-07-05 09:35:31 +00:00
|
|
|
wchar_t c = (i<0)?0:(Str::get_at(p, i)); /* i.e., zero at the two ends of the text */
|
2019-02-04 22:26:45 +00:00
|
|
|
if ((c >= 0x20) && (c <= 0x7f)) c = Characters::tolower(c); /* normalise it within ASCII */
|
|
|
|
if (c == 0x20) { c = 0; i = endpoint - delta; } /* force any space to be equivalent to the final 0 */
|
|
|
|
if (add_outcome) {
|
2020-07-05 09:35:31 +00:00
|
|
|
wchar_t pairc = 0;
|
2019-02-04 22:26:45 +00:00
|
|
|
if (c == '<') pairc = '>';
|
|
|
|
if (c == '>') pairc = '<';
|
|
|
|
if (pairc) {
|
|
|
|
int j;
|
|
|
|
for (j = i+delta; j != endpoint; j += delta) {
|
2020-07-05 09:35:31 +00:00
|
|
|
wchar_t ch = (j<0)?0:(Str::get_at(p, j));
|
2019-02-04 22:26:45 +00:00
|
|
|
if (ch == pairc) break;
|
|
|
|
if (g > MAX_TRIE_GROUP_SIZE) { g = 0; break; }
|
|
|
|
group[g++] = ch;
|
|
|
|
}
|
|
|
|
group[g] = 0;
|
|
|
|
if (g > 0) i = j;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (c == '*') endpoint -= delta;
|
|
|
|
|
|
|
|
RewindHere:
|
|
|
|
@<Look through the possible exits from this position and move on if any match@>;
|
|
|
|
if (add_outcome == NULL) {
|
|
|
|
if (rewind_sp > 0) {
|
|
|
|
i = rewind_points[rewind_sp-1];
|
|
|
|
pos = rewind_positions[rewind_sp-1];
|
|
|
|
prev = rewind_prev_positions[rewind_sp-1];
|
|
|
|
rewind_sp--;
|
|
|
|
goto RewindHere;
|
|
|
|
}
|
|
|
|
return NULL; /* failure! */
|
|
|
|
}
|
|
|
|
@<We have run out of trie and must create a new exit to continue@>;
|
|
|
|
}
|
|
|
|
if ((pos) && (pos->match_character == TRIE_ANYTHING)) @<Accept the current node of the trie@>;
|
|
|
|
if ((pos) && (pos->match_outcome)) return pos->match_outcome; /* success! */
|
|
|
|
if (add_outcome == NULL) return NULL; /* failure! */
|
|
|
|
|
|
|
|
if (pos == NULL)
|
|
|
|
@<We failed by running out of trie, so we must add a terminal node to make this string acceptable@>
|
|
|
|
else
|
|
|
|
@<We failed by finishing at a non-terminal node, so we must add an outcome@>;
|
|
|
|
}
|
|
|
|
|
|
|
|
@<Look at the root node of the trie, setting up the scan accordingly@> =
|
|
|
|
start = 0; endpoint = Str::len(p); delta = 1;
|
|
|
|
if (T->match_character == TRIE_END) { start = Str::len(p)-1; endpoint = -1; delta = -1; }
|
|
|
|
|
|
|
|
@ In general trie searches can be made more efficient if the trie is shuffled
|
|
|
|
so that the most recently matched exit in the list if moved to the top, as
|
|
|
|
this tends to make commonly used exits migrate upwards and rarities downwards.
|
|
|
|
But we aren't going to search these tries anything like intensively enough
|
|
|
|
to make it worth the trouble.
|
|
|
|
|
|
|
|
(The following cannot be a |while| loop since C does not allow us to |break|
|
|
|
|
or |continue| out of an outer loop from an inner one.)
|
|
|
|
|
|
|
|
@<Look through the possible exits from this position and move on if any match@> =
|
|
|
|
int ambig = 0, unambig = 0;
|
|
|
|
match_trie *point;
|
|
|
|
for (point = pos; point; point = point->next)
|
|
|
|
if (Tries::is_ambiguous(point)) ambig++;
|
|
|
|
else unambig++;
|
|
|
|
|
|
|
|
FauxWhileLoop:
|
|
|
|
if (pos) {
|
|
|
|
if ((add_outcome == NULL) || (Tries::is_ambiguous(pos) == FALSE))
|
|
|
|
if (Tries::matches(pos, c)) {
|
|
|
|
if (pos->match_character == TRIE_ANYTHING) break;
|
|
|
|
if ((add_outcome == NULL) && (ambig > 0) && (ambig+unambig > 1)
|
|
|
|
&& (rewind_sp < MAX_TRIE_REWIND)) {
|
|
|
|
rewind_points[rewind_sp] = i;
|
|
|
|
rewind_positions[rewind_sp] = pos->next;
|
|
|
|
rewind_prev_positions[rewind_sp] = prev;
|
|
|
|
rewind_sp++;
|
|
|
|
}
|
|
|
|
@<Accept the current node of the trie@>;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
pos = pos->next;
|
|
|
|
goto FauxWhileLoop;
|
|
|
|
}
|
|
|
|
|
|
|
|
@<We have run out of trie and must create a new exit to continue@> =
|
|
|
|
match_trie *new_pos = NULL;
|
|
|
|
if (g > 0) {
|
|
|
|
int nt = TRIE_ANY_GROUP;
|
|
|
|
wchar_t *from = group;
|
|
|
|
if (group[0] == '!') { from++; nt = TRIE_NOT_GROUP; }
|
|
|
|
if (group[(int) Wide::len(group)-1] == '!') {
|
|
|
|
group[(int) Wide::len(group)-1] = 0; nt = TRIE_NOT_GROUP;
|
|
|
|
}
|
|
|
|
new_pos = Tries::new(nt);
|
|
|
|
wcscpy(new_pos->group_characters, from);
|
|
|
|
} else if (c == '*') new_pos = Tries::new(TRIE_ANYTHING);
|
|
|
|
else new_pos = Tries::new(c);
|
|
|
|
|
|
|
|
if (prev->on_success == NULL) prev->on_success = new_pos;
|
|
|
|
else {
|
|
|
|
match_trie *ppoint = NULL, *point;
|
|
|
|
for (point = prev->on_success; point; ppoint = point, point = point->next) {
|
|
|
|
if (new_pos->match_character < point->match_character) {
|
|
|
|
if (ppoint == NULL) {
|
|
|
|
new_pos->next = prev->on_success;
|
|
|
|
prev->on_success = new_pos;
|
|
|
|
} else {
|
|
|
|
ppoint->next = new_pos;
|
|
|
|
new_pos->next = point;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (point->next == NULL) {
|
|
|
|
point->next = new_pos;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pos = new_pos;
|
|
|
|
@<Accept the current node of the trie@>; continue;
|
|
|
|
|
|
|
|
@<Accept the current node of the trie@> =
|
|
|
|
if (pos == NULL) internal_error("trie invariant broken");
|
|
|
|
prev = pos; pos = prev->on_success;
|
|
|
|
|
|
|
|
@ If |pos| is |NULL| then it follows that |prev->on_success| is |NULL|, since
|
|
|
|
this is how |pos| was calculated; so to add a new terminal node we simply add
|
|
|
|
it there.
|
|
|
|
|
|
|
|
@<We failed by running out of trie, so we must add a terminal node to make this string acceptable@> =
|
|
|
|
prev->on_success = Tries::new(TRIE_STOP);
|
|
|
|
prev->on_success->match_outcome = add_outcome;
|
|
|
|
return add_outcome;
|
|
|
|
|
|
|
|
@<We failed by finishing at a non-terminal node, so we must add an outcome@> =
|
|
|
|
prev->on_success = Tries::new(TRIE_STOP);
|
|
|
|
prev->on_success->match_outcome = add_outcome;
|
|
|
|
return add_outcome;
|
|
|
|
|
|
|
|
@ Single nodes are matched thus:
|
|
|
|
|
|
|
|
=
|
|
|
|
int Tries::matches(match_trie *pos, int c) {
|
|
|
|
if (pos->match_character == TRIE_ANYTHING) return TRUE;
|
|
|
|
if (pos->match_character == TRIE_ANY_GROUP) {
|
|
|
|
int k;
|
|
|
|
for (k = 0; pos->group_characters[k]; k++)
|
|
|
|
if (c == pos->group_characters[k])
|
|
|
|
return TRUE;
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
if (pos->match_character == TRIE_NOT_GROUP) {
|
|
|
|
int k;
|
|
|
|
for (k = 0; pos->group_characters[k]; k++)
|
|
|
|
if (c == pos->group_characters[k])
|
|
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if (pos->match_character == c) return TRUE;
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Tries::is_ambiguous(match_trie *pos) {
|
|
|
|
if (pos->match_character == TRIE_ANYTHING) return TRUE;
|
|
|
|
if (pos->match_character == TRIE_ANY_GROUP) return TRUE;
|
|
|
|
if (pos->match_character == TRIE_NOT_GROUP) return TRUE;
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
@ Where:
|
|
|
|
|
|
|
|
=
|
|
|
|
match_trie *Tries::new(int mc) {
|
|
|
|
match_trie *T = CREATE(match_trie);
|
|
|
|
T->match_character = mc;
|
|
|
|
T->match_outcome = NULL;
|
|
|
|
T->on_success = NULL;
|
|
|
|
T->next = NULL;
|
|
|
|
return T;
|
|
|
|
}
|
|
|
|
|
|
|
|
@h Avinues.
|
|
|
|
A trie is only a limited form of finite state machine. We're not going to need
|
|
|
|
the whole power of these, but we do find it useful to chain a series of tries
|
|
|
|
together. The idea is to scan against one trie, then, if there's no result,
|
|
|
|
start again with the next, and so on. Inform therefore often matches text
|
|
|
|
against a linked list of tries: we'll call that an "avinue".
|
|
|
|
|
|
|
|
=
|
|
|
|
typedef struct match_avinue {
|
|
|
|
struct match_trie *the_trie;
|
|
|
|
struct match_avinue *next;
|
|
|
|
} match_avinue;
|
|
|
|
|
|
|
|
@ An avinue starts out with a single trie, which itself has just a single
|
|
|
|
head node (of either sort).
|
|
|
|
|
|
|
|
=
|
|
|
|
match_avinue *Tries::new_avinue(int from_start) {
|
|
|
|
match_avinue *A = CREATE(match_avinue);
|
|
|
|
A->next = NULL;
|
|
|
|
A->the_trie = Tries::new(from_start);
|
|
|
|
return A;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Tries::add_to_avinue(match_avinue *mt, text_stream *from, wchar_t *to) {
|
|
|
|
if ((mt == NULL) || (mt->the_trie == NULL)) internal_error("null trie");
|
|
|
|
Tries::search(mt->the_trie, from, to);
|
|
|
|
}
|
|
|
|
|
|
|
|
@ The following duplicates an avinue, pointing to the same sequence of
|
|
|
|
tries.
|
|
|
|
|
|
|
|
=
|
|
|
|
match_avinue *Tries::duplicate_avinue(match_avinue *A) {
|
|
|
|
match_avinue *F = NULL, *FL = NULL;
|
|
|
|
while (A) {
|
|
|
|
match_avinue *FN = CREATE(match_avinue);
|
|
|
|
FN->next = NULL;
|
|
|
|
FN->the_trie = A->the_trie;
|
|
|
|
A = A->next;
|
|
|
|
if (FL) FL->next = FN;
|
|
|
|
if (F == NULL) F = FN;
|
|
|
|
FL = FN;
|
|
|
|
}
|
|
|
|
return F;
|
|
|
|
}
|
|
|
|
|
|
|
|
@ As noted above, searching an avinue is a matter of searching with each
|
|
|
|
trie in turn until one matches (if it does).
|
|
|
|
|
|
|
|
=
|
|
|
|
wchar_t *Tries::search_avinue(match_avinue *T, text_stream *p) {
|
|
|
|
wchar_t *result = NULL;
|
|
|
|
while ((T) && (result == NULL)) {
|
|
|
|
result = Tries::search(T->the_trie, p, NULL);
|
|
|
|
T = T->next;
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
@h Logging.
|
|
|
|
|
|
|
|
=
|
|
|
|
void Tries::log_avinue(OUTPUT_STREAM, void *vA) {
|
|
|
|
match_avinue *A = (match_avinue *) vA;
|
|
|
|
WRITE("Avinue:\n"); INDENT;
|
|
|
|
int n = 1;
|
|
|
|
while (A) {
|
|
|
|
WRITE("Trie %d:\n", n++); INDENT;
|
|
|
|
Tries::log(OUT, A->the_trie);
|
|
|
|
OUTDENT;
|
|
|
|
A = A->next;
|
|
|
|
}
|
|
|
|
OUTDENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Tries::log(OUTPUT_STREAM, match_trie *T) {
|
|
|
|
for (; T; T = T->next) {
|
|
|
|
switch (T->match_character) {
|
|
|
|
case TRIE_START: WRITE("Start"); break;
|
|
|
|
case TRIE_END: WRITE("End"); break;
|
|
|
|
case TRIE_ANYTHING: WRITE("Anything"); break;
|
|
|
|
case TRIE_ANY_GROUP: WRITE("Group <%w>", T->group_characters); break;
|
|
|
|
case TRIE_NOT_GROUP: WRITE("Negated group <%w>", T->group_characters); break;
|
|
|
|
case TRIE_STOP: WRITE("Stop"); break;
|
|
|
|
case 0: WRITE("00"); break;
|
|
|
|
default: WRITE("%c", T->match_character); break;
|
|
|
|
}
|
|
|
|
if (T->match_outcome) WRITE(" --> %s", T->match_outcome);
|
|
|
|
WRITE("\n");
|
|
|
|
if (T->on_success) {
|
|
|
|
INDENT; Tries::log(OUT, T->on_success); OUTDENT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|