A simple syntax-colouring engine.


§1. This is a very simple syntax colouring algorithm. The work is done by the function Painter::syntax_colour, which can in principle be applied to texts of any length. But it's usually convenient to run it on a long file one line at a time, so that it is called repeatedly. The variable colouring_state remembers where we were at the end of the previous line, so that we can pick up again later at the start of the next.

Because of that, we need to call the following before we begin a run of calls to Painter::syntax_colour:

    int painter_count = 1;
    /* To be called before a run of calls to Painter::syntax_colour: it restarts
       the painter count at 1 and puts the colouring state back to plain text.
       The language |pl| is accepted but not consulted here. */
    void Painter::reset_syntax_colouring(programming_language *pl) {
        painter_count = 1;
        colouring_state = PLAIN_COLOUR;
    }

§2. As we begin, the text to colour is in matter, while colouring is an equal-length text where each character represents the colour of its corresponding character in matter. For example, we might start as:

        int x = 55;
        ppppppppppp

with every character having PLAIN_COLOUR, but end up with:

        int x = 55;
        rrrpipppnnp

We get to that by using a language's rules on literals, and then executing its colouring program.

    /* Colour one stretch of text. |matter| is the text, and |colouring| receives
       one colour character for each character of |matter|. |HT| is passed through
       to the colouring program. If |with_comments| is set, the language is first
       asked (via LanguageMethods::parse_comment) whether the text contains a
       comment: if so, everything from the end of the part before the comment
       up to the end of |matter| is given COMMENT_COLOUR, and only the part
       before the comment is painted further. Always returns FALSE. */
    int Painter::syntax_colour(programming_language *pl,
        hash_table *HT, text_stream *matter, text_stream *colouring, int with_comments) {
        /* |from| and |to| are inclusive character positions */
        int from = 0, to = Str::len(matter) - 1;
        if (with_comments) {
            TEMPORARY_TEXT(part_before_comment);
            TEMPORARY_TEXT(part_within_comment);
            if (LanguageMethods::parse_comment(pl,
                matter, part_before_comment, part_within_comment)) {
                int comment_starts = Str::len(part_before_comment);
                int N = Str::len(matter);
                for (int i=comment_starts; i<N; i++)
                    Str::put_at(colouring, i, COMMENT_COLOUR);
                /* Since |to| is inclusive, the last position to paint is the one
                   just before the comment. Stopping at |comment_starts| itself
                   would repaint the first comment character and let it disturb
                   the string/character-literal state. An empty part before the
                   comment gives |to == -1|, i.e., an empty range. */
                from = 0; to = comment_starts - 1;
            }
            DISCARD_TEXT(part_before_comment);
            DISCARD_TEXT(part_within_comment);
        }
        Painter::syntax_colour_inner(pl, HT, matter, colouring, from, to);
        return FALSE;
    }

    /* Paint the inclusive range |from| to |to| of |matter| into |colouring|, in
       three passes which must happen in this order: first identifiers and quoted
       literals are marked, then numeric literals are picked out of what is still
       plain or identifier-coloured, and finally the language's own colouring
       program (if it has one) is run over the result. */
    void Painter::syntax_colour_inner(programming_language *pl,
        hash_table *HT, text_stream *matter, text_stream *colouring, int from, int to) {
        <Spot identifiers, literal text and character constants 2.1>;
        <Spot literal numerical constants 2.2>;
        <Now run the colouring program 2.4>;
    }

§2.1. <Spot identifiers, literal text and character constants 2.1> =

        /* Only the first character of each delimiter and escape text is used. */
        int squote = Str::get_first_char(pl->character_literal);
        int squote_escape = Str::get_first_char(pl->character_literal_escape);
        int dquote = Str::get_first_char(pl->string_literal);
        int dquote_escape = Str::get_first_char(pl->string_literal_escape);
        /* A small state machine driven by |colouring_state|, which persists
           between calls so that a literal can continue from one line to the next. */
        for (int i=from; i <= to; i++) {
            /* |skip|: colour to give the next character without examining it;
               |one_off|: colour for this character only, leaving the state alone;
               |will_be|: state to move to once this character has been coloured. */
            int skip = NOT_A_COLOUR, one_off = -1, will_be = -1;
            switch (colouring_state) {
                case PLAIN_COLOUR: {
                    wchar_t c = Str::get_at(matter, i);
                    /* An opening quote changes state at once, so the quote itself
                       is coloured as part of the literal. */
                    if (c == dquote) {
                        colouring_state = STRING_COLOUR;
                        break;
                    }
                    if (c == squote) {
                        colouring_state = CHAR_LITERAL_COLOUR;
                        break;
                    }
                    if (Painter::identifier_at(pl, matter, colouring, i))
                        one_off = IDENTIFIER_COLOUR;
                    break;
                }
                case CHAR_LITERAL_COLOUR: {
                    wchar_t c = Str::get_at(matter, i);
                    /* A closing quote is still coloured as literal; the state
                       reverts to plain only afterwards. */
                    if (c == squote) will_be = PLAIN_COLOUR;
                    if (c == squote_escape) skip = CHAR_LITERAL_COLOUR;
                    break;
                }
                case STRING_COLOUR: {
                    wchar_t c = Str::get_at(matter, i);
                    if (c == dquote) will_be = PLAIN_COLOUR;
                    if (c == dquote_escape) skip = STRING_COLOUR;
                    break;
                }
            }
            if (one_off >= 0) Str::put_at(colouring, i, (char) one_off);
            else Str::put_at(colouring, i, (char) colouring_state);
            if (will_be >= 0) colouring_state = (char) will_be;
            /* After an escape character, the following character (if there is one
               within the range) is coloured as literal and stepped over, so that
               an escaped quote does not end the literal. */
            if ((skip != NOT_A_COLOUR) && (i<to)) {
                i++; Str::put_at(colouring, i, skip);
            }
        }

This code is used in §2.

§2.2. <Spot literal numerical constants 2.2> =

        /* |base| is the number base of the literal currently being scanned, or -1
           if we are not inside one. |dec_possible| records whether a literal may
           begin at the current position: that is so at the start of the range,
           after a whitespace character, and after a base prefix. Only characters
           still coloured as plain or identifier are looked at. */
        int base = -1, dec_possible = TRUE;
        for (int i=from; i <= to; i++) {
            if ((Str::get_at(colouring, i) == PLAIN_COLOUR) ||
                (Str::get_at(colouring, i) == IDENTIFIER_COLOUR)) {
                wchar_t c = Str::get_at(matter, i);
                /* A base prefix is coloured in full at once; its remaining
                   characters are then passed over on later iterations because
                   they are no longer plain or identifier-coloured. */
                if (Str::includes_at(matter, i, pl->binary_literal_prefix)) {
                    base = 2;
                    for (int j=0; j<Str::len(pl->binary_literal_prefix); j++)
                        Str::put_at(colouring, i+j, (char) CONSTANT_COLOUR);
                    dec_possible = TRUE;
                    continue;
                } else if (Str::includes_at(matter, i, pl->octal_literal_prefix)) {
                    base = 8;
                    for (int j=0; j<Str::len(pl->octal_literal_prefix); j++)
                        Str::put_at(colouring, i+j, (char) CONSTANT_COLOUR);
                    dec_possible = TRUE;
                    continue;
                } else if (Str::includes_at(matter, i, pl->hexadecimal_literal_prefix)) {
                    base = 16;
                    for (int j=0; j<Str::len(pl->hexadecimal_literal_prefix); j++)
                        Str::put_at(colouring, i+j, (char) CONSTANT_COLOUR);
                    dec_possible = TRUE;
                    continue;
                }
                /* A negative prefix begins a decimal literal only when we are not
                   already inside a number (|base == -1|: the value 0 is never
                   used for |base|, so testing for it could never succeed) and
                   when a digit follows within the range, so that a free-standing
                   minus sign is not painted as a constant. */
                int neg_len = Str::len(pl->negative_literal_prefix);
                if ((base == -1) && (dec_possible) && (neg_len > 0) &&
                    (i + neg_len <= to) &&
                    (Str::includes_at(matter, i, pl->negative_literal_prefix)) &&
                    (Characters::isdigit(Str::get_at(matter, i + neg_len)))) {
                    base = 10;
                    for (int j=0; j<neg_len; j++)
                        Str::put_at(colouring, i+j, (char) CONSTANT_COLOUR);
                    continue;
                }
                int pass = FALSE;
                switch (base) {
                    case -1:
                        if ((dec_possible) && (Characters::isdigit(c))) {
                            base = 10; pass = TRUE;
                        }
                        break;
                    case 2: if ((c == '0') || (c == '1')) pass = TRUE; break;
                    /* Without this case no digit after an octal prefix was
                       ever accepted. */
                    case 8: if ((c >= '0') && (c <= '7')) pass = TRUE; break;
                    case 10: if (Characters::isdigit(c)) pass = TRUE; break;
                    case 16: {
                        int d = Characters::tolower(c);
                        if ((Characters::isdigit(c)) ||
                            ((d >= 'a') && (d <= 'f'))) pass = TRUE;
                        break;
                    }
                }
                if (pass) {
                    Str::put_at(colouring, i, (char) CONSTANT_COLOUR);
                } else {
                    /* The literal, if any, is over; a new one can only begin
                       after whitespace. */
                    if (Characters::is_whitespace(c)) dec_possible = TRUE;
                    else dec_possible = FALSE;
                    base = -1;
                }
            }
        }

This code is used in §2.

§2.3. For the moment, we always adopt the C rules on identifiers: they have to begin with an underscore or letter, then continue with underscores or alphanumeric characters, except that if the language allows it then they can contain a :: namespace divider.

    /* Decide whether the character at position |i| of |matter| should be coloured
       as part of an identifier, returning TRUE or FALSE. |colouring| must already
       hold the colour of position |i-1|, since that is how we tell whether an
       identifier is in progress.
       An identifier continues with an underscore, letter or digit, or with a
       colon if the language supports namespaces. It begins with an underscore
       or letter, but not immediately after a digit. */
    int Painter::identifier_at(programming_language *pl,
        text_stream *matter, text_stream *colouring, int i) {
        wchar_t c = Str::get_at(matter, i);
        int opener = FALSE;
        if ((c == '_') ||
            ((c >= 'A') && (c <= 'Z')) ||
            ((c >= 'a') && (c <= 'z'))) opener = TRUE;

        if ((i > 0) && (Str::get_at(colouring, i-1) == IDENTIFIER_COLOUR)) {
            /* Continuing an identifier which is already under way */
            if (opener) return TRUE;
            if ((c >= '0') && (c <= '9')) return TRUE;
            if ((c == ':') && (pl->supports_namespaces)) return TRUE;
            return FALSE;
        }

        /* Starting a new identifier: look at the previous character, not the
           current one. Reading position |i| here made |d| equal to |c|, so the
           test below could never reject anything that was not rejected anyway. */
        wchar_t d = 0;
        if (i > 0) d = Str::get_at(matter, i-1);
        if ((d >= '0') && (d <= '9')) return FALSE;
        return opener;
    }

§2.4. With those preliminaries out of the way, the language's colouring program takes over.

<Now run the colouring program 2.4> =

        /* Languages without a colouring program are left as painted so far. Each
           run is handed the current painter count, which is then advanced. */
        if (pl->program != NULL) {
            Painter::execute(HT, pl->program, matter, colouring, from, to, painter_count);
            painter_count++;
        }

This code is used in §2.

§3. The run-type for a block determines what the rules in it apply to: the whole snippet of text, or each character on its own, or each run of characters of a given sort. Note that we work width-first, as it were: we complete each rule across the whole snippet before moving on to the next.

    /* Run every rule of |block| over the inclusive range |from| to |to| of
       |matter|, altering |colouring| as the rules dictate. Each rule is completed
       across the whole range before the next rule is begun. |N| is used only for
       whole-line runs, where it is the count passed to the rule (0 is read as 1).
       It is an internal error for |block| to be NULL. */
    void Painter::execute(hash_table *HT, colouring_language_block *block, text_stream *matter,
        text_stream *colouring, int from, int to, int N) {
        if (block == NULL) internal_error("no block");
        /* Keep a copy of the colours as they stood on entry: runs of a given
           colour (the default case below) are found from this copy, not from
           |colouring| as changed by rules earlier in the block. */
        TEMPORARY_TEXT(colouring_at_start);
        Str::copy(colouring_at_start, colouring);
        colouring_rule *rule;
        LOOP_OVER_LINKED_LIST(rule, colouring_rule, block->rules) {
            switch (block->run) {
                /* The rule is applied once, to the whole range */
                case WHOLE_LINE_CRULE_RUN:
                    Painter::execute_rule(HT, rule, matter, colouring, from, to,
                        (N == 0)?1:N);
                    break;
                /* The rule is applied to each character, numbered from 1 */
                case CHARACTERS_CRULE_RUN:
                    for (int i=from; i<=to; i++)
                        Painter::execute_rule(HT, rule, matter, colouring, i, i, i-from+1);
                    break;
                /* The rule is applied to each character which occurs in the
                   block's character set, numbering only those characters */
                case CHARACTERS_IN_CRULE_RUN:
                    for (int count=1, i=from; i<=to; i++)
                        for (int j=0; j<Str::len(block->char_set); j++)
                            if (Str::get_at(matter, i) == Str::get_at(block->char_set, j) ) {
                                Painter::execute_rule(HT, rule, matter, colouring, i, i, count++);
                                break;
                            }
                    break;
                /* The rule is applied to each occurrence of a fixed text. Here L
                   is one less than its length, so an occurrence at i runs to i+L,
                   and adding L to i steps past it so that occurrences never
                   overlap. */
                case INSTANCES_CRULE_RUN: {
                    int L = Str::len(block->run_instance) - 1;
                    if (L >= 0)
                        for (int count=1, i=from; i<=to - L; i++)
                            if (Str::includes_at(matter, i, block->run_instance)) {
                                Painter::execute_rule(HT, rule, matter, colouring, i, i+L, count++);
                                i += L;
                            }
                    break;
                }
                /* The rule is applied to each match of a regular expression; a
                   positive return from Regexp::match_from is treated as the
                   number of characters matched starting at i. */
                case MATCHES_CRULE_RUN:
                    for (int count=1, i=from; i<=to; i++) {
                        int L = Regexp::match_from(&(block->mr), matter, block->match_regexp_text, i, TRUE);
                        if (L > 0) {
                            Painter::execute_rule(HT, rule, matter, colouring, i, i+L-1, count++);
                            i += L-1;
                        }
                    }
                    break;
                /* The regular expression is matched against the whole of |matter|
                   (not just from..to), and the rule is applied to each bracketed
                   subexpression which has a recorded position. */
                case BRACKETS_CRULE_RUN:
                    for (int i=0; i<MAX_BRACKETED_SUBEXPRESSIONS; i++)
                        if (block->mr.exp[i])
                            Str::clear(block->mr.exp[i]);
                    if (Regexp::match(&(block->mr), matter, block->match_regexp_text))
                        for (int count=1, i=0; i<MAX_BRACKETED_SUBEXPRESSIONS; i++)
                            if (block->mr.exp_at[i] >= 0)
                                Painter::execute_rule(HT, rule, matter, colouring,
                                    block->mr.exp_at[i],
                                    block->mr.exp_at[i] + Str::len(block->mr.exp[i])-1,
                                    count++);
                    break;
                /* Otherwise the run-type is taken to be a colour, and the rule is
                   applied to each maximal run of characters which had that colour
                   on entry. UNQUOTED_COLOUR stands for anything which is neither
                   string nor character literal. |ident_from| is where the current
                   run began, or -1 if we are not inside one. */
                default: {
                    int ident_from = -1, count = 1;
                    for (int i=from; i<=to; i++) {
                        int col = Str::get_at(colouring_at_start, i);
                        if ((col == block->run) ||
                            ((block->run == UNQUOTED_COLOUR) &&
                                ((col != STRING_COLOUR) && (col != CHAR_LITERAL_COLOUR)))) {
                            if (ident_from == -1) ident_from = i;
                        } else {
                            if (ident_from >= 0)
                                Painter::execute_rule(HT, rule, matter, colouring, ident_from, i-1, count++);
                            ident_from = -1;
                        }
                    }
                    /* A run still open at the end of the range is closed off here */
                    if (ident_from >= 0)
                        Painter::execute_rule(HT, rule, matter, colouring, ident_from, to, count++);
                    break;
                }
            }
        }
        DISCARD_TEXT(colouring_at_start);
    }

§4. Rules have the form: if X, then Y.

    /* Apply one rule to the inclusive range |from| to |to|: its consequence is
       followed only when the outcome of its test equals |rule->sense|. |N| is
       the count passed on to the test. */
    void Painter::execute_rule(hash_table *HT, colouring_rule *rule, text_stream *matter,
        text_stream *colouring, int from, int to, int N) {
        int outcome = Painter::satisfies(HT, rule, matter, colouring, from, to, N);
        if (outcome != rule->sense) return;
        Painter::follow(HT, rule, matter, colouring, from, to);
    }

§5. Here we test the "if X":

    /* Values of |rule->match_prefix| tested in Painter::satisfies: the text P is
       the rule's |match_text|. */
    define UNSPACED_RULE_PREFIX 2  for prefix P
    define SPACED_RULE_PREFIX 3  for spaced prefix P
    define OPTIONALLY_SPACED_RULE_PREFIX 4  for optionally spaced prefix P
    define UNSPACED_RULE_SUFFIX 5  for suffix P
    define SPACED_RULE_SUFFIX 6  for spaced suffix P
    define OPTIONALLY_SPACED_RULE_SUFFIX 7  for optionally spaced suffix P
    /* Test whether the inclusive range |from| to |to| of |matter| meets the
       condition of |rule|, returning TRUE or FALSE. |N| is the 1-based count of
       this range within its run. Exactly one kind of test is made, the first of
       these which the rule specifies: a number, a regular expression, a text
       (as prefix, suffix or exact match), a reserved word of a given colour, or
       a colour. A rule specifying none of them is always satisfied.
       When a prefix or suffix test succeeds, |rule->fix_position| is set to the
       position in |matter| at which the prefix or suffix text begins. */
    int Painter::satisfies(hash_table *HT, colouring_rule *rule, text_stream *matter,
        text_stream *colouring, int from, int to, int N) {
        if (rule->number > 0) {
            if (rule->number_of > 0) {
                /* "number X of Y": counts cycle round 1, 2, ..., Y, 1, 2, ... */
                if (rule->number != ((N-1)%(rule->number_of)) + 1) return FALSE;
            } else {
                if (rule->number != N) return FALSE;
            }
        } else if (rule->match_regexp_text[0]) {
            /* The regular expression is matched against a copy of the range only */
            TEMPORARY_TEXT(T);
            for (int j=from; j<=to; j++) PUT_TO(T, Str::get_at(matter, j));
            int rv = Regexp::match(&(rule->mr), T, rule->match_regexp_text);
            DISCARD_TEXT(T);
            if (rv == FALSE) return FALSE;
        } else if (Str::len(rule->match_text) > 0) {
            if ((rule->match_prefix == UNSPACED_RULE_PREFIX) ||
                (rule->match_prefix == SPACED_RULE_PREFIX) ||
                (rule->match_prefix == OPTIONALLY_SPACED_RULE_PREFIX)) {
                /* Step |pos| back over any whitespace immediately before the
                   range, looking at the characters of |matter| (not at the
                   position numbers themselves). */
                int pos = from;
                if (rule->match_prefix != UNSPACED_RULE_PREFIX) {
                    while ((pos > 0) &&
                        (Characters::is_whitespace(Str::get_at(matter, pos-1)))) pos--;
                    /* A spaced prefix needs at least one whitespace character */
                    if ((rule->match_prefix == SPACED_RULE_PREFIX) && (pos == from))
                        return FALSE;
                }
                /* The prefix text must end exactly at |pos|, so it cannot be
                   present if it would have to begin before the text does. */
                int prefix_at = pos - Str::len(rule->match_text);
                if (prefix_at < 0) return FALSE;
                if (Str::includes_at(matter, prefix_at, rule->match_text) == FALSE)
                    return FALSE;
                rule->fix_position = prefix_at;
            } else if ((rule->match_prefix == UNSPACED_RULE_SUFFIX) ||
                (rule->match_prefix == SPACED_RULE_SUFFIX) ||
                (rule->match_prefix == OPTIONALLY_SPACED_RULE_SUFFIX)) {
                /* Step |pos| forward over any whitespace immediately after the
                   range. The scan is bounded by the length of |matter|, which is
                   the text being scanned, and tests its characters. */
                int pos = to + 1;
                if (rule->match_prefix != UNSPACED_RULE_SUFFIX) {
                    while ((pos < Str::len(matter)) &&
                        (Characters::is_whitespace(Str::get_at(matter, pos)))) pos++;
                    /* A spaced suffix needs at least one whitespace character,
                       that is, |pos| must have moved on from where it began. */
                    if ((rule->match_prefix == SPACED_RULE_SUFFIX) && (pos == to + 1))
                        return FALSE;
                }
                if (Str::includes_at(matter, pos, rule->match_text) == FALSE)
                    return FALSE;
                rule->fix_position = pos;
            } else {
                /* Neither prefix nor suffix: the range must be exactly the text */
                if (Str::len(rule->match_text) != to-from+1)
                    return FALSE;
                for (int i=from; i<=to; i++)
                    if (Str::get_at(matter, i) != Str::get_at(rule->match_text, i-from))
                        return FALSE;
            }
        } else if (rule->match_keyword_of_colour != NOT_A_COLOUR) {
            TEMPORARY_TEXT(id);
            Str::substr(id, Str::at(matter, from), Str::at(matter, to+1));
            int rw = Analyser::is_reserved_word(HT, id, rule->match_keyword_of_colour);
            DISCARD_TEXT(id);
            if (rw == FALSE) return FALSE;
        } else if (rule->match_colour != NOT_A_COLOUR) {
            /* Every character of the range must currently have this colour */
            for (int i=from; i<=to; i++)
                if (Str::get_at(colouring, i) != rule->match_colour)
                    return FALSE;
        }
        return TRUE;
    }

§6. And here we carry out the "then Y":

    /* Carry out the consequence of |rule| on the inclusive range |from| to |to|.
       Exactly one of three things happens: a nested block is executed on the
       range; or, for a debugging rule, the range is printed out; or else colours
       are set, on the range itself and/or on the prefix or suffix text located
       at |rule->fix_position|. */
    void Painter::follow(hash_table *HT, colouring_rule *rule, text_stream *matter,
        text_stream *colouring, int from, int to) {
        if (rule->execute_block) {
            Painter::execute(HT, rule->execute_block, matter, colouring, from, to, 0);
            return;
        }
        if (rule->debug) <Print some debugging text 6.1>
        else {
            if (rule->set_to_colour != NOT_A_COLOUR) {
                int at = from;
                while (at <= to) {
                    Str::put_at(colouring, at, rule->set_to_colour);
                    at++;
                }
            }
            if (rule->set_prefix_to_colour != NOT_A_COLOUR) {
                int fix_from = rule->fix_position;
                int fix_end = fix_from + Str::len(rule->match_text);
                for (int at = fix_from; at < fix_end; at++)
                    Str::put_at(colouring, at, rule->set_prefix_to_colour);
            }
        }
    }

§6.1. <Print some debugging text 6.1> =

        PRINT("[%d, %d] text: ", from, to);
        for (int i=from; i<=to; i++)
            PUT_TO(STDOUT, Str::get_at(matter, i));
        PRINT("\n[%d, %d] cols: ", from, to);
        for (int i=from; i<=to; i++)
            PUT_TO(STDOUT, Str::get_at(colouring, i));
        PRINT("\n");

This code is used in §6.

§7. Painting a file.

    /* Read the file |F| into a linked list of texts, one for each line, and
       return the list with any run of whitespace-only lines at the end left off.
       If no line has any non-whitespace content, the list is returned just as
       it was read. */
    linked_list *Painter::lines(filename *F) {
        linked_list *as_read = NEW_LINKED_LIST(text_stream);
        TextFiles::read(F, FALSE, "unable to read file of textual extract", TRUE,
            &Painter::text_file_helper, NULL, as_read);

        /* Find the 1-based number of the last line with something on it */
        int last_with_content = -1, line_no = 0;
        text_stream *line;
        LOOP_OVER_LINKED_LIST(line, text_stream, as_read) {
            line_no++;
            if (Str::is_whitespace(line) == FALSE) last_with_content = line_no;
        }
        if (last_with_content < 0) return as_read;

        /* Copy across the lines up to and including that one */
        linked_list *kept = NEW_LINKED_LIST(text_stream);
        line_no = 0;
        LOOP_OVER_LINKED_LIST(line, text_stream, as_read) {
            line_no++;
            if (line_no <= last_with_content) {
                ADD_TO_LINKED_LIST(line, text_stream, kept);
            }
        }
        return kept;
    }

    /* Callback handed to TextFiles::read by Painter::lines: |state| is the linked
       list being built, and a duplicate of |text| is added to it. The file
       position |tfp| is not used. */
    void Painter::text_file_helper(text_stream *text, text_file_position *tfp, void *state) {
        linked_list *collected = (linked_list *) state;
        text_stream *line_copy = Str::duplicate(text);
        ADD_TO_LINKED_LIST(line_copy, text_stream, collected);
    }

    /* Read the file |F| and write two parallel texts: its content to |to|, and
       one colour character for each character of that content to |coloured|.
       Lines have trailing whitespace removed and each tab replaced by four
       spaces; they are divided by a newline in |to| and by NEWLINE_COLOUR in
       |coloured|. If |pl| is NULL, everything is given PLAIN_COLOUR; otherwise
       each line is painted using the language's built-in keywords. */
    void Painter::colour_file(programming_language *pl, filename *F, text_stream *to, text_stream *coloured) {
        linked_list *L = Painter::lines(F);
        if (pl) Painter::reset_syntax_colouring(pl);
        int c = 1;
        text_stream *T;
        LOOP_OVER_LINKED_LIST(T, text_stream, L) {
            /* A divider goes before every line except the first */
            if (c++ > 1) { PUT_TO(to, '\n'); PUT_TO(coloured, NEWLINE_COLOUR); }
            Str::trim_white_space_at_end(T);
            TEMPORARY_TEXT(ST);
            TEMPORARY_TEXT(SC);
            LOOP_THROUGH_TEXT(pos, T)
                if (Str::get(pos) == '\t')
                    WRITE_TO(ST, "    ");
                else
                    PUT_TO(ST, Str::get(pos));
            if (pl) {
                Painter::syntax_colour(pl, &(pl->built_in_keywords), ST, SC, TRUE);
            } else {
                LOOP_THROUGH_TEXT(pos, ST)
                    PUT_TO(SC, PLAIN_COLOUR);
            }
            WRITE_TO(to, "%S", ST);
            WRITE_TO(coloured, "%S", SC);
            /* The temporary texts are made afresh for each line, so they must
               be discarded for each line too. */
            DISCARD_TEXT(ST);
            DISCARD_TEXT(SC);
        }
        /* Since |c| starts at 1 and only ever increases, this test always passes:
           the output always ends with a final newline and NEWLINE_COLOUR. */
        if (c > 0) { PUT_TO(to, '\n'); PUT_TO(coloured, NEWLINE_COLOUR); }
    }