Cleanup etags.c to use locale-independent code

Although this doesn't alter behavior (as etags doesn't use
setlocale), the new version is more clearly locale-independent and
the executable is a bit smaller on my platform.
* lib-src/etags.c: Include <limits.h>, for UCHAR_MAX.
Include <c-ctype.h> instead of <ctype.h>.
(CHARS, CHAR, init, _wht, _nin, _itk, _btk, _etk, white, nonam, endtk)
(begtk, midtk):
Remove; no longer needed.
(iswhite, ISALNUM, ISALPHA, ISDIGIT, ISLOWER, lowcase): Remove.
All callers changed to use c_isspace, c_isalnum, c_isalpha, c_isdigit,
c_islower, c_tolower, respectively.
(notinname, begtoken, intoken, endtoken): Rewrite as functions
instead of macros, and initialize the tables at compile-time
rather than at run-time.
This commit is contained in:
Paul Eggert 2015-05-23 15:18:12 -07:00
parent fa658b2e90
commit fb11a5e117

View file

@ -119,17 +119,18 @@ char pot_etags_version[] = "@(#) pot revision number is 17.38.1.4";
# define O_CLOEXEC O_NOINHERIT
#endif /* WINDOWSNT */
#include <limits.h>
#include <unistd.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <sysstdio.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <binary-io.h>
#include <c-ctype.h>
#include <c-strcase.h>
#include <assert.h>
@ -156,21 +157,71 @@ char pot_etags_version[] = "@(#) pot revision number is 17.38.1.4";
#define strneq(s,t,n) (assert ((s)!=NULL || (t)!=NULL), !strncmp (s, t, n))
#define strncaseeq(s,t,n) (assert ((s)!=NULL && (t)!=NULL), !c_strncasecmp (s, t, n))
#define CHARS 256 /* 2^sizeof(char) */
#define CHAR(x) ((unsigned int)(x) & (CHARS - 1))
#define iswhite(c) (_wht[CHAR (c)]) /* c is white (see white) */
#define notinname(c) (_nin[CHAR (c)]) /* c is not in a name (see nonam) */
#define begtoken(c) (_btk[CHAR (c)]) /* c can start token (see begtk) */
#define intoken(c) (_itk[CHAR (c)]) /* c can be in token (see midtk) */
#define endtoken(c) (_etk[CHAR (c)]) /* c ends tokens (see endtk) */
/* C is not in a name. */
static bool
notinname (unsigned char c)
{
/* Look at make_tag before modifying! */
static bool const table[UCHAR_MAX + 1] = {
['\0']=1, ['\t']=1, ['\n']=1, ['\f']=1, ['\r']=1, [' ']=1,
['(']=1, [')']=1, [',']=1, [';']=1, ['=']=1
};
return table[c];
}
#define ISALNUM(c) isalnum (CHAR (c))
#define ISALPHA(c) isalpha (CHAR (c))
#define ISDIGIT(c) isdigit (CHAR (c))
#define ISLOWER(c) islower (CHAR (c))
/* C can start a token. */
static bool
begtoken (unsigned char c)
{
static bool const table[UCHAR_MAX + 1] = {
['$']=1, ['@']=1,
['A']=1, ['B']=1, ['C']=1, ['D']=1, ['E']=1, ['F']=1, ['G']=1, ['H']=1,
['I']=1, ['J']=1, ['K']=1, ['L']=1, ['M']=1, ['N']=1, ['O']=1, ['P']=1,
['Q']=1, ['R']=1, ['S']=1, ['T']=1, ['U']=1, ['V']=1, ['W']=1, ['X']=1,
['Y']=1, ['Z']=1,
['_']=1,
['a']=1, ['b']=1, ['c']=1, ['d']=1, ['e']=1, ['f']=1, ['g']=1, ['h']=1,
['i']=1, ['j']=1, ['k']=1, ['l']=1, ['m']=1, ['n']=1, ['o']=1, ['p']=1,
['q']=1, ['r']=1, ['s']=1, ['t']=1, ['u']=1, ['v']=1, ['w']=1, ['x']=1,
['y']=1, ['z']=1,
['~']=1
};
return table[c];
}
#define lowcase(c) tolower (CHAR (c))
/* C can be in the middle of a token. */
static bool
intoken (unsigned char c)
{
static bool const table[UCHAR_MAX + 1] = {
['0']=1, ['1']=1, ['2']=1, ['3']=1, ['4']=1,
['5']=1, ['6']=1, ['7']=1, ['8']=1, ['9']=1,
['A']=1, ['B']=1, ['C']=1, ['D']=1, ['E']=1, ['F']=1, ['G']=1, ['H']=1,
['I']=1, ['J']=1, ['K']=1, ['L']=1, ['M']=1, ['N']=1, ['O']=1, ['P']=1,
['Q']=1, ['R']=1, ['S']=1, ['T']=1, ['U']=1, ['V']=1, ['W']=1, ['X']=1,
['Y']=1, ['Z']=1,
['_']=1,
['a']=1, ['b']=1, ['c']=1, ['d']=1, ['e']=1, ['f']=1, ['g']=1, ['h']=1,
['i']=1, ['j']=1, ['k']=1, ['l']=1, ['m']=1, ['n']=1, ['o']=1, ['p']=1,
['q']=1, ['r']=1, ['s']=1, ['t']=1, ['u']=1, ['v']=1, ['w']=1, ['x']=1,
['y']=1, ['z']=1
};
return table[c];
}
/* C can end a token. */
static bool
endtoken (unsigned char c)
{
static bool const table[UCHAR_MAX + 1] = {
['\0']=1, ['\t']=1, ['\n']=1, ['\r']=1, [' ']=1,
['!']=1, ['"']=1, ['#']=1, ['%']=1, ['&']=1, ['\'']=1, ['(']=1, [')']=1,
['*']=1, ['+']=1, [',']=1, ['-']=1, ['.']=1, ['/']=1, [':']=1, [';']=1,
['<']=1, ['=']=1, ['>']=1, ['?']=1, ['[']=1, [']']=1, ['^']=1,
['{']=1, ['|']=1, ['}']=1, ['~']=1
};
return table[c];
}
/*
* xnew, xrnew -- allocate, reallocate storage
@ -316,7 +367,6 @@ _Noreturn void fatal (const char *, const char *);
static _Noreturn void pfatal (const char *);
static void add_node (node *, node **);
static void init (void);
static void process_file_name (char *, language *);
static void process_file (FILE *, char *, language *);
static void find_entries (FILE *);
@ -370,20 +420,6 @@ static linebuffer lb; /* the current line */
static linebuffer filebuf; /* a buffer containing the whole file */
static linebuffer token_name; /* a buffer containing a tag name */
/* boolean "functions" (see init) */
static bool _wht[CHARS], _nin[CHARS], _itk[CHARS], _btk[CHARS], _etk[CHARS];
static const char
/* white chars */
*white = " \f\t\n\r\v",
/* not in a name */
*nonam = " \f\t\n\r()=,;", /* look at make_tag before modifying! */
/* token ending chars */
*endtk = " \t\n\r\"'#()[]{}=-+%*/&|^~!<>;,.:?",
/* token starting chars */
*begtk = "ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz$~@",
/* valid in-token chars */
*midtk = "ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz$0123456789";
static bool append_to_tagfile; /* -a: append to tags */
/* The next five default to true in C and derived languages. */
static bool typedefs; /* -t: create tags for C and Ada typedefs */
@ -1167,8 +1203,6 @@ main (int argc, char **argv)
tagfiledir = absolute_dirname (tagfile, cwd);
}
init (); /* set up boolean "functions" */
linebuffer_init (&lb);
linebuffer_init (&filename_lb);
linebuffer_init (&filebuf);
@ -1636,34 +1670,6 @@ process_file (FILE *fh, char *fn, language *lang)
}
}
/*
* This routine sets up the boolean pseudo-functions which work
* by setting boolean flags dependent upon the corresponding character.
* Every char which is NOT in that string is not a white char. Therefore,
* all of the array "_wht" is set to false, and then the elements
* subscripted by the chars in "white" are set to true. Thus "_wht"
* of a char is true if it is the string "white", else false.
*/
static void
init (void)
{
const char *sp;
int i;
for (i = 0; i < CHARS; i++)
iswhite (i) = notinname (i) = begtoken (i) = intoken (i) = endtoken (i)
= false;
for (sp = white; *sp != '\0'; sp++) iswhite (*sp) = true;
for (sp = nonam; *sp != '\0'; sp++) notinname (*sp) = true;
notinname ('\0') = notinname ('\n');
for (sp = begtk; *sp != '\0'; sp++) begtoken (*sp) = true;
begtoken ('\0') = begtoken ('\n');
for (sp = midtk; *sp != '\0'; sp++) intoken (*sp) = true;
intoken ('\0') = intoken ('\n');
for (sp = endtk; *sp != '\0'; sp++) endtoken (*sp) = true;
endtoken ('\0') = endtoken ('\n');
}
/*
* This routine opens the specified file and calls the function
* which finds the function and type definitions.
@ -3179,7 +3185,7 @@ C_entries (int c_ext, FILE *inf)
followed by an end of comment, this is a preprocessor
token. */
for (cp = newlb.buffer; cp < lp-1; cp++)
if (!iswhite (*cp))
if (!c_isspace (*cp))
{
if (*cp == '*' && cp[1] == '/')
{
@ -3262,7 +3268,7 @@ C_entries (int c_ext, FILE *inf)
if (*lp != '\0')
lp += 1;
while (*lp != '\0'
&& !iswhite (*lp) && *lp != '(')
&& !c_isspace (*lp) && *lp != '(')
lp += 1;
c = *lp++;
toklen += lp - oldlp;
@ -3934,14 +3940,14 @@ F_takeprec (void)
dbp += 3;
return;
}
if (!ISDIGIT (*dbp))
if (!c_isdigit (*dbp))
{
--dbp; /* force failure */
return;
}
do
dbp++;
while (ISDIGIT (*dbp));
while (c_isdigit (*dbp));
}
static void
@ -3959,7 +3965,7 @@ F_getit (FILE *inf)
dbp += 6;
dbp = skip_spaces (dbp);
}
if (!ISALPHA (*dbp) && *dbp != '_' && *dbp != '$')
if (!c_isalpha (*dbp) && *dbp != '_' && *dbp != '$')
return;
for (cp = dbp + 1; *cp != '\0' && intoken (*cp); cp++)
continue;
@ -3988,7 +3994,7 @@ Fortran_functions (FILE *inf)
if (LOOKING_AT_NOCASE (dbp, "elemental"))
dbp = skip_spaces (dbp);
switch (lowcase (*dbp))
switch (c_tolower (*dbp))
{
case 'i':
if (nocase_tail ("integer"))
@ -4021,7 +4027,7 @@ Fortran_functions (FILE *inf)
dbp = skip_spaces (dbp);
if (*dbp == '\0')
continue;
switch (lowcase (*dbp))
switch (c_tolower (*dbp))
{
case 'f':
if (nocase_tail ("function"))
@ -4075,7 +4081,7 @@ Ada_getit (FILE *inf, const char *name_qualifier)
readline (&lb, inf);
dbp = lb.buffer;
}
switch (lowcase (*dbp))
switch (c_tolower (*dbp))
{
case 'b':
if (nocase_tail ("body"))
@ -4102,8 +4108,7 @@ Ada_getit (FILE *inf, const char *name_qualifier)
{
dbp = skip_spaces (dbp);
for (cp = dbp;
(*cp != '\0'
&& (ISALPHA (*cp) || ISDIGIT (*cp) || *cp == '_' || *cp == '.'));
c_isalnum (*cp) || *cp == '_' || *cp == '.';
cp++)
continue;
if (cp == dbp)
@ -4179,7 +4184,7 @@ Ada_funcs (FILE *inf)
}
/* We are at the beginning of a token. */
switch (lowcase (*dbp))
switch (c_tolower (*dbp))
{
case 'f':
if (!packages_only && nocase_tail ("function"))
@ -4244,13 +4249,13 @@ Asm_labels (FILE *inf)
{
/* If first char is alphabetic or one of [_.$], test for colon
following identifier. */
if (ISALPHA (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
if (c_isalpha (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
{
/* Read past label. */
cp++;
while (ISALNUM (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
while (c_isalnum (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
cp++;
if (*cp == ':' || iswhite (*cp))
if (*cp == ':' || c_isspace (*cp))
/* Found end of label, so copy it and add it to the table. */
make_tag (lb.buffer, cp - lb.buffer, true,
lb.buffer, cp - lb.buffer + 1, lineno, linecharno);
@ -4338,7 +4343,7 @@ Perl_functions (FILE *inf)
varstart += 1;
do
cp++;
while (ISALNUM (*cp) || *cp == '_');
while (c_isalnum (*cp) || *cp == '_');
}
else if (qual)
{
@ -4433,7 +4438,7 @@ PHP_functions (FILE *inf)
if (*cp != '\0')
{
name = cp;
while (*cp != '\0' && !iswhite (*cp))
while (*cp != '\0' && !c_isspace (*cp))
cp++;
make_tag (name, cp - name, false,
lb.buffer, cp - lb.buffer + 1, lineno, linecharno);
@ -4485,10 +4490,10 @@ Cobol_paragraphs (FILE *inf)
bp += 8;
/* If eoln, compiler option or comment ignore whole line. */
if (bp[-1] != ' ' || !ISALNUM (bp[0]))
if (bp[-1] != ' ' || !c_isalnum (bp[0]))
continue;
for (ep = bp; ISALNUM (*ep) || *ep == '-'; ep++)
for (ep = bp; c_isalnum (*ep) || *ep == '-'; ep++)
continue;
if (*ep++ == '.')
make_tag (bp, ep - bp, true,
@ -4636,7 +4641,7 @@ Pascal_functions (FILE *inf)
/* Check if this is an "extern" declaration. */
if (*dbp == '\0')
continue;
if (lowcase (*dbp) == 'e')
if (c_tolower (*dbp) == 'e')
{
if (nocase_tail ("extern")) /* superfluous, really! */
{
@ -4644,7 +4649,7 @@ Pascal_functions (FILE *inf)
verify_tag = false;
}
}
else if (lowcase (*dbp) == 'f')
else if (c_tolower (*dbp) == 'f')
{
if (nocase_tail ("forward")) /* check for forward reference */
{
@ -4691,7 +4696,7 @@ Pascal_functions (FILE *inf)
else if (!incomment && !inquote && !found_tag)
{
/* Check for proc/fn keywords. */
switch (lowcase (c))
switch (c_tolower (c))
{
case 'p':
if (nocase_tail ("rocedure")) /* c = 'p', dbp has advanced */
@ -4855,13 +4860,13 @@ Forth_words (FILE *inf)
LOOP_ON_INPUT_LINES (inf, lb, bp)
while ((bp = skip_spaces (bp))[0] != '\0')
if (bp[0] == '\\' && iswhite (bp[1]))
if (bp[0] == '\\' && c_isspace (bp[1]))
break; /* read next line */
else if (bp[0] == '(' && iswhite (bp[1]))
else if (bp[0] == '(' && c_isspace (bp[1]))
do /* skip to ) or eol */
bp++;
while (*bp != ')' && *bp != '\0');
else if ((bp[0] == ':' && iswhite (bp[1]) && bp++)
else if ((bp[0] == ':' && c_isspace (bp[1]) && bp++)
|| LOOKING_AT_NOCASE (bp, "constant")
|| LOOKING_AT_NOCASE (bp, "code")
|| LOOKING_AT_NOCASE (bp, "create")
@ -4972,7 +4977,7 @@ TeX_commands (FILE *inf)
cp++;
}
for (p = cp;
(!iswhite (*p) && *p != '#' &&
(!c_isspace (*p) && *p != '#' &&
*p != TEX_opgrp && *p != TEX_clgrp);
p++)
continue;
@ -5130,7 +5135,7 @@ HTML_labels (FILE *inf)
else if (intag) /* look for "name=" or "id=" */
{
while (*dbp != '\0' && *dbp != '>'
&& lowcase (*dbp) != 'n' && lowcase (*dbp) != 'i')
&& c_tolower (*dbp) != 'n' && c_tolower (*dbp) != 'i')
dbp++;
if (*dbp == '\0')
break; /* go to next line */
@ -5172,7 +5177,7 @@ HTML_labels (FILE *inf)
if (*dbp == '<')
{
intag = true;
inanchor = (lowcase (dbp[1]) == 'a' && !intoken (dbp[2]));
inanchor = (c_tolower (dbp[1]) == 'a' && !intoken (dbp[2]));
continue; /* look on the same line */
}
@ -5192,7 +5197,7 @@ HTML_labels (FILE *inf)
if (*dbp == '\0')
break; /* go to next line */
intag = true;
if (lowcase (dbp[1]) == 'a' && !intoken (dbp[2]))
if (c_tolower (dbp[1]) == 'a' && !intoken (dbp[2]))
{
inanchor = true;
continue; /* look on the same line */
@ -5239,7 +5244,7 @@ Prolog_functions (FILE *inf)
{
if (cp[0] == '\0') /* Empty line */
continue;
else if (iswhite (cp[0])) /* Not a predicate */
else if (c_isspace (cp[0])) /* Not a predicate */
continue;
else if (cp[0] == '/' && cp[1] == '*') /* comment. */
prolog_skip_comment (&lb, inf);
@ -5331,11 +5336,11 @@ prolog_atom (char *s, size_t pos)
origpos = pos;
if (ISLOWER (s[pos]) || (s[pos] == '_'))
if (c_islower (s[pos]) || s[pos] == '_')
{
/* The atom is unquoted. */
pos++;
while (ISALNUM (s[pos]) || (s[pos] == '_'))
while (c_isalnum (s[pos]) || s[pos] == '_')
{
pos++;
}
@ -5399,7 +5404,7 @@ Erlang_functions (FILE *inf)
{
if (cp[0] == '\0') /* Empty line */
continue;
else if (iswhite (cp[0])) /* Not function nor attribute */
else if (c_isspace (cp[0])) /* Not function nor attribute */
continue;
else if (cp[0] == '%') /* comment */
continue;
@ -5506,12 +5511,12 @@ erlang_atom (char *s)
{
int pos = 0;
if (ISALPHA (s[pos]) || s[pos] == '_')
if (c_isalpha (s[pos]) || s[pos] == '_')
{
/* The atom is unquoted. */
do
pos++;
while (ISALNUM (s[pos]) || s[pos] == '_');
while (c_isalnum (s[pos]) || s[pos] == '_');
}
else if (s[pos] == '\'')
{
@ -5729,10 +5734,10 @@ add_regex (char *regexp_pattern, language *lang)
*patbuf = zeropattern;
if (ignore_case)
{
static char lc_trans[CHARS];
static char lc_trans[UCHAR_MAX + 1];
int i;
for (i = 0; i < CHARS; i++)
lc_trans[i] = lowcase (i);
for (i = 0; i < UCHAR_MAX + 1; i++)
lc_trans[i] = c_tolower (i);
patbuf->translate = lc_trans; /* translation table to fold case */
}
@ -5787,7 +5792,7 @@ substitute (char *in, char *out, struct re_registers *regs)
for (t = strchr (out, '\\');
t != NULL;
t = strchr (t + 2, '\\'))
if (ISDIGIT (t[1]))
if (c_isdigit (t[1]))
{
dig = t[1] - '0';
diglen = regs->end[dig] - regs->start[dig];
@ -5801,7 +5806,7 @@ substitute (char *in, char *out, struct re_registers *regs)
result = xnew (size + 1, char);
for (t = result; *out != '\0'; out++)
if (*out == '\\' && ISDIGIT (*++out))
if (*out == '\\' && c_isdigit (*++out))
{
dig = *out - '0';
diglen = regs->end[dig] - regs->start[dig];
@ -5921,9 +5926,9 @@ regex_tag_multiline (void)
static bool
nocase_tail (const char *cp)
{
register int len = 0;
int len = 0;
while (*cp != '\0' && lowcase (*cp) == lowcase (dbp[len]))
while (*cp != '\0' && c_tolower (*cp) == c_tolower (dbp[len]))
cp++, len++;
if (*cp == '\0' && !intoken (dbp[len]))
{
@ -6257,7 +6262,7 @@ savenstr (const char *cp, int len)
static char *
skip_spaces (char *cp)
{
while (iswhite (*cp))
while (c_isspace (*cp))
cp++;
return cp;
}
@ -6266,7 +6271,7 @@ skip_spaces (char *cp)
static char *
skip_non_spaces (char *cp)
{
while (*cp != '\0' && !iswhite (*cp))
while (*cp != '\0' && !c_isspace (*cp))
cp++;
return cp;
}
@ -6527,7 +6532,7 @@ filename_is_absolute (char *fn)
{
return (fn[0] == '/'
#ifdef DOS_NT
|| (ISALPHA (fn[0]) && fn[1] == ':' && fn[2] == '/')
|| (c_isalpha (fn[0]) && fn[1] == ':' && fn[2] == '/')
#endif
);
}
@ -6541,9 +6546,8 @@ canonicalize_filename (register char *fn)
#ifdef DOS_NT
/* Canonicalize drive letter case. */
# define ISUPPER(c) isupper (CHAR (c))
if (fn[0] != '\0' && fn[1] == ':' && ISUPPER (fn[0]))
fn[0] = lowcase (fn[0]);
if (c_isupper (fn[0]) && fn[1] == ':')
fn[0] = c_tolower (fn[0]);
/* Collapse multiple forward- and back-slashes into a single forward
slash. */