ucnid-2011-1.c: New test.
gcc/testsuite:
    * c-c++-common/cpp/ucnid-2011-1.c: New test.

libcpp:
    * ucnid.tab: Add C11 and C11NOSTART data.
    * makeucnid.c (digit): Rename enum value to N99.
    (C11, N11, all_languages): New enum values.
    (NUM_CODE_POINTS, MAX_CODE_POINT): New macros.
    (flags, decomp, combining_value): Use NUM_CODE_POINTS as array size.
    (decomp): Use unsigned int as element type.
    (all_decomp): New array.
    (read_ucnid): Handle C11 and C11NOSTART. Use MAX_CODE_POINT.
    (read_table): Use MAX_CODE_POINT. Store all decompositions in all_decomp.
    (read_derived): Use MAX_CODE_POINT.
    (write_table): Use NUM_CODE_POINTS. Print N99, C11 and N11 flags. Print whole array variable declaration rather than just array contents.
    (char_id_valid, write_context_switch): New functions.
    (main): Call write_context_switch.
    * ucnid.h: Regenerate.
    * include/cpplib.h (struct cpp_options): Add c11_identifiers.
    * init.c (struct lang_flags): Add c11_identifiers.
    (cpp_set_lang): Set c11_identifiers option from selected language.
    * internal.h (struct normalize_state): Document "previous" as previous starter character.
    (NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument.
    * charset.c (DIG): Rename enum value to N99.
    (C11, N11): New enum values.
    (struct ucnrange): Give name to struct. Use short for flags and unsigned int for end of range. Include ucnid.h for whole variable declaration.
    (ucn_valid_in_identifier): Allow for characters up to 0x10FFFF. Allow for C11 in determining valid characters and valid start characters. Use check_nfc for non-Hangul context-dependent checks. Only store starter characters in nst->previous.
    (_cpp_valid_ucn): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.
    * lex.c (lex_identifier): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. Call NORMALIZE_STATE_UPDATE_IDNUM after initial non-UCN part of identifier.
    (lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.

From-SVN: r204886
This commit is contained in:
parent
3d053a5f72
commit
d3f4ff8b51
11 changed files with 4783 additions and 840 deletions
|
@ -29,15 +29,22 @@ along with this program; see the file COPYING3. If not see
|
|||
enum {
|
||||
C99 = 1,
|
||||
CXX = 2,
|
||||
digit = 4,
|
||||
not_NFC = 8,
|
||||
not_NFKC = 16,
|
||||
maybe_not_NFC = 32
|
||||
N99 = 4,
|
||||
C11 = 8,
|
||||
N11 = 16,
|
||||
all_languages = C99 | CXX | C11,
|
||||
not_NFC = 32,
|
||||
not_NFKC = 64,
|
||||
maybe_not_NFC = 128
|
||||
};
|
||||
|
||||
static unsigned flags[65536];
|
||||
static unsigned short decomp[65536][2];
|
||||
static unsigned char combining_value[65536];
|
||||
#define NUM_CODE_POINTS 0x110000
|
||||
#define MAX_CODE_POINT 0x10ffff
|
||||
|
||||
static unsigned flags[NUM_CODE_POINTS];
|
||||
static unsigned int all_decomp[NUM_CODE_POINTS][2];
|
||||
static unsigned int decomp[NUM_CODE_POINTS][2];
|
||||
static unsigned char combining_value[NUM_CODE_POINTS];
|
||||
|
||||
/* Die! */
|
||||
|
||||
|
@ -48,7 +55,7 @@ fail (const char *s)
|
|||
exit (1);
|
||||
}
|
||||
|
||||
/* Read ucnid.tab and set the C99 and CXX flags in header[]. */
|
||||
/* Read ucnid.tab and set the flags for language versions in flags[]. */
|
||||
|
||||
static void
|
||||
read_ucnid (const char *fname)
|
||||
|
@ -66,10 +73,14 @@ read_ucnid (const char *fname)
|
|||
break;
|
||||
if (strcmp (line, "[C99]\n") == 0)
|
||||
fl = C99;
|
||||
if (strcmp (line, "[C99DIG]\n") == 0)
|
||||
fl = C99|digit;
|
||||
else if (strcmp (line, "[C99DIG]\n") == 0)
|
||||
fl = C99|N99;
|
||||
else if (strcmp (line, "[CXX]\n") == 0)
|
||||
fl = CXX;
|
||||
else if (strcmp (line, "[C11]\n") == 0)
|
||||
fl = C11;
|
||||
else if (strcmp (line, "[C11NOSTART]\n") == 0)
|
||||
fl = C11|N11;
|
||||
else if (isxdigit (line[0]))
|
||||
{
|
||||
char *l = line;
|
||||
|
@ -94,7 +105,7 @@ read_ucnid (const char *fname)
|
|||
}
|
||||
while (isspace (*l))
|
||||
l++;
|
||||
if (end > 0xFFFF)
|
||||
if (end > MAX_CODE_POINT)
|
||||
fail ("parsing ucnid.tab, end too large");
|
||||
while (start <= end)
|
||||
flags[start++] |= fl;
|
||||
|
@ -108,8 +119,10 @@ read_ucnid (const char *fname)
|
|||
|
||||
/* Read UnicodeData.txt and fill in the 'decomp' table to be the
|
||||
decompositions of characters for which both the character
|
||||
decomposed and all the code points in the decomposition are either
|
||||
C99 or CXX. */
|
||||
decomposed and all the code points in the decomposition are valid
|
||||
for some supported language version, and the 'all_decomp' table to
|
||||
be the decompositions of all characters without those
|
||||
constraints. */
|
||||
|
||||
static void
|
||||
read_table (char *fname)
|
||||
|
@ -123,7 +136,7 @@ read_table (char *fname)
|
|||
char line[256];
|
||||
unsigned long codepoint, this_decomp[4];
|
||||
char *l;
|
||||
int i;
|
||||
int i, j;
|
||||
int decomp_useful;
|
||||
|
||||
if (!fgets (line, sizeof (line), f))
|
||||
|
@ -131,8 +144,8 @@ read_table (char *fname)
|
|||
codepoint = strtoul (line, &l, 16);
|
||||
if (l == line || *l != ';')
|
||||
fail ("parsing UnicodeData.txt, reading code point");
|
||||
if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX)))
|
||||
continue;
|
||||
if (codepoint > MAX_CODE_POINT)
|
||||
fail ("parsing UnicodeData.txt, code point too large");
|
||||
|
||||
do {
|
||||
l++;
|
||||
|
@ -171,7 +184,9 @@ read_table (char *fname)
|
|||
}
|
||||
if (i > 2) /* Decomposition too long. */
|
||||
fail ("parsing UnicodeData.txt, decomposition too long");
|
||||
if (decomp_useful)
|
||||
for (j = 0; j < i; j++)
|
||||
all_decomp[codepoint][j] = this_decomp[j];
|
||||
if ((flags[codepoint] & all_languages) && decomp_useful)
|
||||
while (--i >= 0)
|
||||
decomp[codepoint][i] = this_decomp[i];
|
||||
}
|
||||
|
@ -208,8 +223,8 @@ read_derived (const char *fname)
|
|||
start = strtoul (line, &l, 16);
|
||||
if (l == line)
|
||||
fail ("parsing DerivedNormalizationProps.txt, reading start");
|
||||
if (start > 0xffff)
|
||||
continue;
|
||||
if (start > MAX_CODE_POINT)
|
||||
fail ("parsing DerivedNormalizationProps.txt, code point too large");
|
||||
if (*l == '.' && l[1] == '.')
|
||||
end = strtoul (l + 2, &l, 16);
|
||||
else
|
||||
|
@ -237,17 +252,21 @@ write_table (void)
|
|||
unsigned last_flag = flags[0];
|
||||
bool really_safe = decomp[0][0] == 0;
|
||||
unsigned char last_combine = combining_value[0];
|
||||
|
||||
printf ("static const struct ucnrange ucnranges[] = {\n");
|
||||
|
||||
for (i = 1; i <= 65536; i++)
|
||||
if (i == 65536
|
||||
|| (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
|
||||
for (i = 1; i <= NUM_CODE_POINTS; i++)
|
||||
if (i == NUM_CODE_POINTS
|
||||
|| (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
|
||||
|| really_safe != (decomp[i][0] == 0)
|
||||
|| combining_value[i] != last_combine)
|
||||
{
|
||||
printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
|
||||
printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
|
||||
last_flag & C99 ? "C99" : " 0",
|
||||
last_flag & digit ? "DIG" : " 0",
|
||||
last_flag & N99 ? "N99" : " 0",
|
||||
last_flag & CXX ? "CXX" : " 0",
|
||||
last_flag & C11 ? "C11" : " 0",
|
||||
last_flag & N11 ? "N11" : " 0",
|
||||
really_safe ? "CID" : " 0",
|
||||
last_flag & not_NFC ? " 0" : "NFC",
|
||||
last_flag & not_NFKC ? " 0" : "NKC",
|
||||
|
@ -258,6 +277,98 @@ write_table (void)
|
|||
last_combine = combining_value[0];
|
||||
really_safe = decomp[i][0] == 0;
|
||||
}
|
||||
|
||||
printf ("};\n");
|
||||
}
|
||||
|
||||
/* Return whether a given character is valid in an identifier for some
|
||||
supported language, either as itself or as a UCN. */
|
||||
|
||||
static bool
|
||||
char_id_valid (unsigned int c)
|
||||
{
|
||||
return ((flags[c] & all_languages)
|
||||
|| (c == 0x24)
|
||||
|| (c >= 0x30 && c <= 0x39)
|
||||
|| (c >= 0x41 && c <= 0x5a)
|
||||
|| (c >= 0x61 && c <= 0x7a));
|
||||
}
|
||||
|
||||
/* Write out the switch statement over characters for which it is
   context-dependent whether they are in NFC.

   The output, printed to stdout, is the text of a C function
   "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)".  Its outer
   switch is on C and gets one case for every code point that is flagged
   both for some language in all_languages and as maybe_not_NFC, except
   for the two Hangul ranges skipped below.  Each such case holds an inner
   switch on P listing the preceding characters for which the emitted
   code returns false; every other P returns true.  The outer default
   reports an internal error through cpp_error and returns true.

   Reads the file-level tables flags[], all_decomp[] and
   combining_value[], and calls char_id_valid; it modifies none of them.  */
|
||||
|
||||
static void
|
||||
write_context_switch (void)
|
||||
{
|
||||
unsigned i;
|
||||
/* Emit the function header and open the outer switch on C.  */
printf ("static bool\n"
|
||||
"check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
|
||||
"{\n"
|
||||
" switch (c)\n"
|
||||
" {\n");
|
||||
/* I ranges over every candidate value of the emitted function's C.  */
for (i = 0; i < NUM_CODE_POINTS; i++)
|
||||
{
|
||||
/* Set once at least one inner "case" label has been printed for I.  */
bool found_case = false;
|
||||
unsigned j;
|
||||
/* Only characters valid in some language version and flagged as
   maybe_not_NFC get a case.  */
if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
|
||||
continue;
|
||||
if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
|
||||
continue; /* Hangul handled algorithmically. */
|
||||
/* Open the outer case for I and its inner switch on P.  */
printf (" case %#06x:\n"
|
||||
" switch (p)\n"
|
||||
"\t{\n", i);
|
||||
/* If an NFC starter character decomposes with this character I
   as the second character and an NFC starter character S as the
   first character, that latter character as a previous
   character means this character is not NFC. Furthermore, any
   NFC starter character K made by a series of compositions of S
   with combining characters whose combining class is greater
   than that of I also means this character is not NFC. */
|
||||
/* Scan every code point J whose recorded decomposition has I as its
   second element.  */
for (j = 0; j < NUM_CODE_POINTS; j++)
|
||||
{
|
||||
unsigned s, k;
|
||||
if (all_decomp[j][1] != i)
|
||||
continue;
|
||||
/* S is the first element of J's decomposition.  */
s = all_decomp[j][0];
|
||||
/* Ignore S unless its combining value is zero and it is not flagged
   not_NFC.  */
if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
|
||||
continue;
|
||||
/* S itself is listed only if it can occur in an identifier.  */
if (char_id_valid (s))
|
||||
{
|
||||
found_case = true;
|
||||
printf ("\tcase %#06x:\n", s);
|
||||
}
|
||||
/* Look for every other identifier-valid K that reduces to S.  This is a
   full pass over all code points for each (I, J) pair reaching here.  */
for (k = 0; k < NUM_CODE_POINTS; k++)
|
||||
{
|
||||
/* T walks from K along the first elements of successive
   decompositions.  */
unsigned t = k;
|
||||
if (k == s || !char_id_valid (k))
|
||||
continue;
|
||||
/* Keep stepping while T has a two-element decomposition whose second
   element has a combining value greater than that of I.  */
while (all_decomp[t][1] != 0
|
||||
&& combining_value[all_decomp[t][1]] > combining_value[i])
|
||||
{
|
||||
/* Stop, leaving T unchanged, if T has a nonzero combining value or is
   flagged not_NFC.  */
if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
|
||||
break;
|
||||
t = all_decomp[t][0];
|
||||
}
|
||||
/* K is listed when the walk ended exactly at S.  */
if (t == s)
|
||||
{
|
||||
found_case = true;
|
||||
printf ("\tcase %#06x:\n", k);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* The labels printed above share one "return false"; when none were
   printed, only an explanatory comment goes into the generated code.  */
if (found_case)
|
||||
printf ("\t return false;\n");
|
||||
else
|
||||
printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
|
||||
/* Any other P returns true; then close the inner switch.  */
printf ("\tdefault:\n"
|
||||
"\t return true;\n"
|
||||
"\t}\n\n");
|
||||
}
|
||||
/* Outer default for any C with no case above: the generated code reports
   an internal error via cpp_error and returns true.  Then close the outer
   switch and the function.  */
printf (" default:\n"
|
||||
" cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
|
||||
" return true;\n"
|
||||
" }\n"
|
||||
"}\n");
|
||||
}
|
||||
|
||||
/* Print out the huge copyright notice. */
|
||||
|
@ -336,5 +447,6 @@ main(int argc, char ** argv)
|
|||
|
||||
write_copyright ();
|
||||
write_table ();
|
||||
write_context_switch ();
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue