
Both of these scripts print a "generated by $file, do not edit" header, but one of them prints the wrong filename. Use the built-in __file__ attribute to ensure the filename is correct.

contrib/ChangeLog:

	* unicode/gen_libstdcxx_unicode_data.py: Fix header of generated file to name the correct script.

libstdc++-v3/ChangeLog:

	* include/bits/text_encoding-data.h: Regenerate.
	* include/bits/unicode-data.h: Regenerate.
	* scripts/gen_text_encoding_data.py: Fix header of generated file to name the correct script.
123 lines
4.5 KiB
Python
Executable file
123 lines
4.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
#
|
|
# Script to generate tables for libstdc++ std::text_encoding.
|
|
#
|
|
# This file is part of GCC.
|
|
#
|
|
# GCC is free software; you can redistribute it and/or modify it under
|
|
# the terms of the GNU General Public License as published by the Free
|
|
# Software Foundation; either version 3, or (at your option) any later
|
|
# version.
|
|
#
|
|
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
# for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with GCC; see the file COPYING3. If not see
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
# To update the Libstdc++ static data in <bits/text_encoding-data.h> download
|
|
# the latest:
|
|
# https://www.iana.org/assignments/character-sets/character-sets-1.csv
|
|
# Then run this script and save the output to
|
|
# include/bits/text_encoding-data.h
|
|
|
|
import sys
|
|
import csv
|
|
import os
|
|
|
|
# Exactly one command-line argument is required: the character sets CSV file.
# Anything else gets a usage message on stderr and exit status 1.
if len(sys.argv) != 2:
    sys.stderr.write("Usage: %s <character sets csv>" % sys.argv[0] + "\n")
    raise SystemExit(1)
|
|
|
|
# Take the script name from __file__ rather than hard-coding it, so that the
# first line of the generated file always names the script that produced it.
self = os.path.split(__file__)[1]
generated_by = "// Generated by scripts/{}, do not edit.".format(self)
print(generated_by)
|
|
# Emit the copyright and license notice (GPLv3 plus the GCC Runtime Library
# Exception) followed by the "/** @file ... */" comment that marks the
# generated bits/text_encoding-data.h as an internal header.
# NOTE(review): the lone "|" lines inside this literal look like artifacts of
# how this copy of the script was extracted, not intended output -- confirm
# against the upstream script before regenerating the header.
print("""
|
|
|
|
// Copyright The GNU Toolchain Authors.
|
|
//
|
|
// This file is part of the GNU ISO C++ Library. This library is free
|
|
// software; you can redistribute it and/or modify it under the
|
|
// terms of the GNU General Public License as published by the
|
|
// Free Software Foundation; either version 3, or (at your option)
|
|
// any later version.
|
|
|
|
// This library is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
|
|
// Under Section 7 of GPL version 3, you are granted additional
|
|
// permissions described in the GCC Runtime Library Exception, version
|
|
// 3.1, as published by the Free Software Foundation.
|
|
|
|
// You should have received a copy of the GNU General Public License and
|
|
// a copy of the GCC Runtime Library Exception along with this program;
|
|
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
// <http://www.gnu.org/licenses/>.
|
|
|
|
/** @file bits/text_encoding-data.h
|
|
* This is an internal header file, included by other library headers.
|
|
* Do not attempt to use it directly. @headername{text_encoding}
|
|
*/
|
|
""")
|
|
# Emit a guard that makes the generated header fail with #error unless
# _GLIBCXX_GET_ENCODING_DATA is defined at the point of inclusion.
# The last entry carries an extra newline to leave a blank line after it.
for directive in (
    "#ifndef _GLIBCXX_GET_ENCODING_DATA",
    '# error "This is not a public header, do not include it directly"',
    "#endif\n",
):
    print(directive)
|
|
|
|
# We need to generate a list of initializers of the form { mib, alias }, e.g.,
|
|
# { 3, "US-ASCII" },
|
|
# { 3, "ISO646-US" },
|
|
# { 3, "csASCII" },
|
|
# { 4, "ISO_8859-1:1987" },
|
|
# { 4, "latin1" },
|
|
# The initializers must be sorted by the mib value. The first entry for
|
|
# a given mib must be the primary name for the encoding. Any aliases for
|
|
# the encoding come after the primary name.
|
|
# We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the
|
|
# offset into the list of the mib=106, alias="UTF-8" entry. This is used
|
|
# to optimize the common case, so we don't need to search for "UTF-8".
|
|
|
|
# Maps each mibEnum value to its list of names, with the primary name first
# and any aliases after it.
charsets = {}
# Decode explicitly as UTF-8 so that parsing does not depend on the locale's
# default encoding (NOTE(review): assumes the IANA CSV is UTF-8 -- confirm).
# newline='' leaves newline handling to the csv module.
with open(sys.argv[1], newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader) # skip header row
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing with IndexError on row[2] below.
            continue
        # Column 2 is the mibEnum, column 1 the primary name and column 5
        # the whitespace-separated aliases.
        mib = int(row[2])
        if mib in charsets:
            raise ValueError("Multiple rows for mibEnum={}".format(mib))
        name = row[1]
        aliases = row[5].split()
        # Ensure primary name comes first
        if name in aliases:
            aliases.remove(name)
        charsets[mib] = [name] + aliases
|
|
|
|
# The C++ standard specifies that "NATS-DANO" and "NATS-DANO-ADD" are removed.
# Passing a default to pop() means an entry that is absent is not an error.
for excluded_mib in (33, 34):
    charsets.pop(excluded_mib, None)

# Names that are not official IANA aliases, but which we include in the
# implementation-defined superset of aliases.  Keyed by mib.
# "ASCII" is accepted for US-ASCII; see also LWG 4043.
extra_aliases = {}
extra_aliases[3] = ["ASCII"]
|
|
|
|
# Print one { mib, name } initializer per name, in ascending mib order, with
# each encoding's primary name ahead of its aliases.  count is the number of
# initializers printed so far, i.e. the list offset of the next one.
count = 0
for mib, names in sorted(charsets.items()):
    if names[0] == "UTF-8":
        # The next initializer printed is the primary "UTF-8" entry, so the
        # current count is its offset into the list.
        print("#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET {}".format(count))
    for alias in names:
        print(' {{ {:4}, "{}" }},'.format(mib, alias))
    # Non-IANA names for this mib, if any, follow the official ones.
    extras = extra_aliases.get(mib, [])
    for alias in extras:
        print(' {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, alias))
    count += len(names) + len(extras)
|
|
|
|
# This must be the final output: <text_encoding> gives an error if the macro
# is left defined, so the generated file is not usable unless the script
# reaches this point.
sys.stdout.write("\n#undef _GLIBCXX_GET_ENCODING_DATA" + "\n")
|