#!/usr/bin/env python3
#
# Script to generate tables for libstdc++ std::text_encoding.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.
# To update the Libstdc++ static data in <bits/text_encoding-data.h> download
# the latest:
# https://www.iana.org/assignments/character-sets/character-sets-1.csv
# Then run this script and save the output to
# include/bits/text_encoding-data.h
import sys
import csv
# This is not an official IANA alias, but we include it in the
# implementation-defined superset of aliases for US-ASCII.
# See also LWG 4043.
extra_aliases = {3: ["ASCII"]}


def read_charsets(csv_path):
    """Read the character-set CSV and return a dict of mib -> list of names.

    The first row of the file is treated as a header and skipped.  For every
    other row, column 2 is the integer mib value, column 1 is the primary
    name and column 5 is a whitespace-separated list of aliases.  The
    returned list for each mib starts with the primary name, followed by
    the aliases in file order (with the primary name removed from the
    aliases if it is repeated there).

    Raises ValueError if two rows use the same mib value (or if the mib
    column is not an integer).
    """
    charsets = {}
    # newline='' is what the csv module requires so that newlines embedded
    # in quoted fields are handled correctly.  The encoding is given
    # explicitly so that the result does not depend on the user's locale.
    with open(csv_path, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # skip header row
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            mib = int(row[2])
            if mib in charsets:
                raise ValueError("Multiple rows for mibEnum={}".format(mib))
            name = row[1]
            aliases = row[5].split()
            # Ensure primary name comes first
            if name in aliases:
                aliases.remove(name)
            charsets[mib] = [name] + aliases
    return charsets


# We need to generate a list of initializers of the form { mib, alias }, e.g.,
# { 3, "US-ASCII" },
# { 3, "ISO646-US" },
# { 3, "csASCII" },
# { 4, "ISO_8859-1:1987" },
# { 4, "latin1" },
# The initializers must be sorted by the mib value. The first entry for
# a given mib must be the primary name for the encoding. Any aliases for
# the encoding come after the primary name.
# We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the
# offset into the list of the mib=106, alias="UTF-8" entry. This is used
# to optimize the common case, so we don't need to search for "UTF-8".
def table_lines(charsets, extras_by_mib):
    """Yield the lines of the initializer table, in output order.

    charsets maps each mib to its list of names (primary name first).
    extras_by_mib maps a mib to additional aliases that are emitted after
    the names from charsets and marked as a libstdc++ extension.

    The _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET macro is emitted immediately
    before the entries of the encoding whose primary name is "UTF-8"; its
    value is the number of initializers emitted before that point.
    """
    count = 0
    for mib in sorted(charsets):
        names = charsets[mib]
        if names[0] == "UTF-8":
            yield "#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET {}".format(count)
        for name in names:
            yield ' {{ {:4}, "{}" }},'.format(mib, name)
        count += len(names)
        extras = extras_by_mib.get(mib, ())
        for name in extras:
            yield ' {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, name)
        count += len(extras)


def main(argv=None):
    """Write the generated header to standard output.

    argv defaults to sys.argv and must hold exactly two items: the program
    name and the path of the character-set CSV file.  Returns the process
    exit status: 1 after printing a usage message to standard error when
    the argument count is wrong, otherwise 0.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("Usage: %s <character-sets-1.csv>" % argv[0], file=sys.stderr)
        return 1

    # The preamble is printed before the CSV file is opened, so a failure
    # while reading leaves truncated output without the final #undef.
    print("// Generated by gen_text_encoding_data.py, do not edit.\n")
    print("#ifndef _GLIBCXX_GET_ENCODING_DATA")
    print('# error "This is not a public header, do not include it directly"')
    print("#endif\n")

    charsets = read_charsets(argv[1])

    # Remove "NATS-DANO" and "NATS-DANO-ADD" as specified by the C++ standard.
    charsets.pop(33, None)
    charsets.pop(34, None)

    for line in table_lines(charsets, extra_aliases):
        print(line)

    # The header that includes this data (presumably <text_encoding>) gives
    # an error if this macro is left defined.
    # Do this last, so that the generated output is not usable unless we
    # reach here.
    print("\n#undef _GLIBCXX_GET_ENCODING_DATA")
    return 0


if __name__ == "__main__":
    sys.exit(main())