gcc/libstdc++-v3/testsuite/ext/unicode/properties.cc

129 lines
6.4 KiB
C++
Raw Permalink Normal View History

libstdc++: Fix Unicode property detection functions Fix some copy & pasted logic in __is_extended_pictographic. This function should yield false for the values before the first edge, not true. Also add a missing boundary condition check in __incb_property. Also Fix an off-by-one error in _Utf_iterator::operator++() that would make dereferencing a past-the-end iterator undefined (where the intended design is that the iterator is always incrementable and dereferenceable, for better memory safety). Also simplify the grapheme view iterator, which still contained some remnants of an earlier design I was experimenting with. Slightly tweak the gen_libstdcxx_unicode_data.py script so that the _Gcb_property enumerators are in the order we encounter them in the data file, instead of sorting them alphabetically. Start with the "Other" property at value 0, because that's the default property for anything not in the file. This makes no practical difference, but seems cleaner. It causes the values in the __gcb_edges table to change, so can only be done now before anybody is using this code yet. The enumerator values and table entries become ABI artefacts for the function using them. contrib/ChangeLog: * unicode/gen_libstdcxx_unicode_data.py: Print out Gcb_property enumerators in the order they're seen, not alphabetical order. libstdc++-v3/ChangeLog: * include/bits/unicode-data.h: Regenerate. * include/bits/unicode.h (_Utf_iterator::operator++()): Fix off by one error. (__incb_property): Add missing check for values before the first edge. (__is_extended_pictographic): Invert return values to fix copy&pasted logic. (_Grapheme_cluster_view::_Iterator): Remove second iterator member and find end of cluster lazily. * testsuite/ext/unicode/grapheme_view.cc: New test. * testsuite/ext/unicode/properties.cc: New test. * testsuite/ext/unicode/view.cc: New test.
2024-01-09 14:43:40 +00:00
// { dg-do compile { target c++20 } }
#include <format> // includes <bits/unicode.h>
#include <string_view>
#include <ranges>
#include <testsuite_hooks.h>
// Shorthand for the library-internal namespace whose Unicode property
// functions are exercised by the static assertions in this file.
namespace uc = std::__unicode;
// NOTE(review): none of the visible assertions uses a string_view literal,
// so this directive looks unused -- confirm before removing.
using namespace std::string_view_literals;
// The two Regional Indicator letters used as boundary probes, spelled with
// named universal character escapes.  Later assertions check these values
// as well as riA - 1 and riZ + 1.
constexpr char32_t riA = U'\N{REGIONAL INDICATOR SYMBOL LETTER A}';
constexpr char32_t riZ = U'\N{REGIONAL INDICATOR SYMBOL LETTER Z}';
// __field_width: compile-time checks of the width value (1 or 2) returned
// for individual code points.
// NOTE(review): the width-2 expectations presumably mirror the estimated
// display width rules of [format.string.std] -- confirm against the standard.
//
// A mix of width-1 and width-2 code points, mostly identified by name:
static_assert( uc::__field_width(U'\0') == 1 );
static_assert( uc::__field_width(U'1') == 1 );
static_assert( uc::__field_width(U'a') == 1 );
static_assert( uc::__field_width(riA) == 1 );
static_assert( uc::__field_width(U'\N{OBLIQUE HYPHEN}') == 1 );
static_assert( uc::__field_width(U'\N{CIRCLED NUMBER EIGHTY ON BLACK SQUARE}')
== 1 );
static_assert( uc::__field_width(U'\N{SESQUIQUADRATE}') == 1 );
static_assert( uc::__field_width(U'\N{SOCCER BALL}') == 2 );
static_assert( uc::__field_width(U'\N{BASEBALL}') == 2 );
static_assert( uc::__field_width(U'\N{SQUARED KEY}') == 1 );
static_assert( uc::__field_width(U'\N{BLACK DRAUGHTS KING}') == 1 );
static_assert( uc::__field_width(U'\N{SNOWMAN WITHOUT SNOW}') == 2 );
static_assert( uc::__field_width(U'\N{IDEOGRAPHIC SPACE}') == 2 );
static_assert( uc::__field_width(U'\N{IDEOGRAPHIC COMMA}') == 2 );
static_assert( uc::__field_width(U'\N{CIRCLED IDEOGRAPH ONE}') == 2 );
// EastAsianWidth.txt says these are normal width, but C++ says width 2:
static_assert( uc::__field_width(U'\u4DC0') == 2 );
static_assert( uc::__field_width(U'\u4DC1') == 2 );
static_assert( uc::__field_width(U'\u4DFF') == 2 );
// EastAsianWidth.txt says W and C++ says 2:
static_assert( uc::__field_width(U'\U0001F300') == 2 );
static_assert( uc::__field_width(U'\U0001F320') == 2 );
// EastAsianWidth.txt says N but C++ says 2:
static_assert( uc::__field_width(U'\U0001F321') == 2 );
static_assert( uc::__field_width(U'\U0001F5FA') == 2 );
// EastAsianWidth.txt says W and C++ says 2:
static_assert( uc::__field_width(U'\U0001F5FF') == 2 );
static_assert( uc::__field_width(U'\U0001F600') == 2 );
// Sample points from U+1F900 through U+1F9FF, every one expected to have
// width 2 (the EastAsianWidth.txt remark above is not claimed for these):
static_assert( uc::__field_width(U'\U0001F900') == 2 );
static_assert( uc::__field_width(U'\U0001F90B') == 2 );
static_assert( uc::__field_width(U'\U0001F90C') == 2 );
static_assert( uc::__field_width(U'\U0001F93B') == 2 );
static_assert( uc::__field_width(U'\U0001F9FF') == 2 );
// From U+1FA00 onwards the expected width is no longer uniformly 2:
// 1 at U+1FA00 and U+1FA69, 2 at U+1FA70 and U+1FAF8, then 1 at U+1FAF9.
static_assert( uc::__field_width(U'\U0001FA00') == 1 );
static_assert( uc::__field_width(U'\U0001FA69') == 1 );
static_assert( uc::__field_width(U'\U0001FA70') == 2 );
static_assert( uc::__field_width(U'\U0001FAF8') == 2 );
static_assert( uc::__field_width(U'\U0001FAF9') == 1 );
// __grapheme_cluster_break_property: compile-time checks of the
// _Gcb_property enumerator returned for individual code points.
//
// Bring the _Gcb_* enumerators into scope so the assertions can name them
// without qualification.
using enum uc::_Gcb_property;
static_assert( uc::__grapheme_cluster_break_property(U'\0') == _Gcb_Control );
static_assert( uc::__grapheme_cluster_break_property(U'a') == _Gcb_Other );
// riA and riZ themselves must be Regional_Indicator ...
static_assert( uc::__grapheme_cluster_break_property(riA)
== _Gcb_Regional_Indicator );
static_assert( uc::__grapheme_cluster_break_property(riZ)
== _Gcb_Regional_Indicator );
// ... while the code points immediately outside [riA, riZ] must be Other.
static_assert( uc::__grapheme_cluster_break_property(riA - 1) == _Gcb_Other );
static_assert( uc::__grapheme_cluster_break_property(riZ + 1) == _Gcb_Other );
// Code points expected to be classified as LV, LVT and ZWJ respectively:
static_assert( uc::__grapheme_cluster_break_property(U'\uD788') == _Gcb_LV );
static_assert( uc::__grapheme_cluster_break_property(U'\uD7A3') == _Gcb_LVT );
static_assert( uc::__grapheme_cluster_break_property(U'\u200D') == _Gcb_ZWJ );
contrib, libcpp, libstdc++: Update to Unicode 16.0 It is autumn again and there is a new Unicode version 16.0. The following patch updates our Unicode stuff in contrib, libcpp and libstdc++ from that Unicode version. 2024-10-08 Jakub Jelinek <jakub@redhat.com> contrib/ * unicode/README: Update glibc git commit hash, replace Unicode 15 or 15.1 versions with 16. * unicode/gen_libstdcxx_unicode_data.py: Use 160000 instead of 150100 in _GLIBCXX_GET_UNICODE_DATA test. * unicode/from_glibc/utf8_gen.py: Updated from glibc 064c708c78cc2a6b5802dce73108fc0c1c6bfc80 commit. * unicode/DerivedCoreProperties.txt: Updated from Unicode 16.0. * unicode/emoji-data.txt: Likewise. * unicode/PropList.txt: Likewise. * unicode/GraphemeBreakProperty.txt: Likewise. * unicode/DerivedNormalizationProps.txt: Likewise. * unicode/NameAliases.txt: Likewise. * unicode/UnicodeData.txt: Likewise. * unicode/EastAsianWidth.txt: Likewise. gcc/testsuite/ * c-c++-common/cpp/named-universal-char-escape-1.c: Add tests for some Unicode 16.0 characters, both normal and generated. libcpp/ * makeucnid.cc (write_copyright): Update Unicode Copyright years. * makeuname2c.cc (generated_ranges): Adjust Unicode version from 15.1 to 16.0. Add EGYPTIAN HIEROGLYPH- generated range, adjust indexes in following entries. (write_copyright): Update Unicode Copyright years. * generated_cpp_wcwidth.h: Regenerated. * ucnid.h: Regenerated. * uname2c.h: Regenerated. libstdc++-v3/ * include/bits/unicode.h (std::__unicode::__v15_1_0): Rename inline namespace to ... (std::__unicode::__v16_0_0): ... this. (_GLIBCXX_GET_UNICODE_DATA): Change from 150100 to 160000. * include/bits/unicode-data.h: Regenerated. * testsuite/ext/unicode/properties.cc: Check for _Gcb_SpacingMark on U+11F03 rather than U+1D16D as the latter lost SpacingMark property in Unicode 16.0.
2024-10-08 10:01:47 +02:00
static_assert( uc::__grapheme_cluster_break_property(U'\U00011F03')
libstdc++: Fix Unicode property detection functions Fix some copy & pasted logic in __is_extended_pictographic. This function should yield false for the values before the first edge, not true. Also add a missing boundary condition check in __incb_property. Also Fix an off-by-one error in _Utf_iterator::operator++() that would make dereferencing a past-the-end iterator undefined (where the intended design is that the iterator is always incrementable and dereferenceable, for better memory safety). Also simplify the grapheme view iterator, which still contained some remnants of an earlier design I was experimenting with. Slightly tweak the gen_libstdcxx_unicode_data.py script so that the _Gcb_property enumerators are in the order we encounter them in the data file, instead of sorting them alphabetically. Start with the "Other" property at value 0, because that's the default property for anything not in the file. This makes no practical difference, but seems cleaner. It causes the values in the __gcb_edges table to change, so can only be done now before anybody is using this code yet. The enumerator values and table entries become ABI artefacts for the function using them. contrib/ChangeLog: * unicode/gen_libstdcxx_unicode_data.py: Print out Gcb_property enumerators in the order they're seen, not alphabetical order. libstdc++-v3/ChangeLog: * include/bits/unicode-data.h: Regenerate. * include/bits/unicode.h (_Utf_iterator::operator++()): Fix off by one error. (__incb_property): Add missing check for values before the first edge. (__is_extended_pictographic): Invert return values to fix copy&pasted logic. (_Grapheme_cluster_view::_Iterator): Remove second iterator member and find end of cluster lazily. * testsuite/ext/unicode/grapheme_view.cc: New test. * testsuite/ext/unicode/properties.cc: New test. * testsuite/ext/unicode/view.cc: New test.
2024-01-09 14:43:40 +00:00
== _Gcb_SpacingMark );
// Further __grapheme_cluster_break_property checks.  U+1D16E is expected to
// be Extend; the remaining four probe adjacent code points where the
// expected class changes: Extend at U+E01EF, Control at U+E01F0 and
// U+E0FFF, and Other at U+E1000.
static_assert( uc::__grapheme_cluster_break_property(U'\U0001D16E')
== _Gcb_Extend );
static_assert( uc::__grapheme_cluster_break_property(U'\U000E01EF')
== _Gcb_Extend );
static_assert( uc::__grapheme_cluster_break_property(U'\U000E01F0')
== _Gcb_Control );
static_assert( uc::__grapheme_cluster_break_property(U'\U000E0FFF')
== _Gcb_Control );
static_assert( uc::__grapheme_cluster_break_property(U'\U000E1000')
== _Gcb_Other );
// __incb_property: compile-time checks of the uc::_InCB value returned for
// individual code points (InCB is presumably Unicode's Indic_Conjunct_Break
// property -- confirm).  uc::_InCB{0} is the value expected here for code
// points that are neither _Consonant nor _Extend.
//
// U'\0' is the lowest possible input; U'a' is another code point expected
// to carry no InCB value.
static_assert( uc::__incb_property(U'\0') == uc::_InCB{0} );
static_assert( uc::__incb_property(U'a') == uc::_InCB{0} );
// Three Devanagari letters expected to be _Consonant, followed by the code
// point right after the last of them, which must not be.
static_assert( uc::__incb_property(U'\N{DEVANAGARI LETTER KA}')
== uc::_InCB::_Consonant );
static_assert( uc::__incb_property(U'\N{DEVANAGARI LETTER RA}')
== uc::_InCB::_Consonant );
static_assert( uc::__incb_property(U'\N{DEVANAGARI LETTER YYA}')
== uc::_InCB::_Consonant );
static_assert( uc::__incb_property(U'\N{DEVANAGARI LETTER YYA}' + 1)
== uc::_InCB{0} );
// DEVANAGARI SIGN NUKTA is expected to be _Extend; its successor is not.
static_assert( uc::__incb_property(U'\N{DEVANAGARI SIGN NUKTA}')
== uc::_InCB::_Extend );
static_assert( uc::__incb_property(U'\N{DEVANAGARI SIGN NUKTA}' + 1)
== uc::_InCB{0} );
// Adjacent pair where the expected value drops from _Extend (U+1E94A) back
// to _InCB{0} (U+1E94B).
static_assert( uc::__incb_property(U'\U0001E94A') == uc::_InCB::_Extend );
static_assert( uc::__incb_property(U'\U0001E94B') == uc::_InCB{0} );
// __is_incb_linker: among the code points checked here only DEVANAGARI SIGN
// VIRAMA is expected to be a linker.  Its immediate neighbours (+1 and -1)
// and a handful of unrelated code points must all yield false.
static_assert( ! uc::__is_incb_linker(U'\0') );
static_assert( ! uc::__is_incb_linker(U'a') );
static_assert( uc::__is_incb_linker(U'\N{DEVANAGARI SIGN VIRAMA}') );
static_assert( ! uc::__is_incb_linker(U'\N{DEVANAGARI SIGN VIRAMA}' + 1) );
static_assert( ! uc::__is_incb_linker(U'\N{DEVANAGARI SIGN VIRAMA}' - 1) );
static_assert( ! uc::__is_incb_linker(U'\u0FFF') );
static_assert( ! uc::__is_incb_linker(U'\uFFFD') );
// __is_extended_pictographic: compile-time checks of the boolean result for
// individual code points, concentrating on positions where the expected
// answer flips between neighbouring values.
//
// Low code points and the Regional Indicator letters must yield false.
static_assert( ! uc::__is_extended_pictographic(U'\0') );
static_assert( ! uc::__is_extended_pictographic(U'a') );
static_assert( ! uc::__is_extended_pictographic(riA) );
static_assert( ! uc::__is_extended_pictographic(riZ) );
// COPYRIGHT SIGN is expected to be pictographic; both neighbours are not.
static_assert( ! uc::__is_extended_pictographic(U'\N{COPYRIGHT SIGN}' - 1) );
static_assert( uc::__is_extended_pictographic(U'\N{COPYRIGHT SIGN}') );
static_assert( ! uc::__is_extended_pictographic(U'\N{COPYRIGHT SIGN}' + 1) );
// Same pattern for INFORMATION SOURCE.
static_assert( ! uc::__is_extended_pictographic(U'\N{INFORMATION SOURCE}' - 1) );
static_assert( uc::__is_extended_pictographic(U'\N{INFORMATION SOURCE}') );
static_assert( ! uc::__is_extended_pictographic(U'\N{INFORMATION SOURCE}' + 1) );
// LEFT RIGHT ARROW, its successor and SOUTH WEST ARROW are expected to be
// pictographic; the code point just before the first and the one just after
// the last are not.
static_assert( ! uc::__is_extended_pictographic(U'\N{LEFT RIGHT ARROW}' - 1) );
static_assert( uc::__is_extended_pictographic(U'\N{LEFT RIGHT ARROW}') );
static_assert( uc::__is_extended_pictographic(U'\N{LEFT RIGHT ARROW}' + 1) );
static_assert( uc::__is_extended_pictographic(U'\N{SOUTH WEST ARROW}') );
static_assert( ! uc::__is_extended_pictographic(U'\N{SOUTH WEST ARROW}' + 1) );
static_assert( uc::__is_extended_pictographic(U'\N{POSTBOX}') );
// Upper boundary probes: false at U+1EFFF, true at U+1F000 and U+1FFFD,
// false again at U+1FFFE and U+1FFFF.
static_assert( ! uc::__is_extended_pictographic(U'\U0001EFFF') );
static_assert( uc::__is_extended_pictographic(U'\U0001F000') );
static_assert( uc::__is_extended_pictographic(U'\U0001FFFD') );
static_assert( ! uc::__is_extended_pictographic(U'\U0001FFFE') );
static_assert( ! uc::__is_extended_pictographic(U'\U0001FFFF') );