libstdc++: Fix handling of surrogate CP in codecvt [PR108976]
This patch fixes the handling of surrogate code points in all standard facets for transcoding Unicode that are based on std::codecvt. Surrogate code points should always be treated as an error. On the other hand, surrogate code units can only appear in UTF-16, and only when they come in a proper pair. Additionally, it fixes a bug in std::codecvt_utf16::in(): when an odd number of bytes was given in the range [from, from_end), error was always returned. The last byte in such a range does not form a full UTF-16 code unit, so no decision about an error can be made; partial should be returned instead. The testsuite for testing these facets was updated in the following order: 1. All functions that test codecvts that work with UTF-8 were refactored and made more generic so that they also accept a codecvt that works with the char type char8_t. 2. The same functions were updated with new test cases for transcoding errors and now additionally test for surrogates, overlong UTF-8 sequences, code points out of the Unicode range, and more tests for missing leading and trailing code units. 3. New tests were added to test codecvt_utf16 in both of its variants, UTF-16 <-> UTF-32/UCS-4 and UTF-16 <-> UCS-2. libstdc++-v3/ChangeLog: PR libstdc++/108976 * src/c++11/codecvt.cc (read_utf8_code_point): Fix handling of surrogates in UTF-8. (ucs4_out): Fix handling of surrogates in UCS-4 -> UTF-8. (ucs4_in): Fix handling of range with odd number of bytes. (ucs4_out): Fix handling of surrogates in UCS-4 -> UTF-16. (ucs2_out): Fix handling of surrogates in UCS-2 -> UTF-16. (ucs2_in): Fix handling of range with odd number of bytes. (__codecvt_utf16_base<char16_t>::do_in): Likewise. (__codecvt_utf16_base<char32_t>::do_in): Likewise. (__codecvt_utf16_base<wchar_t>::do_in): Likewise. * testsuite/22_locale/codecvt/codecvt_unicode.cc: Renames, add tests for codecvt_utf16<char16_t> and codecvt_utf16<char32_t>.
* testsuite/22_locale/codecvt/codecvt_unicode.h: Refactor UTF-8 testing functions for char8_t, add more test cases for errors, add testing functions for codecvt_utf16. * testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc: Renames, add tests for codecvt_utf16<wchar_t>. * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc (test06): Fix test. * testsuite/22_locale/codecvt/codecvt_unicode_char8_t.cc: New test.
This commit is contained in:
parent
28adad7a32
commit
a8b9c32da7
6 changed files with 1488 additions and 444 deletions
|
@ -284,6 +284,8 @@ namespace
|
|||
return invalid_mb_sequence;
|
||||
if (c1 == 0xE0 && c2 < 0xA0) [[unlikely]] // overlong
|
||||
return invalid_mb_sequence;
|
||||
if (c1 == 0xED && c2 >= 0xA0) [[unlikely]] // surrogate
|
||||
return invalid_mb_sequence;
|
||||
if (avail < 3) [[unlikely]]
|
||||
return incomplete_mb_character;
|
||||
char32_t c3 = (unsigned char) from[2];
|
||||
|
@ -484,6 +486,8 @@ namespace
|
|||
while (from.size())
|
||||
{
|
||||
const char32_t c = from[0];
|
||||
if (0xD800 <= c && c <= 0xDFFF) [[unlikely]]
|
||||
return codecvt_base::error;
|
||||
if (c > maxcode) [[unlikely]]
|
||||
return codecvt_base::error;
|
||||
if (!write_utf8_code_point(to, c)) [[unlikely]]
|
||||
|
@ -508,7 +512,7 @@ namespace
|
|||
return codecvt_base::error;
|
||||
to = codepoint;
|
||||
}
|
||||
return from.size() ? codecvt_base::partial : codecvt_base::ok;
|
||||
return from.nbytes() ? codecvt_base::partial : codecvt_base::ok;
|
||||
}
|
||||
|
||||
// ucs4 -> utf16
|
||||
|
@ -521,6 +525,8 @@ namespace
|
|||
while (from.size())
|
||||
{
|
||||
const char32_t c = from[0];
|
||||
if (0xD800 <= c && c <= 0xDFFF) [[unlikely]]
|
||||
return codecvt_base::error;
|
||||
if (c > maxcode) [[unlikely]]
|
||||
return codecvt_base::error;
|
||||
if (!write_utf16_code_point(to, c, mode)) [[unlikely]]
|
||||
|
@ -653,7 +659,7 @@ namespace
|
|||
while (from.size() && to.size())
|
||||
{
|
||||
char16_t c = from[0];
|
||||
if (is_high_surrogate(c))
|
||||
if (0xD800 <= c && c <= 0xDFFF)
|
||||
return codecvt_base::error;
|
||||
if (c > maxcode)
|
||||
return codecvt_base::error;
|
||||
|
@ -680,7 +686,7 @@ namespace
|
|||
return codecvt_base::error;
|
||||
to = c;
|
||||
}
|
||||
return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
|
||||
return from.nbytes() == 0 ? codecvt_base::ok : codecvt_base::partial;
|
||||
}
|
||||
|
||||
const char16_t*
|
||||
|
@ -1344,8 +1350,6 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
|
|||
auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
|
||||
__from_next = reinterpret_cast<const char*>(from.next);
|
||||
__to_next = to.next;
|
||||
if (res == codecvt_base::ok && __from_next != __from_end)
|
||||
res = codecvt_base::error;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -1419,8 +1423,6 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
|
|||
auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
|
||||
__from_next = reinterpret_cast<const char*>(from.next);
|
||||
__to_next = to.next;
|
||||
if (res == codecvt_base::ok && __from_next != __from_end)
|
||||
res = codecvt_base::error;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -1521,8 +1523,6 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
|
|||
#endif
|
||||
__from_next = reinterpret_cast<const char*>(from.next);
|
||||
__to_next = reinterpret_cast<wchar_t*>(to.next);
|
||||
if (res == codecvt_base::ok && __from_next != __from_end)
|
||||
res = codecvt_base::error;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
@ -27,38 +27,58 @@ void
|
|||
test_utf8_utf32_codecvts ()
|
||||
{
|
||||
using codecvt_c32 = codecvt<char32_t, char, mbstate_t>;
|
||||
auto loc_c = locale::classic ();
|
||||
auto &loc_c = locale::classic ();
|
||||
VERIFY (has_facet<codecvt_c32> (loc_c));
|
||||
|
||||
auto &cvt = use_facet<codecvt_c32> (loc_c);
|
||||
test_utf8_utf32_codecvts (cvt);
|
||||
test_utf8_utf32_cvt (cvt);
|
||||
|
||||
codecvt_utf8<char32_t> cvt2;
|
||||
test_utf8_utf32_codecvts (cvt2);
|
||||
test_utf8_utf32_cvt (cvt2);
|
||||
}
|
||||
|
||||
void
|
||||
test_utf8_utf16_codecvts ()
|
||||
{
|
||||
using codecvt_c16 = codecvt<char16_t, char, mbstate_t>;
|
||||
auto loc_c = locale::classic ();
|
||||
auto &loc_c = locale::classic ();
|
||||
VERIFY (has_facet<codecvt_c16> (loc_c));
|
||||
|
||||
auto &cvt = use_facet<codecvt_c16> (loc_c);
|
||||
test_utf8_utf16_cvts (cvt);
|
||||
test_utf8_utf16_cvt (cvt);
|
||||
|
||||
codecvt_utf8_utf16<char16_t> cvt2;
|
||||
test_utf8_utf16_cvts (cvt2);
|
||||
test_utf8_utf16_cvt (cvt2);
|
||||
|
||||
codecvt_utf8_utf16<char32_t> cvt3;
|
||||
test_utf8_utf16_cvts (cvt3);
|
||||
test_utf8_utf16_cvt (cvt3);
|
||||
}
|
||||
|
||||
void
|
||||
test_utf8_ucs2_codecvts ()
|
||||
{
|
||||
codecvt_utf8<char16_t> cvt;
|
||||
test_utf8_ucs2_cvts (cvt);
|
||||
test_utf8_ucs2_cvt (cvt);
|
||||
}
|
||||
|
||||
void
|
||||
test_utf16_utf32_codecvts ()
|
||||
{
|
||||
codecvt_utf16<char32_t> cvt;
|
||||
test_utf16_utf32_cvt (cvt, utf16_big_endian);
|
||||
|
||||
codecvt_utf16<char32_t, 0x10FFFF, codecvt_mode::little_endian> cvt2;
|
||||
test_utf16_utf32_cvt (cvt2, utf16_little_endian);
|
||||
}
|
||||
|
||||
void
|
||||
test_utf16_ucs2_codecvts ()
|
||||
{
|
||||
codecvt_utf16<char16_t> cvt;
|
||||
test_utf16_ucs2_cvt (cvt, utf16_big_endian);
|
||||
|
||||
codecvt_utf16<char16_t, 0x10FFFF, codecvt_mode::little_endian> cvt2;
|
||||
test_utf16_ucs2_cvt (cvt2, utf16_little_endian);
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -67,4 +87,6 @@ main ()
|
|||
test_utf8_utf32_codecvts ();
|
||||
test_utf8_utf16_codecvts ();
|
||||
test_utf8_ucs2_codecvts ();
|
||||
test_utf16_utf32_codecvts ();
|
||||
test_utf16_ucs2_codecvts ();
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,53 @@
|
|||
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
|
||||
//
|
||||
// This file is part of the GNU ISO C++ Library. This library is free
|
||||
// software; you can redistribute it and/or modify it under the
|
||||
// terms of the GNU General Public License as published by the
|
||||
// Free Software Foundation; either version 3, or (at your option)
|
||||
// any later version.
|
||||
|
||||
// This library is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License along
|
||||
// with this library; see the file COPYING3. If not see
|
||||
// <http://www.gnu.org/licenses/>.
|
||||
|
||||
// { dg-do run { target c++11 } }
|
||||
// { dg-require-cstdint "" }
|
||||
// { dg-options "-fchar8_t" }
|
||||
|
||||
#include "codecvt_unicode.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void
|
||||
test_utf8_utf32_codecvts ()
|
||||
{
|
||||
using codecvt_c32_c8 = codecvt<char32_t, char8_t, mbstate_t>;
|
||||
auto &loc_c = locale::classic ();
|
||||
VERIFY (has_facet<codecvt_c32_c8> (loc_c));
|
||||
|
||||
auto &cvt = use_facet<codecvt_c32_c8> (loc_c);
|
||||
test_utf8_utf32_cvt (cvt);
|
||||
}
|
||||
|
||||
void
|
||||
test_utf8_utf16_codecvts ()
|
||||
{
|
||||
using codecvt_c16_c8 = codecvt<char16_t, char8_t, mbstate_t>;
|
||||
auto &loc_c = locale::classic ();
|
||||
VERIFY (has_facet<codecvt_c16_c8> (loc_c));
|
||||
|
||||
auto &cvt = use_facet<codecvt_c16_c8> (loc_c);
|
||||
test_utf8_utf16_cvt (cvt);
|
||||
}
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
test_utf8_utf32_codecvts ();
|
||||
test_utf8_utf16_codecvts ();
|
||||
}
|
|
@ -28,7 +28,7 @@ test_utf8_utf32_codecvts ()
|
|||
{
|
||||
#if __SIZEOF_WCHAR_T__ == 4
|
||||
codecvt_utf8<wchar_t> cvt;
|
||||
test_utf8_utf32_codecvts (cvt);
|
||||
test_utf8_utf32_cvt (cvt);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -37,7 +37,7 @@ test_utf8_utf16_codecvts ()
|
|||
{
|
||||
#if __SIZEOF_WCHAR_T__ >= 2
|
||||
codecvt_utf8_utf16<wchar_t> cvt;
|
||||
test_utf8_utf16_cvts (cvt);
|
||||
test_utf8_utf16_cvt (cvt);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -46,7 +46,31 @@ test_utf8_ucs2_codecvts ()
|
|||
{
|
||||
#if __SIZEOF_WCHAR_T__ == 2
|
||||
codecvt_utf8<wchar_t> cvt;
|
||||
test_utf8_ucs2_cvts (cvt);
|
||||
test_utf8_ucs2_cvt (cvt);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
test_utf16_utf32_codecvts ()
|
||||
{
|
||||
#if __SIZEOF_WCHAR_T__ == 4
|
||||
codecvt_utf16<wchar_t> cvt3;
|
||||
test_utf16_utf32_cvt (cvt3, utf16_big_endian);
|
||||
|
||||
codecvt_utf16<wchar_t, 0x10FFFF, codecvt_mode::little_endian> cvt4;
|
||||
test_utf16_utf32_cvt (cvt4, utf16_little_endian);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
test_utf16_ucs2_codecvts ()
|
||||
{
|
||||
#if __SIZEOF_WCHAR_T__ == 2
|
||||
codecvt_utf16<wchar_t> cvt3;
|
||||
test_utf16_ucs2_cvt (cvt3, utf16_big_endian);
|
||||
|
||||
codecvt_utf16<wchar_t, 0x10FFFF, codecvt_mode::little_endian> cvt4;
|
||||
test_utf16_ucs2_cvt (cvt4, utf16_little_endian);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -56,4 +80,6 @@ main ()
|
|||
test_utf8_utf32_codecvts ();
|
||||
test_utf8_utf16_codecvts ();
|
||||
test_utf8_ucs2_codecvts ();
|
||||
test_utf16_utf32_codecvts ();
|
||||
test_utf16_ucs2_codecvts ();
|
||||
}
|
||||
|
|
|
@ -83,7 +83,7 @@ test06()
|
|||
const char src[] = "\0\x61\xAB\xCD";
|
||||
Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
|
||||
std::u16string result = conv.from_bytes(src, src+3); // incomplete character
|
||||
VERIFY( result == u"from_bytes failed" );
|
||||
VERIFY( result == u"\u0061" );
|
||||
VERIFY( conv.converted() == 2 );
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue