libstdc++: Fix wstring conversions in filesystem::path [PR95048]
In commit r9-7381-g91756c4abc1757 I changed filesystem::path to use std::codecvt<CharT, char, mbstate_t> for conversions from all wide strings to UTF-8, instead of using std::codecvt_utf8<CharT>. This was done because for 16-bit wchar_t, std::codecvt_utf8<wchar_t> only supports UCS-2 and not UTF-16. The rationale for the change was sound, but the actual fix was not. It's OK to use std::codecvt for char16_t or char32_t, because the specializations for those types always use UTF-8, but std::codecvt<wchar_t, char, mbstate_t> uses the current locale's encodings, and the narrow encoding is probably ASCII and can't support non-ASCII characters. The correct fix is to use std::codecvt only for char16_t and char32_t. For 32-bit wchar_t we could have continued using std::codecvt_utf8 because that uses UTF-32 which is fine; switching to std::codecvt broke non-Windows targets with 32-bit wchar_t. For 16-bit wchar_t we did need to change, but should have changed to std::codecvt_utf8_utf16<wchar_t> instead, as that always uses UTF-16 not UCS-2. I actually noted that in the commit message for r9-7381-g91756c4abc1757 but didn't use that option. Oops. This replaces the unconditional std::codecvt<CharT, char, mbstate_t> with a type defined via template specialization, so it can vary depending on the wide character type. The code is also simplified to remove some of the mess of #ifdef and if-constexpr conditions. libstdc++-v3/ChangeLog: PR libstdc++/95048 * include/bits/fs_path.h (path::_Codecvt): New class template that selects the kind of code conversion done. (path::_Codecvt<wchar_t>): Select based on sizeof(wchar_t). (_GLIBCXX_CONV_FROM_UTF8): New macro to allow the same code to be used for Windows and POSIX. (path::_S_convert(const EcharT*, const EcharT*)): Simplify by using _Codecvt and _GLIBCXX_CONV_FROM_UTF8 abstractions. (path::_S_str_convert(basic_string_view<value_type>, const A&)): Simplify nested conditions. 
* include/experimental/bits/fs_path.h (path::_Cvt): Define nested typedef controlling type of code conversion done. (path::_Cvt::_S_wconvert): Use new typedef. (path::string(const A&)): Likewise. * testsuite/27_io/filesystem/path/construct/95048.cc: New test. * testsuite/experimental/filesystem/path/construct/95048.cc: New test.
This commit is contained in:
parent
dab5d73959
commit
b331bf303b
4 changed files with 204 additions and 67 deletions
|
@ -727,6 +727,8 @@ namespace __detail
|
|||
_List _M_cmpts;
|
||||
|
||||
struct _Parser;
|
||||
|
||||
template<typename _EcharT> struct _Codecvt;
|
||||
};
|
||||
|
||||
/// @{
|
||||
|
@ -855,55 +857,72 @@ namespace __detail
|
|||
size_t _M_pos;
|
||||
};
|
||||
|
||||
// path::_Codecvt<C> Performs conversions between C and path::string_type.
|
||||
// The native encoding of char strings is the OS-dependent current
|
||||
// encoding for pathnames. FIXME: We assume this is UTF-8 everywhere,
|
||||
// but should use a Windows API to query it.
|
||||
|
||||
// Converts between native pathname encoding and char16_t or char32_t.
|
||||
template<typename _EcharT>
|
||||
struct path::_Codecvt
|
||||
// Need derived class here because std::codecvt has protected destructor.
|
||||
: std::codecvt<_EcharT, char, mbstate_t>
|
||||
{ };
|
||||
|
||||
// Converts between native pathname encoding and native wide encoding.
|
||||
// The native encoding for wide strings is the execution wide-character
|
||||
// set encoding. FIXME: We assume that this is either UTF-32 or UTF-16
|
||||
// (depending on the width of wchar_t). That matches GCC's default,
|
||||
// but can be changed with -fwide-exec-charset.
|
||||
// We need a custom codecvt converting the native pathname encoding
|
||||
// to/from the native wide encoding.
|
||||
template<>
|
||||
struct path::_Codecvt<wchar_t>
|
||||
: __conditional_t<sizeof(wchar_t) == sizeof(char32_t),
|
||||
std::codecvt_utf8<wchar_t>, // UTF-8 <-> UTF-32
|
||||
std::codecvt_utf8_utf16<wchar_t>> // UTF-8 <-> UTF-16
|
||||
{ };
|
||||
|
||||
template<typename _EcharT>
|
||||
auto
|
||||
path::_S_convert(const _EcharT* __f, const _EcharT* __l)
|
||||
{
|
||||
static_assert(__detail::__is_encoded_char<_EcharT>);
|
||||
|
||||
#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS
|
||||
# define _GLIBCXX_CONV_FROM_UTF8(S) __detail::__wstr_from_utf8(S)
|
||||
#else
|
||||
# define _GLIBCXX_CONV_FROM_UTF8(S) S
|
||||
#endif
|
||||
|
||||
if constexpr (is_same_v<_EcharT, value_type>)
|
||||
return basic_string_view<value_type>(__f, __l - __f);
|
||||
#if !defined _GLIBCXX_FILESYSTEM_IS_WINDOWS && defined _GLIBCXX_USE_CHAR8_T
|
||||
#ifdef _GLIBCXX_USE_CHAR8_T
|
||||
else if constexpr (is_same_v<_EcharT, char8_t>)
|
||||
// For POSIX converting from char8_t to char is also 'noconv'
|
||||
return string_view(reinterpret_cast<const char*>(__f), __l - __f);
|
||||
{
|
||||
string_view __str(reinterpret_cast<const char*>(__f), __l - __f);
|
||||
return _GLIBCXX_CONV_FROM_UTF8(__str);
|
||||
}
|
||||
#endif
|
||||
#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS
|
||||
else if constexpr (is_same_v<_EcharT, char>)
|
||||
{
|
||||
std::wstring __wstr;
|
||||
path::_Codecvt<wchar_t> __cvt;
|
||||
if (__str_codecvt_in_all(__f, __l, __wstr, __cvt))
|
||||
return __wstr;
|
||||
}
|
||||
#endif
|
||||
else
|
||||
{
|
||||
#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS
|
||||
std::wstring __wstr;
|
||||
if constexpr (is_same_v<_EcharT, char>)
|
||||
{
|
||||
struct _UCvt : std::codecvt<wchar_t, char, std::mbstate_t>
|
||||
{ } __cvt;
|
||||
if (__str_codecvt_in_all(__f, __l, __wstr, __cvt))
|
||||
return __wstr;
|
||||
}
|
||||
#ifdef _GLIBCXX_USE_CHAR8_T
|
||||
else if constexpr (is_same_v<_EcharT, char8_t>)
|
||||
{
|
||||
const auto __f2 = reinterpret_cast<const char*>(__f);
|
||||
return __detail::__wstr_from_utf8(string_view(__f2, __l - __f));
|
||||
}
|
||||
#endif
|
||||
else // char16_t or char32_t
|
||||
{
|
||||
struct _UCvt : std::codecvt<_EcharT, char, std::mbstate_t>
|
||||
{ } __cvt;
|
||||
std::string __str;
|
||||
if (__str_codecvt_out_all(__f, __l, __str, __cvt))
|
||||
return __detail::__wstr_from_utf8(__str);
|
||||
}
|
||||
#else // ! windows
|
||||
struct _UCvt : std::codecvt<_EcharT, char, std::mbstate_t>
|
||||
{ } __cvt;
|
||||
path::_Codecvt<_EcharT> __cvt;
|
||||
std::string __str;
|
||||
if (__str_codecvt_out_all(__f, __l, __str, __cvt))
|
||||
return __str;
|
||||
#endif
|
||||
__detail::__throw_conversion_error();
|
||||
return _GLIBCXX_CONV_FROM_UTF8(__str);
|
||||
}
|
||||
__detail::__throw_conversion_error();
|
||||
}
|
||||
#undef _GLIBCXX_CONV_FROM_UTF8
|
||||
|
||||
/// @endcond
|
||||
|
||||
|
@ -1085,7 +1104,9 @@ namespace __detail
|
|||
if (__str.size() == 0)
|
||||
return _WString(__a);
|
||||
|
||||
#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS
|
||||
#ifndef _GLIBCXX_FILESYSTEM_IS_WINDOWS
|
||||
string_view __u8str = __str;
|
||||
#else
|
||||
// First convert native string from UTF-16 to UTF-8.
|
||||
// XXX This assumes that the execution wide-character set is UTF-16.
|
||||
std::codecvt_utf8_utf16<value_type> __cvt;
|
||||
|
@ -1095,35 +1116,30 @@ namespace __detail
|
|||
_String __u8str{_CharAlloc{__a}};
|
||||
const value_type* __wfirst = __str.data();
|
||||
const value_type* __wlast = __wfirst + __str.size();
|
||||
if (__str_codecvt_out_all(__wfirst, __wlast, __u8str, __cvt)) {
|
||||
if (!__str_codecvt_out_all(__wfirst, __wlast, __u8str, __cvt))
|
||||
__detail::__throw_conversion_error();
|
||||
if constexpr (is_same_v<_CharT, char>)
|
||||
return __u8str; // XXX assumes native ordinary encoding is UTF-8.
|
||||
else {
|
||||
|
||||
const char* __first = __u8str.data();
|
||||
const char* __last = __first + __u8str.size();
|
||||
#else
|
||||
const value_type* __first = __str.data();
|
||||
const value_type* __last = __first + __str.size();
|
||||
#endif
|
||||
|
||||
// Convert UTF-8 string to requested format.
|
||||
#ifdef _GLIBCXX_USE_CHAR8_T
|
||||
if constexpr (is_same_v<_CharT, char8_t>)
|
||||
return _WString(__first, __last, __a);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// Convert UTF-8 to wide string.
|
||||
_WString __wstr(__a);
|
||||
struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t> { } __cvt;
|
||||
if (__str_codecvt_in_all(__first, __last, __wstr, __cvt))
|
||||
return __wstr;
|
||||
}
|
||||
const char* __first = __u8str.data();
|
||||
const char* __last = __first + __u8str.size();
|
||||
|
||||
#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS
|
||||
} }
|
||||
// Convert UTF-8 string to requested format.
|
||||
#ifdef _GLIBCXX_USE_CHAR8_T
|
||||
if constexpr (is_same_v<_CharT, char8_t>)
|
||||
return _WString(__first, __last, __a);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// Convert UTF-8 to wide string.
|
||||
_WString __wstr(__a);
|
||||
path::_Codecvt<_CharT> __cvt;
|
||||
if (__str_codecvt_in_all(__first, __last, __wstr, __cvt))
|
||||
return __wstr;
|
||||
}
|
||||
}
|
||||
__detail::__throw_conversion_error();
|
||||
}
|
||||
/// @endcond
|
||||
|
|
|
@ -734,15 +734,47 @@ namespace __detail
|
|||
template<>
|
||||
struct path::_Cvt<path::value_type>
|
||||
{
|
||||
// We need this type to be defined because we don't have `if constexpr`
|
||||
// in C++11 and so path::string<C,T,A>(const A&) needs to be able to
|
||||
// declare a variable of this type and pass it to __str_codecvt_in_all.
|
||||
using __codecvt_utf8_to_wide = _Cvt;
|
||||
// Dummy overload used for unreachable calls in path::string<C,T,A>.
|
||||
template<typename _WStr>
|
||||
friend bool
|
||||
__str_codecvt_in_all(const char*, const char*,
|
||||
_WStr&, __codecvt_utf8_to_wide&) noexcept
|
||||
{ return true; }
|
||||
|
||||
template<typename _Iter>
|
||||
static string_type
|
||||
_S_convert(_Iter __first, _Iter __last)
|
||||
{ return string_type{__first, __last}; }
|
||||
};
|
||||
|
||||
// Performs conversions from _CharT to path::string_type.
|
||||
template<typename _CharT>
|
||||
struct path::_Cvt
|
||||
{
|
||||
// FIXME: We currently assume that the native wide encoding for wchar_t
|
||||
// is either UTF-32 or UTF-16 (depending on the width of wchar_t).
|
||||
// See comments in <bits/fs_path.h> for further details.
|
||||
using __codecvt_utf8_to_wchar
|
||||
= __conditional_t<sizeof(wchar_t) == sizeof(char32_t),
|
||||
std::codecvt_utf8<wchar_t>, // from UTF-32
|
||||
std::codecvt_utf8_utf16<wchar_t>>; // from UTF-16
|
||||
|
||||
// Converts from char16_t or char32_t using std::codecvt<charNN_t, char>.
|
||||
// Need derived class here because std::codecvt has protected destructor.
|
||||
struct __codecvt_utf8_to_utfNN : std::codecvt<_CharT, char, mbstate_t>
|
||||
{ };
|
||||
|
||||
// Convert from native pathname format (assumed to be UTF-8 everywhere)
|
||||
// to the encoding implied by the wide character type _CharT.
|
||||
using __codecvt_utf8_to_wide
|
||||
= __conditional_t<is_same<_CharT, wchar_t>::value,
|
||||
__codecvt_utf8_to_wchar,
|
||||
__codecvt_utf8_to_utfNN>;
|
||||
|
||||
#ifdef _GLIBCXX_FILESYSTEM_IS_WINDOWS
|
||||
#ifdef _GLIBCXX_USE_CHAR8_T
|
||||
static string_type
|
||||
|
@ -760,7 +792,7 @@ namespace __detail
|
|||
static string_type
|
||||
_S_wconvert(const char* __f, const char* __l, const char*)
|
||||
{
|
||||
using _Cvt = std::codecvt<wchar_t, char, mbstate_t>;
|
||||
using _Cvt = std::codecvt_utf8_utf16<wchar_t>;
|
||||
const auto& __cvt = std::use_facet<_Cvt>(std::locale{});
|
||||
std::wstring __wstr;
|
||||
if (__str_codecvt_in_all(__f, __l, __wstr, __cvt))
|
||||
|
@ -773,8 +805,7 @@ namespace __detail
|
|||
static string_type
|
||||
_S_wconvert(const _CharT* __f, const _CharT* __l, const void*)
|
||||
{
|
||||
struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t>
|
||||
{ } __cvt;
|
||||
__codecvt_utf8_to_wide __cvt;
|
||||
std::string __str;
|
||||
if (__str_codecvt_out_all(__f, __l, __str, __cvt))
|
||||
{
|
||||
|
@ -805,8 +836,7 @@ namespace __detail
|
|||
else
|
||||
#endif
|
||||
{
|
||||
struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t>
|
||||
{ } __cvt;
|
||||
__codecvt_utf8_to_wide __cvt;
|
||||
std::string __str;
|
||||
if (__str_codecvt_out_all(__f, __l, __str, __cvt))
|
||||
return __str;
|
||||
|
@ -1013,7 +1043,7 @@ namespace __detail
|
|||
inline std::basic_string<_CharT, _Traits, _Allocator>
|
||||
path::string(const _Allocator& __a) const
|
||||
{
|
||||
if (is_same<_CharT, value_type>::value)
|
||||
if _GLIBCXX_CONSTEXPR (is_same<_CharT, value_type>::value)
|
||||
return { _M_pathname.begin(), _M_pathname.end(), __a };
|
||||
|
||||
using _WString = basic_string<_CharT, _Traits, _Allocator>;
|
||||
|
@ -1049,9 +1079,8 @@ namespace __detail
|
|||
else
|
||||
#endif
|
||||
{
|
||||
// Convert UTF-8 to wide string.
|
||||
struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t>
|
||||
{ } __cvt;
|
||||
// Convert UTF-8 to char16_t or char32_t string.
|
||||
typename path::_Cvt<_CharT>::__codecvt_utf8_to_wide __cvt;
|
||||
const char* __f = __from.data();
|
||||
const char* __l = __f + __from.size();
|
||||
if (__str_codecvt_in_all(__f, __l, __to, __cvt))
|
||||
|
@ -1064,14 +1093,14 @@ namespace __detail
|
|||
if (auto* __p = __dispatch(__u8str, __wstr, is_same<_CharT, char>{}))
|
||||
return *__p;
|
||||
}
|
||||
#else
|
||||
#else // ! Windows
|
||||
#ifdef _GLIBCXX_USE_CHAR8_T
|
||||
if constexpr (is_same<_CharT, char8_t>::value)
|
||||
return _WString(__first, __last, __a);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
struct _UCvt : std::codecvt<_CharT, char, std::mbstate_t> { } __cvt;
|
||||
typename path::_Cvt<_CharT>::__codecvt_utf8_to_wide __cvt;
|
||||
_WString __wstr(__a);
|
||||
if (__str_codecvt_in_all(__first, __last, __wstr, __cvt))
|
||||
return __wstr;
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
// { dg-do run { target c++17 } }
|
||||
|
||||
// C++17 30.10.8.4.1 path constructors [fs.path.construct]
|
||||
|
||||
#include <filesystem>
|
||||
#include <testsuite_hooks.h>
|
||||
|
||||
using std::filesystem::path;
|
||||
|
||||
#define CHECK(E, S) (path(E##S) == path(u8##S))
|
||||
|
||||
void
|
||||
test_wide()
|
||||
{
|
||||
VERIFY( CHECK(L, "\u00E4") ); // PR libstdc++/95048
|
||||
VERIFY( CHECK(L, "\U0001F4C1") ); // folder
|
||||
VERIFY( CHECK(L, "\U0001F4C2") ); // open folder
|
||||
VERIFY( CHECK(L, "\U0001F4C4") ); // page facing up
|
||||
}
|
||||
|
||||
void
|
||||
test_u16()
|
||||
{
|
||||
VERIFY( CHECK(u, "\u00E4") ); // PR libstdc++/95048
|
||||
VERIFY( CHECK(u, "\U0001F4C1") ); // folder
|
||||
VERIFY( CHECK(u, "\U0001F4C2") ); // open folder
|
||||
VERIFY( CHECK(u, "\U0001F4C4") ); // page facing up
|
||||
}
|
||||
|
||||
void
|
||||
test_u32()
|
||||
{
|
||||
VERIFY( CHECK(U, "\u00E4") ); // PR libstdc++/95048
|
||||
VERIFY( CHECK(U, "\U0001F4C1") ); // folder
|
||||
VERIFY( CHECK(U, "\U0001F4C2") ); // open folder
|
||||
VERIFY( CHECK(U, "\U0001F4C4") ); // page facing up
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
test_wide();
|
||||
test_u16();
|
||||
test_u32();
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
// { dg-options "-lstdc++fs" }
|
||||
// { dg-do run { target c++11 } }
|
||||
// { dg-require-filesystem-ts "" }
|
||||
|
||||
// 8.4.1 path constructors [path.construct]
|
||||
|
||||
#include <experimental/filesystem>
|
||||
#include <testsuite_hooks.h>
|
||||
|
||||
using std::experimental::filesystem::path;
|
||||
|
||||
#define CHECK(E, S) (path(E##S) == path(u8##S))
|
||||
|
||||
void
|
||||
test_wide()
|
||||
{
|
||||
VERIFY( CHECK(L, "\u00E4") ); // PR libstdc++/95048
|
||||
VERIFY( CHECK(L, "\U0001F4C1") ); // folder
|
||||
VERIFY( CHECK(L, "\U0001F4C2") ); // open folder
|
||||
VERIFY( CHECK(L, "\U0001F4C4") ); // page facing up
|
||||
}
|
||||
|
||||
void
|
||||
test_u16()
|
||||
{
|
||||
VERIFY( CHECK(u, "\u00E4") ); // PR libstdc++/95048
|
||||
VERIFY( CHECK(u, "\U0001F4C1") ); // folder
|
||||
VERIFY( CHECK(u, "\U0001F4C2") ); // open folder
|
||||
VERIFY( CHECK(u, "\U0001F4C4") ); // page facing up
|
||||
}
|
||||
|
||||
void
|
||||
test_u32()
|
||||
{
|
||||
VERIFY( CHECK(U, "\u00E4") ); // PR libstdc++/95048
|
||||
VERIFY( CHECK(U, "\U0001F4C1") ); // folder
|
||||
VERIFY( CHECK(U, "\U0001F4C2") ); // open folder
|
||||
VERIFY( CHECK(U, "\U0001F4C4") ); // page facing up
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
test_wide();
|
||||
test_u16();
|
||||
test_u32();
|
||||
}
|
Loading…
Add table
Reference in a new issue