From f9ce3c163948ece546d642ac0c62fbc11d8481e4 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Wed, 29 Jul 2015 03:45:35 +0000 Subject: [PATCH] re PR libstdc++/67015 ("^[a-z0-9][a-z0-9-]*$", std::regex::extended is miscompiled) PR libstdc++/67015 * include/bits/regex_compiler.h (_Compiler<>::_M_expression_term, _BracketMatcher<>::_M_add_collating_element): Change signature to make checking the end of bracket expression easier. * include/bits/regex_compiler.tcc (_Compiler<>::_M_expression_term): Treat '-' as a valid literal if it's at the end of bracket expression. * testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc: New testcases. From-SVN: r226336 --- libstdc++-v3/ChangeLog | 11 ++++ libstdc++-v3/include/bits/regex_compiler.h | 9 ++- libstdc++-v3/include/bits/regex_compiler.tcc | 35 ++++++++---- .../regex_match/cstring_bracket_01.cc | 57 +++++++++++++++++++ 4 files changed, 99 insertions(+), 13 deletions(-) diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index d7dbedd8143..4447b3d9f48 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,14 @@ +2015-07-29 Tim Shen + + PR libstdc++/67015 + * include/bits/regex_compiler.h (_Compiler<>::_M_expression_term, + _BracketMatcher<>::_M_add_collating_element): Change signature + to make checking the end of bracket expression easier. + * include/bits/regex_compiler.tcc (_Compiler<>::_M_expression_term): + Treat '-' as a valid literal if it's at the end of bracket expression. + * testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc: + New testcases. 
+ 2015-07-24 Jonathan Wakely * include/bits/atomic_futex.h [_GLIBCXX_HAVE_LINUX_FUTEX] diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 4472116227d..0cb0c04b1eb 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -116,8 +116,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_insert_bracket_matcher(bool __neg); + // Returns true if successfully matched one term and should continue. + // Returns false if the compiler should move on. template - void + bool _M_expression_term(pair& __last_char, _BracketMatcher<_TraitsT, __icase, __collate>& __matcher); @@ -389,8 +391,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION #endif } - void - _M_add_collating_element(const _StringT& __s) + _StringT + _M_add_collate_element(const _StringT& __s) { auto __st = _M_traits.lookup_collatename(__s.data(), __s.data() + __s.size()); @@ -400,6 +402,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION #ifdef _GLIBCXX_DEBUG _M_is_ready = false; #endif + return __st; } void diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 33d7118e024..9a623111555 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -424,8 +424,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __last_char.first = true; __last_char.second = _M_value[0]; } - while (!_M_match_token(_ScannerT::_S_token_bracket_end)) - _M_expression_term(__last_char, __matcher); + while (_M_expression_term(__last_char, __matcher)); __matcher._M_ready(); _M_stack.push(_StateSeqT( *_M_nfa, @@ -434,21 +433,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template template - void + bool _Compiler<_TraitsT>:: _M_expression_term(pair& __last_char, _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) { + if (_M_match_token(_ScannerT::_S_token_bracket_end)) + return false; + if (_M_match_token(_ScannerT::_S_token_collsymbol)) - 
__matcher._M_add_collating_element(_M_value); + { + auto __symbol = __matcher._M_add_collate_element(_M_value); + if (__symbol.size() == 1) + { + __last_char.first = true; + __last_char.second = __symbol[0]; + } + } else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) __matcher._M_add_equivalence_class(_M_value); else if (_M_match_token(_ScannerT::_S_token_char_class_name)) __matcher._M_add_character_class(_M_value, false); - // POSIX doesn't permit '-' as a start-range char (say [a-z--0]), - // except when the '-' is the first character in the bracket expression - // ([--0]). ECMAScript treats all '-' after a range as a normal character. - // Also see above, where _M_expression_term gets called. + // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), + // except when the '-' is the first or last character in the bracket + // expression ([--0]). ECMAScript treats all '-' after a range as a + // normal character. Also see above, where _M_expression_term gets called. // // As a result, POSIX rejects [-----], but ECMAScript doesn't. // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 
@@ -459,10 +468,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (!__last_char.first) { + __matcher._M_add_char(_M_value[0]); if (_M_value[0] == '-' && !(_M_flags & regex_constants::ECMAScript)) - __throw_regex_error(regex_constants::error_range); - __matcher._M_add_char(_M_value[0]); + { + if (_M_match_token(_ScannerT::_S_token_bracket_end)) + return false; + __throw_regex_error(regex_constants::error_range); + } __last_char.first = true; __last_char.second = _M_value[0]; } @@ -496,6 +509,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_value[0])); else __throw_regex_error(regex_constants::error_brack); + + return true; } template diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc index f7653c6dc9d..62131a0bcc9 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc @@ -82,6 +82,22 @@ test02() VERIFY(e.code() == std::regex_constants::error_range); } std::regex re("[-----]", std::regex::ECMAScript); + + VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended))); + VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended))); + VERIFY(regex_match("b", regex("[^-ac]", regex_constants::extended))); + VERIFY(regex_match("b", regex("[^ac-]", regex_constants::extended))); + VERIFY(regex_match("&", regex("[%--]", regex_constants::extended))); + VERIFY(regex_match(".", regex("[--@]", regex_constants::extended))); + try + { + regex("[a--@]", regex_constants::extended); + VERIFY(false); + } + catch (const std::regex_error& e) + { + } + VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended))); } void @@ -115,6 +131,44 @@ test04() VERIFY(regex_match_debug("w", re)); } +// libstdc++/67015 +void +test05() +{ + bool test __attribute__((unused)) = true; + + regex lanana_namespace("^[a-z0-9]+$", regex::extended); + regex 
lsb_namespace("^_?([a-z0-9_.]+-, regex::extended)+[a-z0-9]+$"); + regex debian_dpkg_conffile_cruft("dpkg-(old|dist|new|tmp, regex::extended)$"); + regex debian_cron_namespace("^[a-z0-9][a-z0-9-]*$", regex::extended); + VERIFY(regex_match("test", debian_cron_namespace)); + VERIFY(!regex_match("-a", debian_cron_namespace)); + VERIFY(regex_match("a-", debian_cron_namespace)); + regex debian_cron_namespace_ok("^[a-z0-9][-a-z0-9]*$", regex::extended); + VERIFY(regex_match("test", debian_cron_namespace_ok)); + VERIFY(!regex_match("-a", debian_cron_namespace_ok)); + VERIFY(regex_match("a-", debian_cron_namespace_ok)); +} + +// libstdc++/67015 +void +test06() +{ + bool test __attribute__((unused)) = true; + + regex lanana_namespace("^[a-z0-9]+$"); + regex lsb_namespace("^_?([a-z0-9_.]+-)+[a-z0-9]+$"); + regex debian_dpkg_conffile_cruft("dpkg-(old|dist|new|tmp)$"); + regex debian_cron_namespace("^[a-z0-9][a-z0-9-]*$"); + VERIFY(regex_match("test", debian_cron_namespace)); + VERIFY(!regex_match("-a", debian_cron_namespace)); + VERIFY(regex_match("a-", debian_cron_namespace)); + regex debian_cron_namespace_ok("^[a-z0-9][-a-z0-9]*$"); + VERIFY(regex_match("test", debian_cron_namespace_ok)); + VERIFY(!regex_match("-a", debian_cron_namespace_ok)); + VERIFY(regex_match("a-", debian_cron_namespace_ok)); +} + int main() { @@ -122,5 +176,8 @@ main() test02(); test03(); test04(); + test05(); + test06(); + return 0; }