diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h
index baf8ff1a9cf..ad33ecaa14b 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -424,6 +424,10 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
       static constexpr flag_type awk = regex_constants::awk;
       static constexpr flag_type grep = regex_constants::grep;
       static constexpr flag_type egrep = regex_constants::egrep;
+      // Must use the same guard as regex_constants::multiline, which it aliases.
+#if __cplusplus >= 201703L || !defined __STRICT_ANSI__
+      static constexpr flag_type multiline = regex_constants::multiline;
+#endif
       ///@}
 
       // [7.8.2] construct/copy/destroy
diff --git a/libstdc++-v3/include/bits/regex_constants.h b/libstdc++-v3/include/bits/regex_constants.h
index 1c3dd36d57c..af689ff93af 100644
--- a/libstdc++-v3/include/bits/regex_constants.h
+++ b/libstdc++-v3/include/bits/regex_constants.h
@@ -64,7 +64,7 @@ namespace regex_constants
     _S_grep,
     _S_egrep,
     _S_polynomial,
-    _S_syntax_last
+    _S_multiline
   };
 
   /**
@@ -170,6 +170,22 @@ namespace regex_constants
   _GLIBCXX17_INLINE constexpr syntax_option_type egrep =
     static_cast<syntax_option_type>(1 << _S_egrep);
 
+#if __cplusplus >= 201703L || !defined __STRICT_ANSI__
+  /**
+   * Specifies that the `^` anchor matches at the beginning of a line,
+   * and the `$` anchor matches at the end of a line, not only at the
+   * beginning/end of the input.
+   * Valid for the ECMAScript syntax, ignored otherwise.
+   * @since C++17
+   */
+  _GLIBCXX17_INLINE constexpr syntax_option_type multiline =
+    static_cast<syntax_option_type>(1 << _S_multiline);
+#endif
+
+  /// Extension: Equivalent to regex_constants::multiline for C++11 and C++14.
+  _GLIBCXX17_INLINE constexpr syntax_option_type __multiline =
+    static_cast<syntax_option_type>(1 << _S_multiline);
+
   /**
    * Extension: Ensure both space complexity of compiled regex and
    * time complexity execution are not exponential.
@@ -310,9 +326,10 @@ namespace regex_constants
     static_cast<match_flag_type>(1 << _S_continuous);
 
   /**
-   * --first is a valid iterator position.  When this flag is set then the
-   * flags match_not_bol and match_not_bow are ignored by the regular
-   * expression algorithms 28.11 and iterators 28.12.
+   * `--first` is a valid iterator position.  When this flag is set then the
+   * flags `match_not_bol` and `match_not_bow` are ignored by the algorithms
+   * `regex_match`, `regex_search`, and `regex_replace`, and by the iterators
+   * `regex_iterator` and `regex_token_iterator`.
    */
   _GLIBCXX17_INLINE constexpr match_flag_type match_prev_avail =
     static_cast<match_flag_type>(1 << _S_prev_avail);
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 3422893371a..167d40624d9 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -78,12 +78,12 @@ namespace __detail
       _M_results(__results),
       _M_rep_count(_M_nfa.size()),
       _M_states(_M_nfa._M_start(), _M_nfa.size()),
-      _M_flags((__flags & regex_constants::match_prev_avail)
-               ? (__flags
-                  & ~regex_constants::match_not_bol
-                  & ~regex_constants::match_not_bow)
-               : __flags)
-      { }
+      _M_flags(__flags)
+      {
+        using namespace regex_constants;
+        if (__flags & match_prev_avail) // ignore not_bol and not_bow
+          _M_flags &= ~(match_not_bol | match_not_bow);
+      }
 
       // Set matched when string exactly matches the pattern.
       bool
@@ -165,16 +165,39 @@ namespace __detail
       bool
       _M_at_begin() const
       {
-        return _M_current == _M_begin
-          && !(_M_flags & (regex_constants::match_not_bol
-                           | regex_constants::match_prev_avail));
+        if (_M_current == _M_begin)
+          {
+            // match_not_bol means ^ does not match [_M_begin,_M_begin)
+            if (_M_flags & regex_constants::match_not_bol)
+              return false;
+            // match_prev_avail means _M_begin is not the start of the input.
+            if (_M_flags & regex_constants::match_prev_avail)
+              {
+                // For ECMAScript multiline matches, check if the previous
+                // character is a line terminator.
+                if (_M_match_multiline())
+                  return _M_is_line_terminator(*std::prev(_M_current));
+                else
+                  return false;
+              }
+            else // ^ matches at _M_begin
+              return true;
+          }
+        else if (_M_match_multiline())
+          return _M_is_line_terminator(*std::prev(_M_current));
+        else
+          return false;
       }
 
       bool
       _M_at_end() const
       {
-        return _M_current == _M_end
-          && !(_M_flags & regex_constants::match_not_eol);
+        if (_M_current == _M_end)
+          return !(_M_flags & regex_constants::match_not_eol);
+        else if (_M_match_multiline())
+          return _M_is_line_terminator(*_M_current);
+        else
+          return false;
       }
 
       bool
@@ -183,6 +206,35 @@ namespace __detail
       bool
       _M_lookahead(_StateIdT __next);
 
+      // Return true if __c narrows to '\n', or, when the regex was compiled
+      // with the ECMAScript option, to '\r'.
+      bool
+      _M_is_line_terminator(_CharT __c) const
+      {
+        const auto& __traits = _M_re._M_automaton->_M_traits;
+        const auto& __ct = use_facet<ctype<_CharT>>(__traits.getloc());
+        const char __n{ __ct.narrow(__c, ' ') };
+        if (__n == '\n')
+          return true;
+        if (_M_re._M_automaton->_M_options() & regex_constants::ECMAScript)
+          {
+            if (__n == '\r')
+              return true;
+            // FIXME: U+2028 (line separator) and U+2029 (paragraph separator)
+          }
+        return false;
+      }
+
+      // Return true only if both the ECMAScript and multiline options are
+      // set, i.e. when ^ and $ should also match at line boundaries.
+      bool
+      _M_match_multiline() const noexcept
+      {
+        constexpr auto __m
+          = regex_constants::ECMAScript | regex_constants::__multiline;
+        return (_M_re._M_automaton->_M_options() & __m) == __m;
+      }
+
       // Holds additional information used in BFS-mode.
       template<typename _SearchMode, typename _ResultsVec>
         struct _State_info;
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/multiline.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/multiline.cc
new file mode 100644
index 00000000000..a1982fc8f78
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/multiline.cc
@@ -0,0 +1,76 @@
+// { dg-do run { target c++11 } }
+#include <regex>
+#include <testsuite_hooks.h>
+
+#if __cplusplus >= 201703L || !defined __STRICT_ANSI__
+static_assert( std::regex_constants::multiline == std::regex::multiline );
+static_assert( std::regex_constants::__multiline == std::regex::multiline );
+#else
+namespace test { constexpr int multiline = 0; }
+namespace check {
+  using namespace test;
+  using namespace std::regex_constants;
+  int ml = multiline;
+}
+#endif
+
+// Check ^ and $ against embedded line terminators with __multiline set.
+void
+test01()
+{
+  using namespace std::regex_constants;
+
+  std::regex ml{"^a.$", __multiline};
+  VERIFY( ml.flags() == __multiline );
+  VERIFY(!std::regex_search("abx\nxab", ml));
+  VERIFY(std::regex_search("x\nab", ml));
+  VERIFY(std::regex_search("ab\n", ml));
+  VERIFY(std::regex_search("x\nab\nx", ml));
+
+  ml.assign("a$\n^b$\n^c", ECMAScript|__multiline);
+  VERIFY( ml.flags() == (ECMAScript|__multiline) );
+  VERIFY( regex_search("a\nb\nc", ml) );
+
+  ml.assign("a$\n^b$\n^c", ECMAScript|__multiline|icase);
+  VERIFY( ml.flags() == (ECMAScript|__multiline|icase) );
+  VERIFY( regex_search("A\nB\nC", ml) );
+}
+
+// Check how match_prev_avail and match_not_bol interact with multiline.
+void
+test_pr102480()
+{
+  using namespace std::regex_constants;
+
+  std::regex re("^a");
+  std::regex reml("^a", __multiline);
+  VERIFY( std::regex_match("\na" + 1, re));
+  VERIFY( std::regex_match("\na" + 1, reml));
+  // PR libstdc++/102480
+  VERIFY(!std::regex_match("\na" + 1, re, match_prev_avail));
+  VERIFY( std::regex_match("\na" + 1, reml, match_prev_avail));
+  VERIFY(!std::regex_match("\na" + 1, re, match_not_bol));
+  VERIFY(!std::regex_match("\na" + 1, re, match_prev_avail|match_not_bol));
+  VERIFY( std::regex_match("\na" + 1, reml, match_prev_avail|match_not_bol));
+  VERIFY(!std::regex_match("\ra" + 1, re, match_prev_avail));
+  VERIFY( std::regex_match("\ra" + 1, reml, match_prev_avail));
+  VERIFY(!std::regex_match("xa" + 1, re, match_prev_avail));
+  VERIFY(!std::regex_match("xa" + 1, reml, match_prev_avail));
+
+  std::regex bre("^a", basic|__multiline);
+  VERIFY(std::regex_match("\na" + 1, bre));
+  VERIFY(!std::regex_match("\na" + 1, bre, match_not_bol));
+  // multiline is ignored for any grammar except ECMAScript,
+  // so none of the following should match even though
+  // match_prev_avail is set and *--first == '\n'.
+  VERIFY(!std::regex_match("\na" + 1, bre, match_prev_avail));
+  VERIFY(!std::regex_match("\na" + 1, bre, match_prev_avail|match_not_bol));
+  VERIFY(!std::regex_match("\ra" + 1, bre, match_prev_avail));
+  VERIFY(!std::regex_match("xa" + 1, bre, match_prev_avail));
+}
+
+int main()
+{
+  test01();
+  test_pr102480();
+}