re PR libstdc++/77356 (regex error for an ECMAScript syntax string)

PR libstdc++/77356
	* include/bits/regex_compiler.tcc(_M_insert_bracket_matcher,
	_M_expression_term): Modify to support dash literal.
	* include/bits/regex_scanner.h: Add dash as a token type to make
	it different from the mandated dash literal by escaping.
	* include/bits/regex_scanner.tcc(_M_scan_in_bracket): Emit dash
	token in bracket expression parsing.
	* testsuite/28_regex/regression.cc: Add new testcases.

From-SVN: r239794
This commit is contained in:
Tim Shen 2016-08-27 02:03:23 +00:00 committed by Tim Shen
parent d8921e81e9
commit 4aebb4e4a6
5 changed files with 111 additions and 42 deletions

View File

@ -1,3 +1,14 @@
2016-08-27 Tim Shen <timshen@google.com>
PR libstdc++/77356
* include/bits/regex_compiler.tcc(_M_insert_bracket_matcher,
_M_expression_term): Modify to support dash literal.
* include/bits/regex_scanner.h: Add dash as a token type to make
it different from the mandated dash literal by escaping.
* include/bits/regex_scanner.tcc(_M_scan_in_bracket): Emit dash
token in bracket expression parsing.
* testsuite/28_regex/regression.cc: Add new testcases.
2016-08-26 Jonathan Wakely <jwakely@redhat.com> 2016-08-26 Jonathan Wakely <jwakely@redhat.com>
PR libstdc++/51960 PR libstdc++/51960

View File

@ -426,13 +426,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
pair<bool, _CharT> __last_char; // Optional<_CharT> pair<bool, _CharT> __last_char; // Optional<_CharT>
__last_char.first = false; __last_char.first = false;
if (!(_M_flags & regex_constants::ECMAScript)) if (!(_M_flags & regex_constants::ECMAScript))
if (_M_try_char()) {
{ if (_M_try_char())
__matcher._M_add_char(_M_value[0]); {
__last_char.first = true; __last_char.first = true;
__last_char.second = _M_value[0]; __last_char.second = _M_value[0];
} }
else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
{
__last_char.first = true;
__last_char.second = '-';
}
}
while (_M_expression_term(__last_char, __matcher)); while (_M_expression_term(__last_char, __matcher));
if (__last_char.first)
__matcher._M_add_char(__last_char.second);
__matcher._M_ready(); __matcher._M_ready();
_M_stack.push(_StateSeqT( _M_stack.push(_StateSeqT(
*_M_nfa, *_M_nfa,
@ -449,19 +457,43 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (_M_match_token(_ScannerT::_S_token_bracket_end)) if (_M_match_token(_ScannerT::_S_token_bracket_end))
return false; return false;
const auto __push_char = [&](_CharT __ch)
{
if (__last_char.first)
__matcher._M_add_char(__last_char.second);
else
__last_char.first = true;
__last_char.second = __ch;
};
const auto __flush = [&]
{
if (__last_char.first)
{
__matcher._M_add_char(__last_char.second);
__last_char.first = false;
}
};
if (_M_match_token(_ScannerT::_S_token_collsymbol)) if (_M_match_token(_ScannerT::_S_token_collsymbol))
{ {
auto __symbol = __matcher._M_add_collate_element(_M_value); auto __symbol = __matcher._M_add_collate_element(_M_value);
if (__symbol.size() == 1) if (__symbol.size() == 1)
{ __push_char(__symbol[0]);
__last_char.first = true; else
__last_char.second = __symbol[0]; __flush();
}
} }
else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
__matcher._M_add_equivalence_class(_M_value); {
__flush();
__matcher._M_add_equivalence_class(_M_value);
}
else if (_M_match_token(_ScannerT::_S_token_char_class_name)) else if (_M_match_token(_ScannerT::_S_token_char_class_name))
__matcher._M_add_character_class(_M_value, false); {
__flush();
__matcher._M_add_character_class(_M_value, false);
}
else if (_M_try_char())
__push_char(_M_value[0]);
// POSIX doesn't allow '-' as a start-range char (say [a-z--0]), // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
// except when the '-' is the first or last character in the bracket // except when the '-' is the first or last character in the bracket
// expression ([--0]). ECMAScript treats all '-' after a range as a // expression ([--0]). ECMAScript treats all '-' after a range as a
@ -472,55 +504,55 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Clang (3.5) always uses ECMAScript style even in its POSIX syntax. // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
// //
// It turns out that no one reads BNFs ;) // It turns out that no one reads BNFs ;)
else if (_M_try_char()) else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
{ {
if (!__last_char.first) if (!__last_char.first)
{ {
__matcher._M_add_char(_M_value[0]); if (!(_M_flags & regex_constants::ECMAScript))
if (_M_value[0] == '-'
&& !(_M_flags & regex_constants::ECMAScript))
{ {
if (_M_match_token(_ScannerT::_S_token_bracket_end)) if (_M_match_token(_ScannerT::_S_token_bracket_end))
return false; {
__push_char('-');
return false;
}
__throw_regex_error( __throw_regex_error(
regex_constants::error_range, regex_constants::error_range,
"Unexpected dash in bracket expression. For POSIX syntax, " "Unexpected dash in bracket expression. For POSIX syntax, "
"a dash is not treated literally only when it is at " "a dash is not treated literally only when it is at "
"beginning or end."); "beginning or end.");
} }
__last_char.first = true; __push_char('-');
__last_char.second = _M_value[0];
} }
else else
{ {
if (_M_value[0] == '-') if (_M_try_char())
{ {
if (_M_try_char()) __matcher._M_make_range(__last_char.second, _M_value[0]);
{ __last_char.first = false;
__matcher._M_make_range(__last_char.second , _M_value[0]); }
__last_char.first = false; else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
} {
else __matcher._M_make_range(__last_char.second, '-');
{ __last_char.first = false;
if (_M_scanner._M_get_token()
!= _ScannerT::_S_token_bracket_end)
__throw_regex_error(
regex_constants::error_range,
"Unexpected end of bracket expression.");
__matcher._M_add_char(_M_value[0]);
}
} }
else else
{ {
__matcher._M_add_char(_M_value[0]); if (_M_scanner._M_get_token()
__last_char.second = _M_value[0]; != _ScannerT::_S_token_bracket_end)
__throw_regex_error(
regex_constants::error_range,
"Character is expected after a dash.");
__push_char('-');
} }
} }
} }
else if (_M_match_token(_ScannerT::_S_token_quoted_class)) else if (_M_match_token(_ScannerT::_S_token_quoted_class))
__matcher._M_add_character_class(_M_value, {
_M_ctype.is(_CtypeT::upper, __flush();
_M_value[0])); __matcher._M_add_character_class(_M_value,
_M_ctype.is(_CtypeT::upper,
_M_value[0]));
}
else else
__throw_regex_error(regex_constants::error_brack, __throw_regex_error(regex_constants::error_brack,
"Unexpected character in bracket expression."); "Unexpected character in bracket expression.");

View File

@ -43,7 +43,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ {
public: public:
/// Token types returned from the scanner. /// Token types returned from the scanner.
enum _TokenT enum _TokenT : unsigned
{ {
_S_token_anychar, _S_token_anychar,
_S_token_ord_char, _S_token_ord_char,
@ -73,7 +73,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_S_token_comma, _S_token_comma,
_S_token_dup_count, _S_token_dup_count,
_S_token_eof, _S_token_eof,
_S_token_unknown _S_token_bracket_dash,
_S_token_unknown = -1u
}; };
protected: protected:

View File

@ -210,7 +210,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
auto __c = *_M_current++; auto __c = *_M_current++;
if (__c == '[') if (__c == '-')
_M_token = _S_token_bracket_dash;
else if (__c == '[')
{ {
if (_M_current == _M_end) if (_M_current == _M_end)
__throw_regex_error(regex_constants::error_brack, __throw_regex_error(regex_constants::error_brack,

View File

@ -61,12 +61,35 @@ test03()
VERIFY(!regex_search_debug("a", regex(R"(\b$)"), regex_constants::match_not_eow)); VERIFY(!regex_search_debug("a", regex(R"(\b$)"), regex_constants::match_not_eow));
} }
// PR libstdc++/77356
void
test04()
{
  bool test __attribute__((unused)) = true;

  // Regression check for PR libstdc++/77356: the bracket expression below
  // mixes escaped characters, an escaped dash and a quoted class (\s).
  // The test passes as long as constructing the regex does not throw.
  static const char* const pattern =
    "(\\$|usd)(usd|\\$|to|and|up to|[0-9,\\.\\-\\sk])+";
  const std::regex compiled(pattern);

  // Only construction matters; silence the unused-variable warning.
  static_cast<void>(compiled);
}
void
test05()
{
bool test __attribute__((unused)) = true;
VERIFY(regex_match_debug("!", std::regex("[![:alnum:]]")));
VERIFY(regex_match_debug("-", std::regex("[a-]", regex_constants::basic)));
VERIFY(regex_match_debug("-", std::regex("[a-]")));
}
int int
main() main()
{ {
test01(); test01();
test02(); test02();
test03(); test03();
test04();
test05();
return 0; return 0;
} }