Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ add_benchmark(path_lexically_normal src/path_lexically_normal.cpp)
add_benchmark(priority_queue_push_range src/priority_queue_push_range.cpp)
add_benchmark(random_integer_generation src/random_integer_generation.cpp)
add_benchmark(ranges_div_ceil src/ranges_div_ceil.cpp)
add_benchmark(regex_match src/regex_match.cpp)
add_benchmark(regex_search src/regex_search.cpp)
add_benchmark(remove src/remove.cpp)
add_benchmark(replace src/replace.cpp)
Expand Down
39 changes: 39 additions & 0 deletions benchmarks/src/regex_match.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>
#include <cstddef>
#include <regex>
#include <string>

using namespace std;
using namespace regex_constants;

// Benchmarks regex_match() on an input that consists solely of 'a' characters.
// state.range() supplies the input length; `pattern` and `syntax` are used to construct the regex.
void bm_match_sequence_of_as(benchmark::State& state, const char* pattern, syntax_option_type syntax = ECMAScript) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No change requested: The syntax is never customized, but I see that it's imitating regex_search.cpp, and I suppose it's not too confusing to leave as-is.

string input(static_cast<size_t>(state.range()), 'a');
// The input and the regex are built once, before the benchmark loop, so the loop body only performs matching.
regex re{pattern, syntax};

for (auto _ : state) {
// Keep the optimizer from reasoning about the contents of the input across iterations.
benchmark::DoNotOptimize(input);
const char* pos = input.data();
const char* end = input.data() + input.size();
// A fresh cmatch is constructed on every iteration, so filling it is part of the measured work.
cmatch match;
// The whole range [pos, end) must match; the boolean result is not inspected.
regex_match(pos, end, match, re);
}
}

// Applies the input lengths shared by every benchmark in this file.
void common_args(auto bm) {
    // One benchmark run is registered per input length.
    bm->Arg(100);
    bm->Arg(200);
    bm->Arg(400);
}

// One benchmark is registered per pattern. Each pattern is written twice: once as the name of the
// benchmark case and once as the `pattern` argument passed to bm_match_sequence_of_as.
// common_args adds the input lengths 100, 200, and 400 to every case.

// Greedy and non-greedy star loops over a single character.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*", "a*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*?", "a*?")->Apply(common_args);
// The same loop with its body wrapped in a non-capturing group and in a capturing group.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:a)*", "(?:a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)*", "(a)*")->Apply(common_args);
// Loops over an alternation whose first branch ('b') never matches the all-'a' input.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:b|a)*", "(?:b|a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(b|a)*", "(b|a)*")->Apply(common_args);
// The alternation loops again, this time preceded by a capturing group.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*", "(a)(?:b|a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(b|a)*", "(a)(b|a)*")->Apply(common_args);
// This pattern requires a trailing 'c', which the all-'a' input never contains, so the match cannot succeed.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*c", "(a)(?:b|a)*c")->Apply(common_args);

BENCHMARK_MAIN();
26 changes: 15 additions & 11 deletions benchmarks/src/regex_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,20 @@ void bm_lorem_search(benchmark::State& state, const char* pattern, syntax_option
}
}

BENCHMARK_CAPTURE(bm_lorem_search, "^bibe", "^bibe")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "bibe".collate, "bibe", regex_constants::collate)->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "(?:bibe)+", "(?:bibe)+")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, R"(\bbibe)", R"(\bbibe)")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, R"(\Bibe)", R"(\Bibe)")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, R"((?=....)bibe)", R"((?=....)bibe)")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, R"((?=bibe)....)", R"((?=bibe)....)")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, R"((?!lorem)bibe)", R"((?!lorem)bibe)")->Arg(2)->Arg(3)->Arg(4);
// Applies the argument values shared by every benchmark in this file.
void common_args(auto bm) {
    // One benchmark run is registered per argument value.
    bm->Arg(2);
    bm->Arg(3);
    bm->Arg(4);
}

BENCHMARK_CAPTURE(bm_lorem_search, "^bibe", "^bibe")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, "bibe".collate, "bibe", regex_constants::collate)->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, "(?:bibe)+", "(?:bibe)+")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, R"(\bbibe)", R"(\bbibe)")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, R"(\Bibe)", R"(\Bibe)")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, R"((?=....)bibe)", R"((?=....)bibe)")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, R"((?=bibe)....)", R"((?=bibe)....)")->Apply(common_args);
BENCHMARK_CAPTURE(bm_lorem_search, R"((?!lorem)bibe)", R"((?!lorem)bibe)")->Apply(common_args);

BENCHMARK_MAIN();
82 changes: 60 additions & 22 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1681,6 +1681,8 @@ enum class _Rx_unwind_ops {
_Loop_nongreedy,
_Loop_greedy,
_Loop_restore_vals,
_Capture_restore_begin,
_Capture_restore_end,
};

template <class _BidIt>
Expand All @@ -1689,7 +1691,7 @@ public:
_Rx_unwind_ops _Code;
int _Loop_idx_sav;
_Node_base* _Node;
_Tgt_state_t<_BidIt> _Match_state;
_Bt_state_t<_BidIt> _Match_state;
size_t _Loop_frame_idx_sav;
};

Expand Down Expand Up @@ -3919,25 +3921,41 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
}

case _N_end_assert:
for (;;) {
--_Frames_count;
const auto& _Frame = _Frames[_Frames_count];
const auto _Code = _Frame._Code;
if (_Code == _Rx_unwind_ops::_After_assert || _Code == _Rx_unwind_ops::_After_neg_assert) {
_Tgt_state._Cur = _Frame._Match_state._Cur;
_Decrease_stack_usage_count();
if (_Code == _Rx_unwind_ops::_After_assert) {
_Next = _Frame._Node->_Next;
} else {
_Failed = true;
{
size_t _Last_capture_restore_frame = 0U;
for (;;) {
--_Frames_count;
const auto& _Frame = _Frames[_Frames_count];
const auto _Code = _Frame._Code;
if (_Code == _Rx_unwind_ops::_After_assert || _Code == _Rx_unwind_ops::_After_neg_assert) {
_Tgt_state._Cur = _Frame._Match_state._Cur;
_Decrease_stack_usage_count();
if (_Code == _Rx_unwind_ops::_After_assert) {
_Next = _Frame._Node->_Next;
if (_Last_capture_restore_frame != 0U) {
auto _Not_capture_restore = [](const auto& _Other_frame) _STATIC_LAMBDA {
return _Other_frame._Code != _Rx_unwind_ops::_Capture_restore_begin
&& _Other_frame._Code != _Rx_unwind_ops::_Capture_restore_end;
};
const auto _Effective_frames_end =
_STD remove_if(_Frames.begin() + static_cast<ptrdiff_t>(_Frames_count),
_Frames.begin() + static_cast<ptrdiff_t>(_Last_capture_restore_frame) + 1,
_Not_capture_restore);
_Frames_count = static_cast<size_t>(_Effective_frames_end - _Frames.begin());
}
} else {
_Failed = true;
}
break;
} else if (_Code == _Rx_unwind_ops::_Disjunction_eval_alt_on_failure
|| _Code == _Rx_unwind_ops::_Disjunction_eval_alt_always
|| _Code == _Rx_unwind_ops::_Loop_greedy //
|| _Code == _Rx_unwind_ops::_Loop_nongreedy
|| _Code == _Rx_unwind_ops::_Loop_restore_vals) {
_Decrease_stack_usage_count();
} else if (_Code == _Rx_unwind_ops::_Capture_restore_end && _Last_capture_restore_frame == 0U) {
_Last_capture_restore_frame = _Frames_count;
}
break;
} else if (_Code == _Rx_unwind_ops::_Disjunction_eval_alt_on_failure
|| _Code == _Rx_unwind_ops::_Disjunction_eval_alt_always
|| _Code == _Rx_unwind_ops::_Loop_greedy //
|| _Code == _Rx_unwind_ops::_Loop_nongreedy
|| _Code == _Rx_unwind_ops::_Loop_restore_vals) {
_Decrease_stack_usage_count();
}
}
break;
Expand All @@ -3946,7 +3964,10 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
{ // record current position
_Node_capture* _Node = static_cast<_Node_capture*>(_Nx);
if (_Node->_Idx != 0U) {
_Tgt_state._Grps[_Node->_Idx]._Begin = _Tgt_state._Cur;
auto& _Group = _Tgt_state._Grps[_Node->_Idx];
auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Capture_restore_begin, _Node);
_Frames[_Frame_idx]._Match_state._Cur = _Group._Begin;
_Group._Begin = _Tgt_state._Cur;
Comment on lines +3969 to +3970
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No change requested: I observe that _STD exchange could be used for this code pattern, although we don't universally use it. (This also occurs immediately below for _N_end_capture.)

}
break;
}
Expand All @@ -3956,8 +3977,11 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
_Node_end_group* _Node = static_cast<_Node_end_group*>(_Nx);
_Node_capture* _Node0 = static_cast<_Node_capture*>(_Node->_Back);
if (_Node0->_Idx != 0U) { // update capture data
_Tgt_state._Grp_valid[_Node0->_Idx] = true;
_Tgt_state._Grps[_Node0->_Idx]._End = _Tgt_state._Cur;
auto& _Group = _Tgt_state._Grps[_Node0->_Idx];
auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Capture_restore_end, _Node0);
_Frames[_Frame_idx]._Match_state._Cur = _Group._End;
_Tgt_state._Grp_valid[_Node0->_Idx] = true;
_Group._End = _Tgt_state._Cur;
}
break;
}
Expand Down Expand Up @@ -4325,6 +4349,20 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
}
break;

case _Rx_unwind_ops::_Capture_restore_begin:
{ // restore begin of capturing group
auto _Node = static_cast<_Node_capture*>(_Frame._Node);
_Tgt_state._Grps[_Node->_Idx]._Begin = _Frame._Match_state._Cur;
}
break;

case _Rx_unwind_ops::_Capture_restore_end:
{ // restore end of capturing group
auto _Node = static_cast<_Node_capture*>(_Frame._Node);
_Tgt_state._Grps[_Node->_Idx]._End = _Frame._Match_state._Cur;
}
break;

default:
#if _ITERATOR_DEBUG_LEVEL != 0
_STL_REPORT_ERROR("internal stack of regex matcher corrupted");
Expand Down
10 changes: 10 additions & 0 deletions tests/std/tests/VSO_0000000_regex_use/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2350,6 +2350,15 @@ void test_gh_5798() {
}
}

void test_gh_5865() {
// GH-5865: <regex>: Remove capture extent vectors from stack frames
// These tests check correct restoration of capturing groups
// when backtracking over positive lookahead assertions that matched successfully.
//
// In every pattern, the lookahead (?=(.*)) stores the remainder of the subject in group 1
// at the start of each loop iteration. An iteration can fail after its lookahead has already
// overwritten group 1, so the matcher must put back the value left by the last iteration that is kept.
// NOTE(review): the third argument is presumably the expected content of group 1 - confirm against should_capture.

// The second iteration captures "" and then fails to match "ab"; group 1 must revert to "ab".
g_regexTester.should_capture("ab", "(?:(?=(.*))ab)*", "ab");
// The second iteration captures "cd" and then fails to match "ab"; group 1 must revert to "abcd".
g_regexTester.should_capture("abcd", "(?:(?=(.*))ab)*cd", "abcd");
// Two iterations succeed (capturing "abab", then "ab"), but the trailing "ab" of the pattern
// forces the second iteration to be given back; group 1 must revert to "abab".
g_regexTester.should_capture("abab", "(?:(?=(.*))ab)*ab", "abab");
}

int main() {
test_dev10_449367_case_insensitivity_should_work();
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
Expand Down Expand Up @@ -2407,6 +2416,7 @@ int main() {
test_gh_5792();
test_gh_5797();
test_gh_5798();
test_gh_5865();

return g_regexTester.result();
}
Loading