Skip to content

Commit 82bf822

Browse files
committed
[Enhancement] Better error handling when using underscore in rex regex pattern
Co-authored-by: Simeon Widdis <sawiddis@amazon.com>
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent ca5a5bd commit 82bf822

4 files changed

Lines changed: 50 additions & 4 deletions

File tree

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,11 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
277277
"Rex pattern must contain at least one named capture group");
278278
}
279279

280+
if (namedGroups.stream().anyMatch(g -> g.contains("_"))) {
281+
throw new IllegalArgumentException(
282+
"Underscores are not permitted in Java Regex capture group names");
283+
}
284+
280285
List<RexNode> newFields = new ArrayList<>();
281286
List<String> newFieldNames = new ArrayList<>();
282287

core/src/main/java/org/opensearch/sql/expression/parse/RegexCommonUtils.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
public class RegexCommonUtils {
2121

2222
private static final Pattern NAMED_GROUP_PATTERN =
23-
Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");
23+
Pattern.compile("\\(\\?<([a-zA-Z_][a-zA-Z0-9_]*)>");
2424

2525
private static final int MAX_CACHE_SIZE = 1000;
2626

core/src/test/java/org/opensearch/sql/expression/parse/RegexCommonUtilsTest.java

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -195,11 +195,12 @@ public void testGetNamedGroupCandidatesWithNumericNames() {
195195

196196
@Test
197197
public void testGetNamedGroupCandidatesSpecialCharacters() {
198-
// Test that groups with special characters are not captured (only alphanumeric starting with
199-
// letter)
198+
// Test that groups with special characters (except underscores) are not captured
199+
// Numbers at start and hyphens are invalid, but underscores are now detected for validation
200200
String pattern = "(?<valid_group>[a-z]+)\\s+(?<123invalid>[0-9]+)\\s+(?<also-invalid>.*)";
201201
List<String> groups = RegexCommonUtils.getNamedGroupCandidates(pattern);
202-
assertEquals(0, groups.size());
202+
assertEquals(1, groups.size());
203+
assertEquals("valid_group", groups.get(0));
203204
}
204205

205206
@Test
@@ -211,4 +212,28 @@ public void testGetNamedGroupCandidatesValidAlphanumeric() {
211212
assertEquals("groupA", groups.get(0));
212213
assertEquals("group2B", groups.get(1));
213214
}
215+
216+
@Test
217+
public void testGetNamedGroupCandidatesWithUnderscores() {
218+
// Test that underscores in group names are detected (for validation purposes)
219+
String pattern = "(?<user_name>[a-z]+)\\s+(?<domain_name>[a-z]+)";
220+
List<String> groups = RegexCommonUtils.getNamedGroupCandidates(pattern);
221+
222+
assertEquals(2, groups.size());
223+
assertEquals("user_name", groups.get(0));
224+
assertEquals("domain_name", groups.get(1));
225+
}
226+
227+
@Test
228+
public void testGetNamedGroupCandidatesMixedUnderscores() {
229+
// Test that mixed patterns with and without underscores are detected
230+
String pattern =
231+
"(?<validName>[a-z]+)\\s+(?<invalid_name>[0-9]+)\\s+(?<another_invalid_name>.*)";
232+
List<String> groups = RegexCommonUtils.getNamedGroupCandidates(pattern);
233+
234+
assertEquals(3, groups.size());
235+
assertEquals("validName", groups.get(0));
236+
assertEquals("invalid_name", groups.get(1));
237+
assertEquals("another_invalid_name", groups.get(2));
238+
}
214239
}

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRexCommandIT.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,22 @@ public void testRexErrorNoNamedGroups() throws IOException {
5050
}
5151
}
5252

53+
@Test
54+
public void testRexErrorUnderscoreInGroupNames() throws IOException {
55+
try {
56+
executeQuery(
57+
String.format(
58+
"source=%s | rex field=email \\\"(?<user_name>[^@]+)@(?<domain_name>.+)\\\" | fields"
59+
+ " email",
60+
TEST_INDEX_ACCOUNT));
61+
fail("Should have thrown an exception for underscore in named capture group");
62+
} catch (Exception e) {
63+
assertTrue(
64+
e.getMessage()
65+
.contains("Underscores are not permitted in Java Regex capture group names"));
66+
}
67+
}
68+
5369
@Test
5470
public void testRexWithFiltering() throws IOException {
5571
JSONObject result =

0 commit comments

Comments
 (0)