Skip to content

Commit 265ab67

Browse files
committed
fix regex error messages WIP
1 parent a50ff1f commit 265ab67

File tree

4 files changed

+435
-89
lines changed

4 files changed

+435
-89
lines changed

dev/prompts/debug.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
Fix one test at a time; add println statements; create a one-liner Perl (jperl) to test behaviour.
2-
Give enough context if the changes, and tag the snippets with the file name so the editor doesn't make mistakes.
2+
Give enough context in the changes, and tag the snippets with the file name so the editor doesn't make mistakes.
33

src/main/java/org/perlonjava/regex/ExtendedCharClass.java

Lines changed: 111 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,22 @@ public class ExtendedCharClass {
1515
* @return Position after the closing ])
1616
*/
1717
static int handleExtendedCharacterClass(String s, int offset, StringBuilder sb, RegexFlags regexFlags) {
18+
// System.err.println("DEBUG: handleExtendedCharacterClass called at offset " + offset);
19+
1820
int start = offset + 3; // Skip past '(?['
21+
22+
// First, check if this is an empty extended character class
23+
int i = start;
24+
while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
25+
i++;
26+
}
27+
28+
if (i < s.length() && s.charAt(i) == ']' && i + 1 < s.length() && s.charAt(i + 1) == ')') {
29+
// This is an empty extended character class
30+
RegexPreprocessor.regexError(s, start, "Incomplete expression within '(?[ ])'");
31+
}
32+
33+
// Now find the end
1934
int end = findExtendedClassEnd(s, start);
2035

2136
if (end == -1) {
@@ -24,6 +39,13 @@ static int handleExtendedCharacterClass(String s, int offset, StringBuilder sb,
2439

2540
String content = s.substring(start, end);
2641

42+
// System.err.println("DEBUG: ExtendedCharClass content: '" + content + "'");
43+
44+
// Check for empty or whitespace-only content
45+
if (content.trim().isEmpty()) {
46+
RegexPreprocessor.regexError(s, start, "Incomplete expression within '(?[ ])'");
47+
}
48+
2749
// try {
2850
// Parse and transform the extended character class
2951
String transformed = transformExtendedClass(content, s, start);
@@ -44,72 +66,56 @@ private static int findExtendedClassEnd(String s, int start) {
4466
int depth = 1;
4567
int i = start;
4668
boolean inEscape = false;
47-
boolean inCharClass = false;
48-
boolean firstInClass = false;
49-
boolean afterCaret = false;
69+
70+
// System.err.println("DEBUG: findExtendedClassEnd starting at position " + start);
71+
// System.err.println("DEBUG: Looking at: '" + s.substring(start) + "'");
5072

5173
while (i < s.length() && depth > 0) {
5274
char c = s.charAt(i);
5375

5476
if (inEscape) {
5577
inEscape = false;
56-
firstInClass = false;
57-
afterCaret = false;
5878
i++;
5979
continue;
6080
}
6181

82+
// Check for nested (?[...])
83+
if (c == '(' && i + 2 < s.length() && s.charAt(i + 1) == '?' && s.charAt(i + 2) == '[') {
84+
// System.err.println("DEBUG: Found nested (?[ at position " + i);
85+
// We found a nested extended character class
86+
// We need to skip over it entirely
87+
int nestedEnd = findExtendedClassEnd(s, i + 3);
88+
if (nestedEnd == -1) {
89+
// The nested class is unterminated, so no end can be found for this one either
90+
// Return -1; the error will be caught when that nested one is processed
91+
return -1;
92+
}
93+
// Skip to after the "])" of the nested extended class
94+
i = nestedEnd + 2; // +2 for "])"
95+
continue;
96+
}
97+
6298
switch (c) {
6399
case '\\':
64100
inEscape = true;
65-
firstInClass = false;
66-
afterCaret = false;
67101
break;
68102
case '[':
69-
if (!inCharClass) {
70-
inCharClass = true;
71-
firstInClass = true;
72-
afterCaret = false;
73-
}
74103
depth++;
75104
break;
76-
case '^':
77-
if (inCharClass && firstInClass) {
78-
afterCaret = true;
79-
}
80-
firstInClass = false;
81-
break;
82105
case ']':
83-
// Special case: ] immediately after [ or [^ is literal
84-
if (inCharClass && (firstInClass || afterCaret)) {
85-
// This is a literal ], don't decrease depth
86-
firstInClass = false;
87-
afterCaret = false;
88-
} else {
89-
depth--;
90-
if (inCharClass && depth > 0) {
91-
// We just closed a character class
92-
inCharClass = false;
106+
depth--;
107+
if (depth == 0) {
108+
// Check if this ends the extended character class
109+
int j = i + 1;
110+
while (j < s.length() && Character.isWhitespace(s.charAt(j))) {
111+
j++;
93112
}
94-
95-
if (depth == 0) {
96-
// Check if this ends the extended character class
97-
int j = i + 1;
98-
while (j < s.length() && Character.isWhitespace(s.charAt(j))) {
99-
j++;
100-
}
101-
if (j < s.length() && s.charAt(j) == ')') {
102-
return i;
103-
}
104-
// Not properly terminated
105-
return -1;
113+
if (j < s.length() && s.charAt(j) == ')') {
114+
// System.err.println("DEBUG: Found end of extended class at position " + i);
115+
return i;
106116
}
107-
}
108-
break;
109-
default:
110-
if (!Character.isWhitespace(c)) {
111-
firstInClass = false;
112-
afterCaret = false;
117+
// Not properly terminated
118+
return -1;
113119
}
114120
break;
115121
}
@@ -123,6 +129,10 @@ private static int findExtendedClassEnd(String s, int start) {
123129
* Transform the extended character class content into Java syntax
124130
*/
125131
private static String transformExtendedClass(String content, String originalRegex, int contentStart) {
132+
// Before tokenizing, scan for nested regex constructs and process them
133+
// This allows us to catch errors in nested patterns
134+
scanForNestedConstructs(content, originalRegex, contentStart);
135+
126136
// Tokenize the expression
127137
List<Token> tokens = tokenizeExtendedClass(content, originalRegex, contentStart);
128138

@@ -133,6 +143,53 @@ private static String transformExtendedClass(String content, String originalRege
133143
return evaluateExtendedClass(tree);
134144
}
135145

146+
private static void scanForNestedConstructs(String content, String originalRegex, int contentStart) {
147+
int i = 0;
148+
while (i < content.length()) {
149+
char c = content.charAt(i);
150+
151+
// Look for (?...) constructs
152+
if (c == '(' && i + 1 < content.length() && content.charAt(i + 1) == '?') {
153+
// System.err.println("DEBUG: Found nested (? at position " + i + " in extended char class content");
154+
155+
// Check what type of construct this is
156+
if (i + 2 < content.length()) {
157+
char nextChar = content.charAt(i + 2);
158+
159+
if (nextChar == '[') {
160+
// Nested extended character class - process it recursively
161+
// System.err.println("DEBUG: Found nested (?[ at position " + i);
162+
// Call handleExtendedCharacterClass recursively
163+
StringBuilder dummySb = new StringBuilder();
164+
ExtendedCharClass.handleExtendedCharacterClass(originalRegex, contentStart + i, dummySb, RegexFlags.fromModifiers("", ""));
165+
} else if (nextChar == 'x' && i + 3 < content.length() && content.charAt(i + 3) == ':') {
166+
// (?x:...) construct - scan inside it
167+
// System.err.println("DEBUG: Found (?x: at position " + i);
168+
// Find the matching closing paren
169+
int closePos = findMatchingParen(content, i);
170+
if (closePos > i + 4) {
171+
String innerContent = content.substring(i + 4, closePos);
172+
scanForNestedConstructs(innerContent, originalRegex, contentStart + i + 4);
173+
}
174+
}
175+
}
176+
}
177+
i++;
178+
}
179+
}
180+
181+
private static int findMatchingParen(String s, int start) {
182+
int depth = 1;
183+
int i = start + 1;
184+
while (i < s.length() && depth > 0) {
185+
char c = s.charAt(i);
186+
if (c == '(') depth++;
187+
else if (c == ')') depth--;
188+
i++;
189+
}
190+
return depth == 0 ? i - 1 : -1;
191+
}
192+
136193
/**
137194
* Tokenize the extended character class content
138195
* Automatically handles whitespace (xx mode)
@@ -141,6 +198,8 @@ private static List<Token> tokenizeExtendedClass(String content, String original
141198
List<Token> tokens = new ArrayList<>();
142199
int i = 0;
143200

201+
// System.err.println("DEBUG: tokenizeExtendedClass content: '" + content + "'");
202+
144203
while (i < content.length()) {
145204
// Skip whitespace (automatic /xx mode)
146205
while (i < content.length() && Character.isWhitespace(content.charAt(i))) {
@@ -221,6 +280,12 @@ private static List<Token> tokenizeExtendedClass(String content, String original
221280
}
222281

223282
tokens.add(new Token(TokenType.EOF, "", contentStart + content.length()));
283+
284+
// Check if we have any meaningful tokens
285+
if (tokens.size() == 1 && tokens.get(0).type == TokenType.EOF) {
286+
RegexPreprocessor.regexError(originalRegex, contentStart, "Incomplete expression within '(?[ ])'");
287+
}
288+
224289
return tokens;
225290
}
226291

0 commit comments

Comments
 (0)