@@ -15,7 +15,22 @@ public class ExtendedCharClass {
15
15
* @return Position after the closing ])
16
16
*/
17
17
static int handleExtendedCharacterClass (String s , int offset , StringBuilder sb , RegexFlags regexFlags ) {
18
+ // System.err.println("DEBUG: handleExtendedCharacterClass called at offset " + offset);
19
+
18
20
int start = offset + 3 ; // Skip past '(?['
21
+
22
+ // First, check if this is an empty extended character class
23
+ int i = start ;
24
+ while (i < s .length () && Character .isWhitespace (s .charAt (i ))) {
25
+ i ++;
26
+ }
27
+
28
+ if (i < s .length () && s .charAt (i ) == ']' && i + 1 < s .length () && s .charAt (i + 1 ) == ')' ) {
29
+ // This is an empty extended character class
30
+ RegexPreprocessor .regexError (s , start , "Incomplete expression within '(?[ ])'" );
31
+ }
32
+
33
+ // Now find the end
19
34
int end = findExtendedClassEnd (s , start );
20
35
21
36
if (end == -1 ) {
@@ -24,6 +39,13 @@ static int handleExtendedCharacterClass(String s, int offset, StringBuilder sb,
24
39
25
40
String content = s .substring (start , end );
26
41
42
+ // System.err.println("DEBUG: ExtendedCharClass content: '" + content + "'");
43
+
44
+ // Check for empty or whitespace-only content
45
+ if (content .trim ().isEmpty ()) {
46
+ RegexPreprocessor .regexError (s , start , "Incomplete expression within '(?[ ])'" );
47
+ }
48
+
27
49
// try {
28
50
// Parse and transform the extended character class
29
51
String transformed = transformExtendedClass (content , s , start );
@@ -44,72 +66,56 @@ private static int findExtendedClassEnd(String s, int start) {
44
66
int depth = 1 ;
45
67
int i = start ;
46
68
boolean inEscape = false ;
47
- boolean inCharClass = false ;
48
- boolean firstInClass = false ;
49
- boolean afterCaret = false ;
69
+
70
+ // System.err.println("DEBUG: findExtendedClassEnd starting at position " + start) ;
71
+ // System.err.println("DEBUG: Looking at: '" + s.substring(start) + "'") ;
50
72
51
73
while (i < s .length () && depth > 0 ) {
52
74
char c = s .charAt (i );
53
75
54
76
if (inEscape ) {
55
77
inEscape = false ;
56
- firstInClass = false ;
57
- afterCaret = false ;
58
78
i ++;
59
79
continue ;
60
80
}
61
81
82
+ // Check for nested (?[...])
83
+ if (c == '(' && i + 2 < s .length () && s .charAt (i + 1 ) == '?' && s .charAt (i + 2 ) == '[' ) {
84
+ // System.err.println("DEBUG: Found nested (?[ at position " + i);
85
+ // We found a nested extended character class
86
+ // We need to skip over it entirely
87
+ int nestedEnd = findExtendedClassEnd (s , i + 3 );
88
+ if (nestedEnd == -1 ) {
89
+ // The nested one is unterminated, but we should continue
90
+ // The error will be caught when that nested one is processed
91
+ return -1 ;
92
+ }
93
+ // Skip to after the "])'" of the nested extended class
94
+ i = nestedEnd + 2 ; // +2 for "])"
95
+ continue ;
96
+ }
97
+
62
98
switch (c ) {
63
99
case '\\' :
64
100
inEscape = true ;
65
- firstInClass = false ;
66
- afterCaret = false ;
67
101
break ;
68
102
case '[' :
69
- if (!inCharClass ) {
70
- inCharClass = true ;
71
- firstInClass = true ;
72
- afterCaret = false ;
73
- }
74
103
depth ++;
75
104
break ;
76
- case '^' :
77
- if (inCharClass && firstInClass ) {
78
- afterCaret = true ;
79
- }
80
- firstInClass = false ;
81
- break ;
82
105
case ']' :
83
- // Special case: ] immediately after [ or [^ is literal
84
- if (inCharClass && (firstInClass || afterCaret )) {
85
- // This is a literal ], don't decrease depth
86
- firstInClass = false ;
87
- afterCaret = false ;
88
- } else {
89
- depth --;
90
- if (inCharClass && depth > 0 ) {
91
- // We just closed a character class
92
- inCharClass = false ;
106
+ depth --;
107
+ if (depth == 0 ) {
108
+ // Check if this ends the extended character class
109
+ int j = i + 1 ;
110
+ while (j < s .length () && Character .isWhitespace (s .charAt (j ))) {
111
+ j ++;
93
112
}
94
-
95
- if (depth == 0 ) {
96
- // Check if this ends the extended character class
97
- int j = i + 1 ;
98
- while (j < s .length () && Character .isWhitespace (s .charAt (j ))) {
99
- j ++;
100
- }
101
- if (j < s .length () && s .charAt (j ) == ')' ) {
102
- return i ;
103
- }
104
- // Not properly terminated
105
- return -1 ;
113
+ if (j < s .length () && s .charAt (j ) == ')' ) {
114
+ // System.err.println("DEBUG: Found end of extended class at position " + i);
115
+ return i ;
106
116
}
107
- }
108
- break ;
109
- default :
110
- if (!Character .isWhitespace (c )) {
111
- firstInClass = false ;
112
- afterCaret = false ;
117
+ // Not properly terminated
118
+ return -1 ;
113
119
}
114
120
break ;
115
121
}
@@ -123,6 +129,10 @@ private static int findExtendedClassEnd(String s, int start) {
123
129
* Transform the extended character class content into Java syntax
124
130
*/
125
131
private static String transformExtendedClass (String content , String originalRegex , int contentStart ) {
132
+ // Before tokenizing, scan for nested regex constructs and process them
133
+ // This allows us to catch errors in nested patterns
134
+ scanForNestedConstructs (content , originalRegex , contentStart );
135
+
126
136
// Tokenize the expression
127
137
List <Token > tokens = tokenizeExtendedClass (content , originalRegex , contentStart );
128
138
@@ -133,6 +143,53 @@ private static String transformExtendedClass(String content, String originalRege
133
143
return evaluateExtendedClass (tree );
134
144
}
135
145
146
+ private static void scanForNestedConstructs (String content , String originalRegex , int contentStart ) {
147
+ int i = 0 ;
148
+ while (i < content .length ()) {
149
+ char c = content .charAt (i );
150
+
151
+ // Look for (?...) constructs
152
+ if (c == '(' && i + 1 < content .length () && content .charAt (i + 1 ) == '?' ) {
153
+ // System.err.println("DEBUG: Found nested (? at position " + i + " in extended char class content");
154
+
155
+ // Check what type of construct this is
156
+ if (i + 2 < content .length ()) {
157
+ char nextChar = content .charAt (i + 2 );
158
+
159
+ if (nextChar == '[' ) {
160
+ // Nested extended character class - process it recursively
161
+ // System.err.println("DEBUG: Found nested (?[ at position " + i);
162
+ // Call handleExtendedCharacterClass recursively
163
+ StringBuilder dummySb = new StringBuilder ();
164
+ ExtendedCharClass .handleExtendedCharacterClass (originalRegex , contentStart + i , dummySb , RegexFlags .fromModifiers ("" , "" ));
165
+ } else if (nextChar == 'x' && i + 3 < content .length () && content .charAt (i + 3 ) == ':' ) {
166
+ // (?x:...) construct - scan inside it
167
+ // System.err.println("DEBUG: Found (?x: at position " + i);
168
+ // Find the matching closing paren
169
+ int closePos = findMatchingParen (content , i );
170
+ if (closePos > i + 4 ) {
171
+ String innerContent = content .substring (i + 4 , closePos );
172
+ scanForNestedConstructs (innerContent , originalRegex , contentStart + i + 4 );
173
+ }
174
+ }
175
+ }
176
+ }
177
+ i ++;
178
+ }
179
+ }
180
+
181
+ private static int findMatchingParen (String s , int start ) {
182
+ int depth = 1 ;
183
+ int i = start + 1 ;
184
+ while (i < s .length () && depth > 0 ) {
185
+ char c = s .charAt (i );
186
+ if (c == '(' ) depth ++;
187
+ else if (c == ')' ) depth --;
188
+ i ++;
189
+ }
190
+ return depth == 0 ? i - 1 : -1 ;
191
+ }
192
+
136
193
/**
137
194
* Tokenize the extended character class content
138
195
* Automatically handles whitespace (xx mode)
@@ -141,6 +198,8 @@ private static List<Token> tokenizeExtendedClass(String content, String original
141
198
List <Token > tokens = new ArrayList <>();
142
199
int i = 0 ;
143
200
201
+ // System.err.println("DEBUG: tokenizeExtendedClass content: '" + content + "'");
202
+
144
203
while (i < content .length ()) {
145
204
// Skip whitespace (automatic /xx mode)
146
205
while (i < content .length () && Character .isWhitespace (content .charAt (i ))) {
@@ -221,6 +280,12 @@ private static List<Token> tokenizeExtendedClass(String content, String original
221
280
}
222
281
223
282
tokens .add (new Token (TokenType .EOF , "" , contentStart + content .length ()));
283
+
284
+ // Check if we have any meaningful tokens
285
+ if (tokens .size () == 1 && tokens .get (0 ).type == TokenType .EOF ) {
286
+ RegexPreprocessor .regexError (originalRegex , contentStart , "Incomplete expression within '(?[ ])'" );
287
+ }
288
+
224
289
return tokens ;
225
290
}
226
291
0 commit comments