Skip to content

Commit 172f7fb

Browse files
committed
fix(parser): lookAhead chains
Fix various parser issues around lookAheads. Previously, all of these cases would lead to parse errors: /^(?=^a$)b$/, /^(?=^a$)(?:b)$/, /^(?=^a$)(?=^b$)c$/, /^a(?=^b$)c$/, /^a(?=^b$)$/, /^a(?=^b$)c(?=^d$)e$/. Also changed `optional` to not backtrack by default. With that, the number of failing instances in the parser benchmark is down to 279/749.
1 parent ffa6a78 commit 172f7fb

File tree

4 files changed

+114
-24
lines changed

4 files changed

+114
-24
lines changed

equiv-checker.html

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,9 +333,6 @@ <h4>Powered by:</h4>
333333
const diffAB = RB(regexA).without(regexB);
334334
const diffBA = RB(regexB).without(regexA);
335335

336-
console.log(diffAB.toRegExp())
337-
console.log(diffBA.toRegExp())
338-
339336
if (diffAB.isEmpty() && diffBA.isEmpty()) {
340337
// Equivalent
341338
showResult('✅ The regular expressions are equivalent!<br>Both expressions match exactly the same set of strings.', 'equivalent');
@@ -363,7 +360,7 @@ <h4>Powered by:</h4>
363360
if (error instanceof SyntaxError) {
364361
showResult(`❌ Invalid regex syntax: ${error.message}<br>Please check that your regular expressions use valid JavaScript regex syntax.`, 'error');
365362
} else if (error instanceof ParseError) {
366-
showResult(`🚧 Unsupported regex syntax: The syntax is valid but not supported by the checker yet<br/><br/>${error.message}`, 'error');
363+
showResult(`🚧 Unsupported regex syntax: The syntax is valid but not yet supported by the tool.<br/><br/>${error.message}`, 'error');
367364
} else if (error instanceof VeryLargeSyntaxTreeError || error instanceof CacheOverflowError) {
368365
showResult(`☠️ Excessive resource use detected: This example might be computationally hard.`, 'error');
369366
} else {
@@ -399,7 +396,6 @@ <h4>Powered by:</h4>
399396
html += '<div class="examples-list">';
400397
examples.forEach(str => {
401398
// const displayStr = str === '' ? '(empty string)' : str;
402-
console.debug({ str })
403399
html += `<code>${JSON.stringify(str)}</code> `;
404400
});
405401
html += '</div></div>';

src/parser.ts

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,15 @@ export function optional<T>(parser: Parser<T>): Parser<T | undefined> {
150150
return parser.run(input)
151151
} catch (error) {
152152
if (error instanceof ParseError) {
153-
// parser failed ==> return `undefined` and consume no characters:
154-
return { value: undefined, restInput: input }
153+
if (error.restInput === input) {
154+
// parser failed but did not consume any characters
155+
// ==> return `undefined` and consume no characters:
156+
return { value: undefined, restInput: input }
157+
} else {
158+
// parser failed and consumed characters.
159+
// ==> don't backtrack by default:
160+
throw error
161+
}
155162
} else {
156163
// Only catch parse errors, otherwise we silence logic errors:
157164
throw error
@@ -191,11 +198,27 @@ export function lazy<T>(createParser: () => Parser<T>): Parser<T> {
191198
return pure(null).andThen(createParser)
192199
}
193200

201+
export function tryElseBacktrack<T>(parser: Parser<T>): Parser<T> {
202+
return new Parser(input => {
203+
try {
204+
return parser.run(input)
205+
} catch (error) {
206+
if (error instanceof ParseError) {
207+
// restore original `input` and pretend that the parser did
208+
// not consume any characters:
209+
throw new ParseError(error.message, input)
210+
} else {
211+
throw error
212+
}
213+
}
214+
})
215+
}
216+
194217
export namespace Expr {
195218

196219
export type UnaryOperator<T> = Parser<(inner: T) => T>
197220

198-
export type BinaryOperator<T> = Parser<(left: T, right: T) => T>
221+
export type BinaryOperator<T, R=T> = Parser<(left: T, right: T) => R>
199222

200223
function prefixOp<T>(
201224
operator: UnaryOperator<T>,
@@ -245,11 +268,31 @@ export namespace Expr {
245268
)
246269
}
247270

271+
/**
272+
* Right-associative infix operator where both the left and the right
273+
* operand can be optional.
274+
*/
275+
export function infixOpRightAssocOptional<T>(
276+
left: T | undefined,
277+
operatorParser: BinaryOperator<T | undefined, T>,
278+
rightParser: Parser<T>,
279+
): Parser<T> {
280+
return operatorParser.andThen(op =>
281+
optional(rightParser).andThen(right =>
282+
choice([
283+
infixOpRightAssocOptional(right, operatorParser, rightParser),
284+
pure(right)
285+
])
286+
).map(right => op(left, right))
287+
)
288+
}
289+
248290
export type Operator<T> = Readonly<
249291
| { type: 'prefix', op: Expr.UnaryOperator<T> }
250292
| { type: 'postfix', op: Expr.UnaryOperator<T> }
251293
| { type: 'infixLeft', op: Expr.BinaryOperator<T> }
252294
| { type: 'infixRight', op: Expr.BinaryOperator<T> }
295+
| { type: 'infixRightOptional', op: Expr.BinaryOperator<T | undefined, T> }
253296
>
254297

255298
function addPrecLevel<T>(
@@ -281,6 +324,16 @@ export namespace Expr {
281324
pure(left)
282325
])
283326
)
327+
case 'infixRightOptional':
328+
return optional(termParser).andThen(left => {
329+
if (left === undefined)
330+
return infixOpRightAssocOptional(left, operator.op, termParser)
331+
else
332+
return choice([
333+
infixOpRightAssocOptional(left, operator.op, termParser),
334+
pure(left)
335+
])
336+
})
284337
}
285338
checkedAllCases(operator)
286339
}

src/regex-parser.ts

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const regExpFlags = [
1515
'sticky',
1616
] as const
1717

18-
type RegExpFlag = typeof regExpFlags[number]
18+
// type RegExpFlag = typeof regExpFlags[number]
1919

2020
// TODO:
2121
// - parse \uXXXX notation
@@ -99,7 +99,11 @@ const charSet = P.choice([
9999
])
100100

101101
const group = P.between(
102-
P.string('(').andThen(() => P.optional(P.string('?:'))),
102+
P.choice([
103+
P.string('(?:'),
104+
// TODO: named group: (?<name>...)
105+
P.string('('),
106+
]),
103107
P.string(')'),
104108
regex(),
105109
)
@@ -134,23 +138,56 @@ const boundedQuantifier: P.Expr.UnaryOperator<RE.ExtRegex> = P.between(
134138
function regexTerm() {
135139
return P.choice([
136140
wildcard,
137-
group,
141+
P.tryElseBacktrack(group),
138142
escapeSequence.map(RE.literal),
139143
charSet.map(RE.literal),
140144
])
141145
}
142146

143-
function lookAhead(): P.Expr.UnaryOperator<RE.ExtRegex> {
147+
function positiveLookAhead(): P.Expr.UnaryOperator<RE.ExtRegex> {
144148
return P.between(
145-
P.string('(?'),
149+
P.string('(?='),
146150
P.string(')'),
147-
P.choice([
148-
// positive lookahead
149-
P.string('=').andThen(_ => regexWithBounds()),
150-
// negative lookahead
151-
P.string('!').andThen(_ => regexWithBounds().map(RE.complement)),
152-
]).map(
153-
left => right => RE.intersection(left, right)
151+
regexWithBounds()
152+
).map(inner => right =>
153+
RE.intersection(inner, right)
154+
)
155+
}
156+
157+
function negativeLookAhead(): P.Expr.UnaryOperator<RE.ExtRegex> {
158+
return P.between(
159+
P.string('(?!'),
160+
P.string(')'),
161+
regexWithBounds()
162+
).map(inner => right =>
163+
RE.intersection(RE.complement(inner), right)
164+
)
165+
}
166+
167+
/**
168+
* We treat lookAheads like a right-associative infix operator
169+
* even though it only "acts" on the right hand side:
170+
*
171+
* aaa (?=bbb) ccc
172+
*
173+
* We could treat it as a prefix operator but then it's
174+
* unclear what should have higher precedence: concat or
175+
* lookAhead? But even when treating lookAheads as infix
176+
* operators, they need special treatment because the left and
177+
* right operand can be optional:
178+
*
179+
* (?=bbb) fff
180+
* aaa (?=bbb)
181+
* aaa (?=bbb) (?!ccc) ddd
182+
*/
183+
function lookAheadOp(): P.Expr.BinaryOperator<RE.ExtRegex | undefined, RE.ExtRegex> {
184+
return P.choice([
185+
positiveLookAhead(),
186+
negativeLookAhead(),
187+
]).map(op => (left, right) =>
188+
RE.concat(
189+
left ?? RE.string(''),
190+
op(right ?? RE.string(''))
154191
)
155192
)
156193
}
@@ -164,7 +201,7 @@ function regex(): P.Parser<RE.ExtRegex> {
164201
{ type: 'postfix', op: P.string('+').map(_ => RE.plus) },
165202
{ type: 'postfix', op: P.string('?').map(_ => RE.optional) },
166203
{ type: 'infixRight', op: P.string('').map(_ => RE.concat) },
167-
{ type: 'prefix', op: lookAhead() },
204+
{ type: 'infixRightOptional', op: lookAheadOp() },
168205
{ type: 'infixRight', op: P.string('|').map(_ => RE.union) },
169206
]
170207
))
@@ -185,7 +222,6 @@ export function parseRegexString(
185222
): RE.ExtRegex {
186223
const { value, restInput } = regexWithBounds().run(regexStr)
187224
if (restInput === '') {
188-
// TODO: parsing should always return stdandard regex instances:
189225
return value
190226
} else {
191227
throw new P.ParseError('Expected end of input.', restInput)

test/regex-parser.spec.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,18 @@ describe('parseRegexString', () => {
3636
// non-capturing groups
3737
[/^(?:ab)$/, RE.string('ab')],
3838
// positive lookahead
39-
[/^(?=^a$)a$/, RE.intersection(RE.string('a'), RE.string('a'))],
39+
[/^(?=^a$)b$/, RE.intersection(RE.string('a'), RE.string('b'))],
40+
[/^(?=^a$)(?:b)$/, RE.intersection(RE.string('a'), RE.string('b'))],
41+
[/^(?=^a$)(?=^b$)c$/, RE.intersection(RE.string('a'), RE.intersection(RE.string('b'), RE.string('b')))],
42+
[/^a(?=^b$)c$/, RE.concat(RE.singleChar('a'), RE.intersection(RE.string('b'), RE.string('c')))],
43+
[/^a(?=^b$)$/, RE.concat(RE.string('a'), RE.intersection(RE.string('b'), RE.string('')))],
44+
[/^a(?=^b$)c(?=^d$)e$/, RE.concat(RE.string('a'), RE.intersection(RE.string('b'), RE.concat(RE.string('c'), RE.intersection(RE.string('d'), RE.string('e')))))],
4045
// negative lookahead
4146
[/^(?!^a$)b$/, RE.intersection(RE.complement(RE.string('a')), RE.string('b'))],
4247
[/^(?!^a$)b|c$/, RE.union(RE.intersection(RE.complement(RE.string('a')), RE.string('b')), RE.string('c'))],
4348
// some special chars don't need escape when inside brackets:
4449
[/^[.^$*+?()[{-|]$/, RE.literal(CharSet.fromArray([...'.^$*+?()[{-|']))],
45-
// other special chars need escaped even inside brackets:
50+
// other special chars need escape even inside brackets:
4651
[/^[\\\]\/]$/, RE.literal(CharSet.fromArray([...'\\]/']))],
4752
])('can parse %s', (regexp, expected) => {
4853
expect(parseRegExp(regexp)).toEqual(expected)

0 commit comments

Comments
 (0)