Skip to content

Commit 8e8f527

Browse files
committed
refactor(ast): unify positive/negative lookahead
1 parent 3cbcf57 commit 8e8f527

File tree

4 files changed

+48
-88
lines changed

4 files changed

+48
-88
lines changed

src/ast.ts

Lines changed: 30 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@ export type RegExpAST =
2323
| { type: "optional", inner: RegExpAST }
2424
| { type: "repeat", inner: RegExpAST, bounds: RepeatBounds }
2525
| { type: "capture-group", name?: string, inner: RegExpAST }
26-
| { type: "positive-lookahead", inner: RegExpAST, right: RegExpAST }
27-
| { type: "negative-lookahead", inner: RegExpAST, right: RegExpAST }
26+
| { type: "lookahead", isPositive: boolean, inner: RegExpAST, right: RegExpAST }
2827
| { type: "start-anchor", left: RegExpAST, right: RegExpAST }
2928
| { type: "end-anchor", left: RegExpAST, right: RegExpAST }
3029

@@ -56,8 +55,7 @@ function isNullable(ast: RegExpAST): boolean {
5655
}
5756
}
5857
case "capture-group": return isNullable(ast.inner)
59-
case "positive-lookahead": return isNullable(ast.inner) && isNullable(ast.right)
60-
case "negative-lookahead": return !isNullable(ast.inner) && isNullable(ast.right)
58+
// Nullability must respect the lookahead's polarity: `(?=inner)right` accepts
// the empty string only if `inner` does, whereas `(?!inner)right` accepts it
// only if `inner` does NOT. Comparing against `ast.isPositive` covers both,
// preserving the behavior of the two separate cases this unified case replaces.
case "lookahead": return isNullable(ast.inner) === ast.isPositive && isNullable(ast.right)
6159
case "start-anchor": return isNullable(ast.left) && isNullable(ast.right)
6260
case "end-anchor": return isNullable(ast.left) && isNullable(ast.right)
6361
}
@@ -80,8 +78,7 @@ function desugar(ast: RegExpAST): RegExpAST {
8078
case 'star': return star(desugar(ast.inner))
8179
case 'start-anchor': return startAnchor(desugar(ast.left), desugar(ast.right))
8280
case 'end-anchor': return endAnchor(desugar(ast.left), desugar(ast.right))
83-
case 'positive-lookahead': return positiveLookahead(desugar(ast.inner), desugar(ast.right))
84-
case 'negative-lookahead': return negativeLookahead(desugar(ast.inner), desugar(ast.right))
81+
case 'lookahead': return lookahead(ast.isPositive, desugar(ast.inner), desugar(ast.right))
8582
// sugar nodes:
8683
case 'capture-group': return desugar(ast.inner)
8784
case 'plus': {
@@ -243,26 +240,15 @@ function pullUpStartAnchor(ast: RegExpAST): RegExpAST {
243240
return endAnchor(left, undefined)
244241
}
245242
}
246-
case "positive-lookahead": {
243+
case "lookahead": {
247244
const inner = pullUpStartAnchor(ast.inner)
248245
const right = pullUpStartAnchor(ast.right)
249246
if (inner.type === 'start-anchor') {
250-
throw new UnsupportedSyntaxError('start anchors (^) inside lookaheads are not supported')
247+
throw new UnsupportedSyntaxError('start anchors inside lookaheads like (?=^a) are not supported')
251248
} else if (right.type === 'start-anchor') {
252-
return startAnchor(undefined, positiveLookahead(ast.inner, right.right))
249+
return startAnchor(undefined, lookahead(ast.isPositive, ast.inner, right.right))
253250
} else {
254-
return positiveLookahead(ast.inner, right)
255-
}
256-
}
257-
case "negative-lookahead": {
258-
const inner = pullUpStartAnchor(ast.inner)
259-
const right = pullUpStartAnchor(ast.right)
260-
if (inner.type === 'start-anchor') {
261-
throw new UnsupportedSyntaxError('start anchors (^) inside lookaheads are not supported')
262-
} else if (right.type === 'start-anchor') {
263-
return startAnchor(undefined, negativeLookahead(ast.inner, right.right))
264-
} else {
265-
return negativeLookahead(ast.inner, right)
251+
return lookahead(ast.isPositive, ast.inner, right)
266252
}
267253
}
268254
}
@@ -392,26 +378,15 @@ function pullUpEndAnchor(ast: RegExpAST): RegExpAST {
392378
return endAnchor(left, undefined) // i.e. `l$`
393379
}
394380
}
395-
case "positive-lookahead": {
381+
case "lookahead": {
396382
const inner = pullUpEndAnchor(ast.inner)
397383
const right = pullUpEndAnchor(ast.right)
398384
if (inner.type === 'end-anchor') {
399-
throw new UnsupportedSyntaxError('end anchors ($) inside lookaheads are not supported')
385+
throw new UnsupportedSyntaxError('end anchors inside lookaheads like (?=a$) are not supported')
400386
} else if (right.type === 'end-anchor') {
401-
return endAnchor(positiveLookahead(ast.inner, right.left), undefined)
387+
return endAnchor(lookahead(ast.isPositive, ast.inner, right.left), undefined)
402388
} else {
403-
return positiveLookahead(ast.inner, right)
404-
}
405-
}
406-
case "negative-lookahead": {
407-
const inner = pullUpEndAnchor(ast.inner)
408-
const right = pullUpEndAnchor(ast.right)
409-
if (inner.type === 'end-anchor') {
410-
throw new UnsupportedSyntaxError('end anchors ($) inside lookaheads are not supported')
411-
} else if (right.type === 'end-anchor') {
412-
return endAnchor(negativeLookahead(ast.inner, right.right), undefined)
413-
} else {
414-
return negativeLookahead(ast.inner, right)
389+
return lookahead(ast.isPositive, ast.inner, right)
415390
}
416391
}
417392
}
@@ -458,15 +433,13 @@ function toExtRegexAux(ast: RegExpAST): RE.ExtRegex {
458433
case 'concat': return RE.concat(toExtRegexAux(ast.left), toExtRegexAux(ast.right))
459434
case 'union': return RE.union(toExtRegexAux(ast.left), toExtRegexAux(ast.right))
460435
case 'star': return RE.star(toExtRegexAux(ast.inner))
461-
case 'positive-lookahead': {
462-
const inner = toExtRegexAux(ast.inner)
463-
const right = toExtRegexAux(ast.right)
464-
return RE.intersection(inner, right)
465-
}
466-
case 'negative-lookahead': {
436+
case 'lookahead': {
467437
const inner = toExtRegexAux(ast.inner)
468438
const right = toExtRegexAux(ast.right)
469-
return RE.intersection(RE.complement(inner), right)
439+
if (ast.isPositive)
440+
return RE.intersection(inner, right)
441+
else
442+
return RE.intersection(RE.complement(inner), right)
470443
}
471444
}
472445
checkedAllCases(ast.type)
@@ -525,18 +498,12 @@ export function captureGroup(inner: RegExpAST, name?: string): RegExpAST {
525498
return { type: 'capture-group', inner, name }
526499
}
527500

528-
export function positiveLookahead(
529-
inner: RegExpAST,
530-
right: RegExpAST,
531-
): RegExpAST {
532-
return { type: 'positive-lookahead', inner, right }
533-
}
534-
535-
export function negativeLookahead(
501+
export function lookahead(
502+
isPositive: boolean,
536503
inner: RegExpAST,
537504
right: RegExpAST,
538505
): RegExpAST {
539-
return { type: 'negative-lookahead', inner, right }
506+
return { type: 'lookahead', isPositive, inner, right }
540507
}
541508

542509
//////////////////////////////////////////////
@@ -587,10 +554,8 @@ function debugShow_(ast: RegExpAST): unknown {
587554
return { type: 'repeat', inner: debugShow_(ast.inner), bounds: ast.bounds }
588555
case 'capture-group':
589556
return { type: 'capture-group', name: ast.name, inner: debugShow_(ast.inner) }
590-
case 'positive-lookahead':
591-
return { type: 'positive-lookahead', inner: debugShow_(ast.inner) }
592-
case 'negative-lookahead':
593-
return { type: 'negative-lookahead', inner: debugShow_(ast.inner) }
557+
case 'lookahead':
558+
return { type: 'lookahead', isPositive: ast.isPositive, inner: debugShow_(ast.inner) }
594559
}
595560
checkedAllCases(ast)
596561
}
@@ -644,10 +609,14 @@ export function toString(ast: RegExpAST, options: RenderOptions): string {
644609

645610
case 'capture-group':
646611
return captureGroupToString(ast.name, ast.inner, options)
647-
case 'positive-lookahead':
648-
return '(?=' + toString(ast.inner, options) + ')' + maybeWithParens(ast.right, ast, options)
649-
case 'negative-lookahead':
650-
return '(?!' + toString(ast.inner, options) + ')' + maybeWithParens(ast.right, ast, options)
612+
case 'lookahead': {
613+
const inner = toString(ast.inner, options)
614+
const right = maybeWithParens(ast.right, ast, options)
615+
if (ast.isPositive)
616+
return '(?=' + inner + ')' + right
617+
else
618+
return '(?!' + inner + ')' + right
619+
}
651620
}
652621
checkedAllCases(ast)
653622
}
@@ -666,8 +635,7 @@ function precLevel(nodeType: RegExpAST['type']) {
666635

667636
case 'concat': return 4
668637

669-
case 'positive-lookahead': return 3
670-
case 'negative-lookahead': return 3
638+
case 'lookahead': return 3
671639

672640
case 'start-anchor': return 2
673641
case 'end-anchor': return 2

src/regex-parser.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ function positiveLookAhead(): P.Expr.UnaryOperator<AST.RegExpAST> {
195195
// FIXME: that allows ^/$ inside lookaheads but that isn't
196196
// handled correctly right now.
197197
regex(),
198-
).map(inner => right => AST.positiveLookahead(inner, right))
198+
).map(inner => right => AST.lookahead(true, inner, right))
199199
}
200200

201201
function negativeLookAhead(): P.Expr.UnaryOperator<AST.RegExpAST> {
@@ -205,7 +205,7 @@ function negativeLookAhead(): P.Expr.UnaryOperator<AST.RegExpAST> {
205205
// FIXME: that allows ^/$ inside lookaheads but that isn't
206206
// handled correctly right now.
207207
regex(),
208-
).map(inner => right => AST.negativeLookahead(inner, right))
208+
).map(inner => right => AST.lookahead(false, inner, right))
209209
}
210210

211211
/**

test/arbitrary-ast.ts

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,9 @@ function captureGroup(innerArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary
6161
.map(([inner, name]) => AST.captureGroup(inner, name))
6262
}
6363

64-
function positiveLookahead(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
65-
return fc.tuple(childArb(), childArb())
66-
.map(([inner, right]) => AST.positiveLookahead(inner, right))
67-
}
68-
69-
function negativeLookahead(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
70-
return fc.tuple(childArb(), childArb())
71-
.map(([inner, right]) => AST.negativeLookahead(inner, right))
64+
function lookahead(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
65+
return fc.tuple(fc.boolean(), childArb(), childArb())
66+
.map(([isPositive, inner, right]) => AST.lookahead(isPositive, inner, right))
7267
}
7368

7469
function startAnchor(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
@@ -128,10 +123,8 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
128123
return AST.captureGroup(innerProcessed, nameProcessed)
129124
}
130125
}
131-
case 'positive-lookahead':
132-
return AST.positiveLookahead(traverse(node.inner), traverse(node.right))
133-
case 'negative-lookahead':
134-
return AST.negativeLookahead(traverse(node.inner), traverse(node.right))
126+
case 'lookahead':
127+
return AST.lookahead(node.isPositive, traverse(node.inner), traverse(node.right))
135128
case 'start-anchor':
136129
return AST.startAnchor(traverse(node.left), traverse(node.right))
137130
case 'end-anchor':
@@ -165,8 +158,7 @@ function regexpAST_(size: number): fc.Arbitrary<AST.RegExpAST> {
165158
{ arbitrary: optional(() => regexpAST_(childSize)), weight: 1 },
166159
{ arbitrary: repeat(() => regexpAST_(childSize)), weight: 1 },
167160
{ arbitrary: captureGroup(() => regexpAST_(childSize)), weight: 2 },
168-
{ arbitrary: positiveLookahead(() => regexpAST_(childSize)), weight: 1 },
169-
{ arbitrary: negativeLookahead(() => regexpAST_(childSize)), weight: 1 },
161+
{ arbitrary: lookahead(() => regexpAST_(childSize)), weight: 1 },
170162
{ arbitrary: startAnchor(() => regexpAST_(childSize)), weight: 1 },
171163
{ arbitrary: endAnchor(() => regexpAST_(childSize)), weight: 1 }
172164
)

test/regex-parser.spec.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,17 @@ describe('parseRegExp', () => {
7272
[/^abc$/, AST.startAnchor(undefined, AST.endAnchor(str('abc'), undefined))],
7373
[/$a^/, AST.startAnchor(AST.endAnchor(undefined, char('a')), undefined)],
7474
// positive lookahead - now parsed as lookahead AST nodes, not intersections
75-
[/(?=a)b/, AST.positiveLookahead(char('a'), char('b'))],
76-
[/(?=a)(?:b)/, AST.positiveLookahead(char('a'), char('b'))],
77-
[/(?=a)(?=b)c/, AST.positiveLookahead(char('a'), AST.positiveLookahead(char('b'), char('c')))],
78-
[/a(?=b)c/, AST.concat(char('a'), AST.positiveLookahead(char('b'), char('c')))],
79-
[/a(?=b)/, AST.concat(char('a'), AST.positiveLookahead(char('b'), AST.epsilon))],
80-
[/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.positiveLookahead(char('b'), AST.concat(char('c'), AST.positiveLookahead(char('d'), char('e')))))],
81-
[/(?=)/, AST.positiveLookahead(AST.epsilon, AST.epsilon)],
75+
[/(?=a)b/, AST.lookahead(true, char('a'), char('b'))],
76+
[/(?=a)(?:b)/, AST.lookahead(true, char('a'), char('b'))],
77+
[/(?=a)(?=b)c/, AST.lookahead(true, char('a'), AST.lookahead(true, char('b'), char('c')))],
78+
[/a(?=b)c/, AST.concat(char('a'), AST.lookahead(true, char('b'), char('c')))],
79+
[/a(?=b)/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.epsilon))],
80+
[/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.concat(char('c'), AST.lookahead(true, char('d'), char('e')))))],
81+
[/(?=)/, AST.lookahead(true, AST.epsilon, AST.epsilon)],
8282
// negative lookahead
83-
[/(?!a)b/, AST.negativeLookahead(char('a'), char('b'))],
84-
[/(?!a)b|c/, AST.union(AST.negativeLookahead(char('a'), char('b')), char('c'))],
85-
[/(?!)/, AST.negativeLookahead(AST.epsilon, AST.epsilon)],
83+
[/(?!a)b/, AST.lookahead(false, char('a'), char('b'))],
84+
[/(?!a)b|c/, AST.union(AST.lookahead(false, char('a'), char('b')), char('c'))],
85+
[/(?!)/, AST.lookahead(false, AST.epsilon, AST.epsilon)],
8686
// TODO: positive lookbehind
8787
// [/(?<=a)/, AST.positiveLookbehind(char('a'))],
8888
// TODO: negative lookbehind

0 commit comments

Comments
 (0)