Skip to content

Commit 6e2b8ab

Browse files
committed
fix: temp disable lookaheads
Lookahead assertions are broken in many ways. Now throwing UnsupportedSyntaxError for expressions that contain them. Refs: #13
1 parent 5ad0261 commit 6e2b8ab

File tree

9 files changed

+215
-211
lines changed

9 files changed

+215
-211
lines changed

benchmark/aoc2023-day12-result.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11

2-
Part 1: 7191 (time: 973ms)
3-
Part 2: 6512849198636 (time: 11618ms)
2+
Part 1: 7191 (time: 1048ms)
3+
Part 2: 6512849198636 (time: 11977ms)

benchmark/parser-bench-result.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
error ratio : 17 / 749
3-
total parse time : 5908ms
4-
avg parse time : 8.07103825136612ms
5-
max parse time : 919ms
2+
error ratio : 10 / 749
3+
total parse time : 5663ms
4+
avg parse time : 7.663058186738836ms
5+
max parse time : 849ms
Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11

22
failed instances:
3-
- parseError : 49
4-
- cacheOverflow : 89
5-
- veryLargeSyntaTree : 24
3+
- parseError : 111
4+
- cacheOverflow : 80
5+
- veryLargeSyntaTree : 23
66
- stackOverflow : 12
77
- regexSyntaxError : 0
88

99
size multipliers:
10-
- mean : 321.86932736154387
11-
- median : 1.1923076923076923
10+
- mean : 330.0766901004811
11+
- median : 1.6666666666666667
1212
- max : 178441.75438596492
1313
- min : 0.0021570319240724763
1414

1515
memory:
16-
- max heap used : 2491.6993865966797 MB
17-
- max rss : 2909.78515625 MB
16+
- max heap used : 2354.146553039551 MB
17+
- max rss : 2755.58203125 MB

equiv-checker.html

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,6 @@ <h4>Supported syntax:</h4>
363363
<li>Character classes: <code>.</code>, <code>\w</code>, <code>[a-zA-Z]</code>, ...</li>
364364
<li>Escaping: <code>\$</code>, <code>\.</code>, ...</li>
365365
<li>(Non-)capturing groups: <code>(?...)</code>, <code>(...)</code></li>
366-
<li>Positive/negative lookahead: <code>(?=...)</code>, <code>(?!...)</code></li>
367366
</ul>
368367

369368
<h4>Unsupported syntax:</h4>
@@ -372,7 +371,8 @@ <h4>Unsupported syntax:</h4>
372371
<li>Local flags: <code>(?i:...)</code>, ...</li>
373372
<li>Unicode property escapes: <code>\p{...}</code>, <code>\P{...}</code></li>
374373
<li>Backreferences: <code>\1</code>, <code>\2</code>, ...</li>
375-
<li>Lookbehind assertions: <code>(?&lt;=...)</code>, <code>(?&lt;!...)</code></li>
374+
<li>Positive/negative lookahead: <code>(?=...)</code>, <code>(?!...)</code></li>
375+
<li>Positive/negative lookbehind assertions: <code>(?&lt;=...)</code>, <code>(?&lt;!...)</code></li>
376376
<li>Word boundary: <code>\b</code>, <code>\B</code></li>
377377
</ul>
378378

@@ -407,10 +407,10 @@ <h4>Powered by:</h4>
407407
const urlParams = new URLSearchParams(window.location.search);
408408
const regex1 = urlParams.get('regexp1');
409409
const regex2 = urlParams.get('regexp2');
410-
410+
411411
if (regex1) regex1Input.value = regex1;
412412
if (regex2) regex2Input.value = regex2;
413-
413+
414414
// Auto-check equivalence if both parameters are present
415415
if (regex1 && regex2) {
416416
setTimeout(checkEquivalence, 100);
@@ -422,14 +422,14 @@ <h4>Powered by:</h4>
422422
const urlParams = new URLSearchParams();
423423
const regex1 = regex1Input.value.trim();
424424
const regex2 = regex2Input.value.trim();
425-
425+
426426
if (regex1) urlParams.set('regexp1', regex1);
427427
if (regex2) urlParams.set('regexp2', regex2);
428-
429-
const newURL = urlParams.toString() ?
430-
`${window.location.pathname}?${urlParams.toString()}` :
428+
429+
const newURL = urlParams.toString() ?
430+
`${window.location.pathname}?${urlParams.toString()}` :
431431
window.location.pathname;
432-
432+
433433
window.history.replaceState(null, '', newURL);
434434
}
435435

@@ -474,9 +474,9 @@ <h4>Powered by:</h4>
474474
const hasEndAnchor1 = pattern1.includes('$');
475475
const hasStartAnchor2 = pattern2.includes('^');
476476
const hasEndAnchor2 = pattern2.includes('$');
477-
477+
478478
const shouldShowAnchorInfo = !hasStartAnchor1 || !hasEndAnchor1 || !hasStartAnchor2 || !hasEndAnchor2;
479-
479+
480480
// Show/hide anchor info banner
481481
if (shouldShowAnchorInfo) {
482482
anchorInfoBanner.style.display = 'block';
@@ -516,7 +516,7 @@ <h4>Powered by:</h4>
516516
const hasValidationIssue = !(
517517
stringsMatchingAButNotB.every(str => regexA.test(str) && !regexB.test(str)) &&
518518
stringsMatchingBButNotA.every(str => regexB.test(str) && !regexA.test(str))
519-
);
519+
);
520520
const examples = {
521521
regex1Only: stringsMatchingAButNotB.length > 0 ? stringsMatchingAButNotB : null,
522522
regex2Only: stringsMatchingBButNotA.length > 0 ? stringsMatchingBButNotA : null
@@ -574,12 +574,12 @@ <h4>Powered by:</h4>
574574

575575
// Build the complete HTML structure
576576
let html = '';
577-
577+
578578
// Add warning banner if there's a validation issue
579579
if (hasValidationIssue) {
580580
html += '<div class="mismatch-warning">Something went wrong. The presented example strings don\'t match the regex as claimed. Please file an issue on GitHub with your input regex.</div>';
581581
}
582-
582+
583583
html += message;
584584

585585
// Add Venn diagram (only for subset/superset/not-equivalent cases)
@@ -591,7 +591,7 @@ <h4>Powered by:</h4>
591591
if (examples) {
592592
html += '<p>Example strings matched by one RegExp but not the other:</p>'
593593
html += '<div style="display: flex; gap: 20px; margin-top: 15px;">';
594-
594+
595595
// Left column for RegExp 1 examples
596596
html += '<div style="flex: 1;">';
597597
if (examples.regex1Only && examples.regex1Only.length > 0) {
@@ -601,7 +601,7 @@ <h4>RegExp 1 only</h4>
601601
</p>`;
602602
}
603603
html += '</div>';
604-
604+
605605
// Right column for RegExp 2 examples
606606
html += '<div style="flex: 1;">';
607607
if (examples.regex2Only && examples.regex2Only.length > 0) {
@@ -611,7 +611,7 @@ <h4>RegExp 2 only</h4>
611611
</p>`;
612612
}
613613
html += '</div>';
614-
614+
615615
html += '</div>';
616616
}
617617

src/ast.ts

Lines changed: 71 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,10 @@ export type RegExpAST =
2323
| { type: "optional", inner: RegExpAST }
2424
| { type: "repeat", inner: RegExpAST, bounds: RepeatBounds }
2525
| { type: "capture-group", name?: string, inner: RegExpAST }
26-
| { type: "lookahead", isPositive: boolean, inner: RegExpAST, right: RegExpAST }
26+
| { type: "lookahead", isPositive: boolean, inner: RegExpAST }
27+
| { type: "lookbehind", isPositive: boolean, inner: RegExpAST }
2728
| { type: "start-anchor", left: RegExpAST, right: RegExpAST }
28-
| { type: "end-anchor", left: RegExpAST, right: RegExpAST }
29+
| { type: "end-anchor", left: RegExpAST, right: RegExpAST }
2930

3031
export type RenderOptions = {
3132
useNonCapturingGroups: boolean
@@ -55,7 +56,8 @@ function isNullable(ast: RegExpAST): boolean {
5556
}
5657
}
5758
case "capture-group": return isNullable(ast.inner)
58-
case "lookahead": return isNullable(ast.inner) && isNullable(ast.right)
59+
case "lookahead": return isNullable(ast.inner) // TODO: is this correct?
60+
case "lookbehind": return isNullable(ast.inner) // TODO: is this correct?
5961
case "start-anchor": return isNullable(ast.left) && isNullable(ast.right)
6062
case "end-anchor": return isNullable(ast.left) && isNullable(ast.right)
6163
}
@@ -78,7 +80,8 @@ function desugar(ast: RegExpAST): RegExpAST {
7880
case 'star': return star(desugar(ast.inner))
7981
case 'start-anchor': return startAnchor(desugar(ast.left), desugar(ast.right))
8082
case 'end-anchor': return endAnchor(desugar(ast.left), desugar(ast.right))
81-
case 'lookahead': return lookahead(ast.isPositive, desugar(ast.inner), desugar(ast.right))
83+
case 'lookahead': return lookahead(ast.isPositive, desugar(ast.inner))
84+
case 'lookbehind': return lookbehind(ast.isPositive, desugar(ast.inner))
8285
// sugar nodes:
8386
case 'capture-group': return desugar(ast.inner)
8487
case 'plus': {
@@ -252,17 +255,20 @@ function pullUpStartAnchor(ast: RegExpAST, isLeftClosed: boolean): RegExpAST {
252255
return endAnchor(left, undefined)
253256
}
254257
}
255-
case "lookahead": {
256-
const inner = pullUpStartAnchor(ast.inner, true)
257-
const right = pullUpStartAnchor(ast.right, isLeftClosed)
258-
if (inner.type === 'start-anchor') {
259-
throw new UnsupportedSyntaxError('start anchors inside lookaheads like (?=^a)')
260-
} else if (right.type === 'start-anchor') {
261-
return startAnchor(undefined, lookahead(ast.isPositive, ast.inner, right.right))
262-
} else {
263-
return lookahead(ast.isPositive, inner, right)
264-
}
265-
}
258+
case "lookahead":
259+
// FIXME:
260+
// const inner = pullUpStartAnchor(ast.inner, true)
261+
// const right = pullUpStartAnchor(ast.right, isLeftClosed)
262+
// if (inner.type === 'start-anchor') {
263+
// throw new UnsupportedSyntaxError('start anchors inside lookaheads like (?=^a)')
264+
// } else if (right.type === 'start-anchor') {
265+
// return startAnchor(undefined, lookahead(ast.isPositive, ast.inner, right.right))
266+
// } else {
267+
// return lookahead(ast.isPositive, inner, right)
268+
// }
269+
throw new UnsupportedSyntaxError('lookahead assertion')
270+
case 'lookbehind':
271+
throw new UnsupportedSyntaxError('lookbehind assertion')
266272
}
267273
checkedAllCases(ast.type)
268274
}
@@ -384,17 +390,20 @@ function pullUpEndAnchor(ast: RegExpAST, isRightClosed: boolean): RegExpAST {
384390
return endAnchor(left, undefined) // i.e. `l$`
385391
}
386392
}
387-
case "lookahead": {
388-
const inner = pullUpEndAnchor(ast.inner, false)
389-
const right = pullUpEndAnchor(ast.right, isRightClosed)
390-
if (inner.type === 'end-anchor') {
391-
throw new UnsupportedSyntaxError('end anchors inside lookaheads like (?=a$)')
392-
} else if (right.type === 'end-anchor') {
393-
return endAnchor(lookahead(ast.isPositive, ast.inner, right.left), undefined)
394-
} else {
395-
return lookahead(ast.isPositive, inner, right)
396-
}
397-
}
393+
case "lookahead":
394+
// FIXME:
395+
// const inner = pullUpEndAnchor(ast.inner, false)
396+
// const right = pullUpEndAnchor(ast.right, isRightClosed)
397+
// if (inner.type === 'end-anchor') {
398+
// throw new UnsupportedSyntaxError('end anchors inside lookaheads like (?=a$)')
399+
// } else if (right.type === 'end-anchor') {
400+
// return endAnchor(lookahead(ast.isPositive, ast.inner, right.left), undefined)
401+
// } else {
402+
// return lookahead(ast.isPositive, inner, right)
403+
// }
404+
throw new UnsupportedSyntaxError('lookahead assertion')
405+
case "lookbehind":
406+
throw new UnsupportedSyntaxError('lookbehind assertion')
398407
}
399408
checkedAllCases(ast.type)
400409
}
@@ -426,27 +435,30 @@ export function toExtRegex(ast: RegExpAST): RE.ExtRegex {
426435
// no end anchors anywhere and we have to append the implicit `.*`:
427436
ast = concat(ast, dotStar)
428437
}
429-
438+
430439
return toExtRegexAux(ast)
431440
}
432441
function toExtRegexAux(ast: RegExpAST): RE.ExtRegex {
433442
assert(!isOneOf(ast.type, sugarNodeTypes), `Got ${ast.type} node. Expected desugared AST.`)
434-
assert(ast.type !== 'start-anchor', `Unexpected start anchor. Should already be eliminated.`)
435-
assert(ast.type !== 'end-anchor', `Unexpected end anchor. Should already be eliminated.`)
443+
assert(ast.type !== 'start-anchor', `Unexpected start anchor. Should already be eliminated.`)
444+
assert(ast.type !== 'end-anchor', `Unexpected end anchor. Should already be eliminated.`)
436445
switch (ast.type) {
437446
case 'epsilon': return RE.epsilon
438447
case 'literal': return RE.literal(ast.charset)
439448
case 'concat': return RE.concat(toExtRegexAux(ast.left), toExtRegexAux(ast.right))
440449
case 'union': return RE.union(toExtRegexAux(ast.left), toExtRegexAux(ast.right))
441450
case 'star': return RE.star(toExtRegexAux(ast.inner))
442-
case 'lookahead': {
443-
const inner = toExtRegexAux(ast.inner)
444-
const right = toExtRegexAux(ast.right)
445-
if (ast.isPositive)
446-
return RE.intersection(inner, right)
447-
else
448-
return RE.intersection(RE.complement(inner), right)
449-
}
451+
case 'lookahead':
452+
// FIXME:
453+
// const inner = toExtRegexAux(ast.inner)
454+
// const right = toExtRegexAux(ast.right)
455+
// if (ast.isPositive)
456+
// return RE.intersection(inner, right)
457+
// else
458+
// return RE.intersection(RE.complement(inner), right)
459+
throw new UnsupportedSyntaxError('lookahead assertion')
460+
case 'lookbehind':
461+
throw new UnsupportedSyntaxError('lookbehind assertion')
450462
}
451463
checkedAllCases(ast.type)
452464
}
@@ -475,7 +487,7 @@ export function concat(left: RegExpAST, right: RegExpAST): RegExpAST {
475487
return { type: 'concat', left, right }
476488
}
477489

478-
function seq(asts: RegExpAST[]): RegExpAST {
490+
export function seq(asts: RegExpAST[]): RegExpAST {
479491
if (asts.length === 0)
480492
return epsilon
481493
else
@@ -518,9 +530,15 @@ export function captureGroup(inner: RegExpAST, name?: string): RegExpAST {
518530
export function lookahead(
519531
isPositive: boolean,
520532
inner: RegExpAST,
521-
right: RegExpAST,
522533
): RegExpAST {
523-
return { type: 'lookahead', isPositive, inner, right }
534+
return { type: 'lookahead', isPositive, inner }
535+
}
536+
537+
/**
 * Creates a lookbehind assertion node, i.e. `(?<=inner)` when `isPositive`
 * is true and `(?<!inner)` otherwise.
 *
 * The node must carry the `'lookbehind'` tag (not `'lookahead'`): the
 * `switch` statements over `ast.type` dispatch on this tag to render the
 * `(?<=...)`/`(?<!...)` syntax and to reject the node with an
 * "unsupported syntax" error. With the wrong tag a lookbehind would be
 * handled as if it were a lookahead.
 *
 * @param isPositive - `true` for a positive lookbehind, `false` for a negative one.
 * @param inner - The sub-expression asserted by the lookbehind.
 * @returns A new AST node of type `'lookbehind'`.
 */
export function lookbehind(
  isPositive: boolean,
  inner: RegExpAST,
): RegExpAST {
  return { type: 'lookbehind', isPositive, inner }
}
525543

526544
//////////////////////////////////////////////
@@ -573,6 +591,8 @@ function debugShow_(ast: RegExpAST): unknown {
573591
return { type: 'capture-group', name: ast.name, inner: debugShow_(ast.inner) }
574592
case 'lookahead':
575593
return { type: 'lookahead', isPositive: ast.isPositive, inner: debugShow_(ast.inner) }
594+
case 'lookbehind':
595+
return { type: 'lookbehind', isPositive: ast.isPositive, inner: debugShow_(ast.inner) }
576596
}
577597
checkedAllCases(ast)
578598
}
@@ -632,11 +652,17 @@ export function toString(ast: RegExpAST, options: RenderOptions): string {
632652
return captureGroupToString(ast.name, ast.inner, options)
633653
case 'lookahead': {
634654
const inner = toString(ast.inner, options)
635-
const right = maybeWithParens(ast.right, ast, options)
636655
if (ast.isPositive)
637-
return '(?=' + inner + ')' + right
656+
return '(?=' + inner + ')'
657+
else
658+
return '(?!' + inner + ')'
659+
}
660+
case 'lookbehind': {
661+
const inner = toString(ast.inner, options)
662+
if (ast.isPositive)
663+
return '(?<=' + inner + ')'
638664
else
639-
return '(?!' + inner + ')' + right
665+
return '(?<!' + inner + ')'
640666
}
641667
}
642668
checkedAllCases(ast)
@@ -648,6 +674,8 @@ function precLevel(nodeType: RegExpAST['type']) {
648674
case 'epsilon': return 10
649675
case 'literal': return 10
650676
case 'capture-group': return 10
677+
case 'lookahead': return 10
678+
case 'lookbehind': return 10
651679

652680
case 'star': return 5
653681
case 'plus': return 5
@@ -656,8 +684,6 @@ function precLevel(nodeType: RegExpAST['type']) {
656684

657685
case 'concat': return 4
658686

659-
case 'lookahead': return 3
660-
661687
case 'start-anchor': return 2
662688
case 'end-anchor': return 2
663689

0 commit comments

Comments
 (0)