Skip to content

Commit 5ad0261

Browse files
committed
fix: throw error on known anchor elimination issues
Throw `UnsupportedSyntaxError` for expressions with known unsupported start/end anchor patterns. This is better than processing them incorrectly.
1 parent a146f14 commit 5ad0261

File tree

8 files changed

+172
-156
lines changed

8 files changed

+172
-156
lines changed

benchmark/toStdRegex_output_length-result.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
failed instances:
3-
- parseError : 45
3+
- parseError : 49
44
- cacheOverflow : 89
55
- veryLargeSyntaTree : 24
66
- stackOverflow : 12

src/ast.ts

Lines changed: 116 additions & 110 deletions
Large diffs are not rendered by default.

src/parser.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ export type ParseResult<T> = { value: T, restInput: string }
44

55
export class ParseError extends Error {
66

7+
name = "ParseError"
8+
79
constructor(
810
message: string,
911
public readonly restInput: string

src/regex-parser.ts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ const unescapedCharInsideBrackets = P.satisfy(char => !Range.mustBeEscapedInside
2929
const unescapedCharOutsideBrackets = P.satisfy(char => !Range.mustBeEscapedOutsideBrackets(char))
3030
.map(CharSet.singleton)
3131

32-
export class UnsupportedSyntaxError extends Error {}
32+
export class UnsupportedSyntaxError extends Error {
33+
name = "UnsupportedSyntaxError"
34+
}
3335

3436
const escapeSequence = P.string('\\').andThen(_ => P.anyChar).andThen(escapedChar => {
3537
switch (escapedChar) {
@@ -45,16 +47,16 @@ const escapeSequence = P.string('\\').andThen(_ => P.anyChar).andThen(escapedCha
4547
case 'v': return P.pure(CharSet.singleton('\v')) // vertical tab
4648
case 'f': return P.pure(CharSet.singleton('\f')) // form feed
4749
case '0': return P.pure(CharSet.singleton('\0')) // NUL character
48-
case 'b': throw new UnsupportedSyntaxError('\b word-boundary assertion not supported')
49-
case 'c': throw new UnsupportedSyntaxError('\cX control characters not supported')
50+
case 'b': throw new UnsupportedSyntaxError('\b word-boundary assertion')
51+
case 'c': throw new UnsupportedSyntaxError('\cX control characters')
5052
case 'x': return P.count(2, P.hexChar).map(chars =>
5153
CharSet.fromRange(Range.singleton(parseInt(chars.join(''), 16)))
5254
)
5355
case 'u': return P.count(4, P.hexChar).map(chars =>
5456
CharSet.fromRange(Range.singleton(parseInt(chars.join(''), 16)))
5557
)
56-
case 'p': throw new UnsupportedSyntaxError('\\p not supported')
57-
case 'P': throw new UnsupportedSyntaxError('\\P not supported')
58+
case 'p': throw new UnsupportedSyntaxError('\\p')
59+
case 'P': throw new UnsupportedSyntaxError('\\P')
5860
default: return P.pure(CharSet.singleton(escapedChar)) // match character literally
5961
}
6062
})
@@ -175,7 +177,7 @@ const lookbehind: P.Parser<AST.RegExpAST> =
175177
P.string(')'),
176178
regex(),
177179
).map(_ => {
178-
throw new UnsupportedSyntaxError('lookbehind assertions are not supported')
180+
throw new UnsupportedSyntaxError('lookbehind assertions')
179181
})
180182

181183
function regexTerm() {

src/regex.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,9 @@ export function isEmpty(regex: ExtRegex): boolean {
428428
return regex.type === 'literal' && CharSet.isEmpty(regex.charset)
429429
}
430430

431-
export class CacheOverflowError extends Error {}
431+
export class CacheOverflowError extends Error {
432+
name = "CacheOverflowError"
433+
}
432434

433435
export function codePointDerivative(codePoint: number, regex: StdRegex, cache: Table.Table<StdRegex>): StdRegex
434436
export function codePointDerivative(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex
@@ -484,7 +486,7 @@ function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table
484486
// At least errors can be caught and handled. The limit is somewhat arbitrary.
485487
// TODO: maybe make this user configurable:
486488
if (Table.size(cache) >= 10_000) {
487-
throw new CacheOverflowError('Cache overflow while computing DFA transitions.')
489+
throw new CacheOverflowError('while computing DFA transitions.')
488490
}
489491

490492
const result = codePointDerivative(codePoint, regex, cache)
@@ -668,7 +670,9 @@ function derivativeClassesAux(
668670
///// exclusive standard regex utils /////
669671
//////////////////////////////////////////////
670672

671-
export class VeryLargeSyntaxTreeError extends Error {}
673+
export class VeryLargeSyntaxTreeError extends Error {
674+
name = "VeryLargeSyntaxTreeError"
675+
}
672676

673677
/**
674678
* TODO: docs

test/arbitrary-ast.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ function endAnchor(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AS
7878

7979
/**
8080
* Traverses AST and renames capturing groups if the name already occurs in the expression.
81-
* `new RegExp(...)` throws an error when capture group names occur multiple times in the
81+
* `new RegExp(...)` throws an error when capture group names occur multiple times in the
8282
* same expression.
8383
*/
8484
export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
@@ -94,7 +94,7 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
9494
return renameIfSeen(newName)
9595
}
9696
}
97-
97+
9898
function traverse(node: AST.RegExpAST): AST.RegExpAST {
9999
switch (node.type) {
100100
case 'epsilon':
@@ -115,7 +115,7 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
115115
return AST.repeat(traverse(node.inner), node.bounds)
116116
case 'capture-group': {
117117
const innerProcessed = traverse(node.inner)
118-
118+
119119
if (node.name === undefined) {
120120
return AST.captureGroup(innerProcessed, node.name)
121121
} else {
@@ -133,7 +133,7 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
133133
checkedAllCases(node)
134134
}
135135
}
136-
136+
137137
return traverse(ast)
138138
}
139139

test/ast.spec.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ describe('toExtRegex', () => {
4444
[/^a^b/, RE.empty],
4545
// but two ^^ directly in a row are not a contradiction:
4646
[/(^^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
47-
// in fact, as long as anything between two ^ can match epsilon,
47+
// in fact, as long as anything between two ^ can match epsilon,
4848
// there is no contradiction:
4949
[/(^(c|)^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
5050
[/(^c*^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
@@ -60,6 +60,9 @@ describe('toExtRegex', () => {
6060
// Nullable expressions on the left and right can be ignored:
6161
[/(a?)$^(b*)/, RE.epsilon],
6262

63+
// Contradiction inside lookahead collapses to empty set. Then empty set lookahead can't match anything:
64+
[/(?=a^)/, RE.empty],
65+
6366
[/(^a|)^b/, RE.seq([RE.singleChar('b'), dotStar])],
6467
[/^a(b^|c)/, RE.seq([RE.string('ac'), dotStar]) ],
6568
[/(^|a)b/, prefix(RE.concat(RE.optional(suffix(RE.singleChar('a'))), RE.singleChar('b')))],
@@ -89,7 +92,7 @@ describe('toExtRegex', () => {
8992
// negative lookahead:
9093
[/^a(?!b)c$/, RE.concat(RE.string('a'), RE.intersection(RE.complement(RE.string('b')), RE.string('c')))],
9194
// TODO: lookahead + lookbehind
92-
// [/^a(?=b)(?<=a)b$/, RE.string('ab')],
95+
// [/^a(?=b)(?<=a)b$/, RE.string('ab')],
9396
// [/^b(?=ab)a(?<=ba)b$/, RE.string('bab')],
9497
// [/^a(?=b)(?<=a)(?!a)(?<!b)b$/, RE.string('ab')],
9598
] as const
@@ -104,11 +107,11 @@ describe('toExtRegex', () => {
104107
it('fixme', { todo: true }, () => {
105108
const actual = AST.toExtRegex(parseRegExp(/^(a(?!b))*$/))
106109
const expected = RE.star(RE.string('a'))
107-
assert.equal(actual.hash, expected.hash)
110+
assert.equal(actual.hash, expected.hash)
108111
})
109112

110113
})
111-
114+
112115
})
113116

114117
describe('toString', () => {

test/regex-parser.spec.ts

Lines changed: 27 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { describe, it, test } from "node:test"
22
import assert from "node:assert"
3-
import { parseRegExp, parseRegExpString } from "../src/regex-parser"
3+
import { parseRegExp, parseRegExpString, UnsupportedSyntaxError } from "../src/regex-parser"
44
import { RB } from "../src/index"
55
import { ParseError } from "../src/parser"
66
import * as AST from "../src/ast"
@@ -12,7 +12,7 @@ import * as Arbitrary from './arbitrary-ast'
1212
function char(c: string) {
1313
return AST.literal(CharSet.singleton(c))
1414
}
15-
function str(s: string) {
15+
function str(s: string) {
1616
const chars = [...s].map(char)
1717
// Use right-associative concatenation: a(bc) not (ab)c
1818
return chars.reduceRight((acc, curr) => AST.concat(curr, acc))
@@ -44,7 +44,7 @@ describe('parseRegExp', () => {
4444
[/a{3,5}/, AST.repeat(char('a'), { min: 3, max: 5 })],
4545
// if curly bracket is not terminated the whole thing is interpreted literally:
4646
[/a{3,5/, str('a{3,5')],
47-
// same if max value is given but min value is missing:
47+
// same if max value is given but min value is missing:
4848
[/a{,5}/, str('a{,5}')],
4949
// char classes / escaping:
5050
[/\w/, AST.literal(CharSet.wordChars)],
@@ -72,21 +72,21 @@ describe('parseRegExp', () => {
7272
[/^abc$/, AST.startAnchor(undefined, AST.endAnchor(str('abc'), undefined))],
7373
[/$a^/, AST.startAnchor(AST.endAnchor(undefined, char('a')), undefined)],
7474
// positive lookahead - now parsed as lookahead AST nodes, not intersections
75-
[/(?=a)b/, AST.lookahead(true, char('a'), char('b'))],
76-
[/(?=a)(?:b)/, AST.lookahead(true, char('a'), char('b'))],
77-
[/(?=a)(?=b)c/, AST.lookahead(true, char('a'), AST.lookahead(true, char('b'), char('c')))],
78-
[/a(?=b)c/, AST.concat(char('a'), AST.lookahead(true, char('b'), char('c')))],
79-
[/a(?=b)/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.epsilon))],
80-
[/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.concat(char('c'), AST.lookahead(true, char('d'), char('e')))))],
81-
[/(?=)/, AST.lookahead(true, AST.epsilon, AST.epsilon)],
75+
[/(?=a)b/, AST.lookahead(true, char('a'), char('b'))],
76+
[/(?=a)(?:b)/, AST.lookahead(true, char('a'), char('b'))],
77+
[/(?=a)(?=b)c/, AST.lookahead(true, char('a'), AST.lookahead(true, char('b'), char('c')))],
78+
[/a(?=b)c/, AST.concat(char('a'), AST.lookahead(true, char('b'), char('c')))],
79+
[/a(?=b)/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.epsilon))],
80+
[/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.concat(char('c'), AST.lookahead(true, char('d'), char('e')))))],
81+
[/(?=)/, AST.lookahead(true, AST.epsilon, AST.epsilon)],
8282
// negative lookahead
83-
[/(?!a)b/, AST.lookahead(false, char('a'), char('b'))],
83+
[/(?!a)b/, AST.lookahead(false, char('a'), char('b'))],
8484
[/(?!a)b|c/, AST.union(AST.lookahead(false, char('a'), char('b')), char('c'))],
8585
[/(?!)/, AST.lookahead(false, AST.epsilon, AST.epsilon)],
8686
// TODO: positive lookbehind
87-
// [/(?<=a)/, AST.positiveLookbehind(char('a'))],
87+
// [/(?<=a)/, AST.positiveLookbehind(char('a'))],
8888
// TODO: negative lookbehind
89-
// [/(?<!a)/, AST.negativeLookbehind(char('a'))],
89+
// [/(?<!a)/, AST.negativeLookbehind(char('a'))],
9090
// some special chars don't need escape when inside brackets:
9191
[/[.^$*+?()[{-|]/, AST.literal(CharSet.fromArray([...'.^$*+?()[{-|']))],
9292
// other special chars need escape even inside brackets:
@@ -134,12 +134,24 @@ describe('parseRegExp', () => {
134134

135135
})
136136

137+
function parse_skipKnownIssues(re: RegExp) {
138+
try {
139+
return RB(re)
140+
} catch (error) {
141+
if (error instanceof UnsupportedSyntaxError) {
142+
fc.pre(false)
143+
} else {
144+
throw error
145+
}
146+
}
147+
}
148+
137149
test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => {
138150
fc.assert(
139151
fc.property(
140152
Arbitrary.regexp(),
141153
(inputRegExp: RegExp) => {
142-
const builder = RB(inputRegExp)
154+
const builder = parse_skipKnownIssues(inputRegExp)
143155
const outputRegExp = builder.toRegExp()
144156

145157
for (const str of builder.enumerate().take(10)) {
@@ -148,19 +160,6 @@ test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => {
148160
}
149161
},
150162
),
151-
// { numRuns: 1000 },
163+
{ numRuns: 100 },
152164
)
153165
})
154-
155-
test('fixme 1', { todo: true }, () => {
156-
const inputRegExp = /(^)+a/
157-
const builder = RB(inputRegExp)
158-
const outputRegExp = builder.toRegExp()
159-
160-
// console.debug(outputRegExp)
161-
162-
for (const str of builder.enumerate().take(10)) {
163-
assert.match(str, outputRegExp)
164-
assert.match(str, inputRegExp)
165-
}
166-
})

0 commit comments

Comments
 (0)