Skip to content

Commit 14d330b

Browse files
committed
feat: explicit/catchable errors
DFA construction can blow up in multiple ways that users can't handle, such as OOM kills, non-termination, and call stack overflows. Instead, we should throw explicit errors in advance. 1) We now throw an error if the caches used during derivative class computations grow too large. This directly protects against OOM kills but also implicitly protects against non-termination and stack overflows. It seems the cache size is a good proxy measure for state space explosion. 2) Converting a very large regex to `RegExp` can also take very long and may fail because the `RegExp` constructor rejects inputs with too many capturing groups. Although this error can already be caught, it can take a while until it is thrown. Thus, we throw a custom error early if the syntax tree is very large.
1 parent 19286df commit 14d330b

File tree

7 files changed

+197
-64
lines changed

7 files changed

+197
-64
lines changed

benchmark/regex_handwritten.js

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import fs from 'fs'
2+
3+
/**
 * Generator that yields one `RegExp` per usable entry of the hand-written
 * regex dataset stored in './benchmark/regex-dataset.json'.
 *
 * An entry is used only if its `flavor` is "javascript" and its `flags`
 * string is empty. Entries whose `regex` source is rejected by the `RegExp`
 * constructor are skipped with a warning instead of aborting the iteration.
 *
 * NOTE: the dataset path is relative, so it is resolved against the current
 * working directory of the process, not against this file's location.
 */
function* readHandWrittenDataset() {
4+
const jsonStr = fs.readFileSync('./benchmark/regex-dataset.json', 'utf-8')
5+
6+
for (const item of JSON.parse(jsonStr)) {
7+
// Only flag-less JavaScript regexes are considered:
if (item.flavor === "javascript" && item.flags === "") {
8+
try {
9+
yield new RegExp(item.regex) // , item.flags)
10+
} catch (e) {
11+
// The caught error is discarded; the warning does not say which entry was skipped.
console.warn('regex dataset: skipping invalid regex')
12+
}
13+
}
14+
}
15+
}
16+
17+
export default [...readHandWrittenDataset()]
18+

benchmark/toStdRegex_output_length.js

Lines changed: 60 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,77 @@
11
import fc from 'fast-check'
22
import * as RE from '../dist/regex.js'
3+
import { ParseError } from '../dist/parser.js'
4+
import { UnsupportedSyntaxError } from '../dist/regex-parser.js'
35
import { parse, toStdRegex } from '../dist/low-level-api.js'
4-
import regexDataset from './regex_random_unique_no-nested-star_1000.js'
6+
import { regexToDFA } from '../dist/dfa.js'
7+
import randomRegexDataset from './regex_random_unique_no-nested-star_1000.js'
8+
import handwrittenRegexDataset from './regex_handwritten.js'
9+
10+
const fullRegexDataset = [
11+
...randomRegexDataset,
12+
...handwrittenRegexDataset,
13+
]
14+
515

616
let avgMult = 0
717
let maxMult = -Infinity
818

9-
const hardInstances = new Set([
10-
290, // call-stack overflow
11-
556, // takes very long
12-
658, // takes very long
13-
689, // call-stack overflow
14-
724, // takes very long
15-
])
16-
1719
function run(inputRegExp, index) {
18-
// skip some hard early instances:
19-
if (hardInstances.has(index)) return
20-
// only consider first 800 instances for now:
21-
if (index > 750) return
22-
2320
console.log('#' + index, inputRegExp)
21+
const startTime = performance.now()
2422

25-
const outputRegex = toStdRegex(parse(inputRegExp))
26-
try {
27-
const outputRegExp = RE.toRegExp(outputRegex)
23+
const inputRegex = parse(inputRegExp)
24+
const outputRegex = toStdRegex(inputRegex)
25+
const outputRegExp = RE.toRegExp(outputRegex)
2826

29-
const inp = inputRegExp.source.length
30-
const out = outputRegExp.source.length
31-
const mult = out/inp
27+
const inp = inputRegExp.source.length
28+
const out = outputRegExp.source.length
29+
const mult = out/inp
3230

33-
avgMult = (avgMult*index + mult)/(index+1)
34-
if (mult > maxMult) {
35-
maxMult = mult
36-
}
37-
38-
console.log(`
39-
regex input length : ${inp}
40-
regex ouptut length : ${out}
41-
multiplier : ${mult}
42-
avg. multiplier : ${avgMult}
43-
worst multiplier : ${maxMult}
44-
`)
45-
} catch (err) {
46-
console.log('too many captures')
31+
avgMult = (avgMult*index + mult)/(index+1)
32+
if (mult > maxMult) {
33+
maxMult = mult
4734
}
35+
36+
console.log(`
37+
regex input length : ${inp}
38+
regex ouptut length : ${out}
39+
multiplier : ${mult}
40+
avg. multiplier : ${avgMult}
41+
worst multiplier : ${maxMult}
42+
`)
4843
}
4944

50-
const timeStart = performance.now()
45+
let parseError = 0
46+
let cacheOverflow = 0
47+
let veryLargeSyntaTree = 0
48+
let stackOverflow = 0
49+
let regexSyntaxError = 0
5150

52-
regexDataset
53-
// do short (likely easier) instances first and see how far we get:
54-
.sort((a,b) => a.source.length - b.source.length)
55-
.forEach(run)
51+
fullRegexDataset.forEach((regex, i) => {
52+
try {
53+
run(regex, i)
54+
} catch (e) {
55+
if (e instanceof ParseError || e instanceof UnsupportedSyntaxError) {
56+
parseError++
57+
} else if (e instanceof RE.CacheOverflowError) {
58+
cacheOverflow++
59+
} else if (e instanceof RE.VeryLargeSyntaxTreeError) {
60+
veryLargeSyntaTree++
61+
} else if (e instanceof RangeError) {
62+
stackOverflow++
63+
} else if (e instanceof SyntaxError) {
64+
regexSyntaxError++
65+
} else {
66+
throw e
67+
}
68+
}
69+
})
5670

57-
console.log('time:', performance.now() - timeStart)
71+
console.debug('failed instances: ', {
72+
parseError,
73+
cacheOverflow,
74+
veryLargeSyntaTree,
75+
stackOverflow,
76+
regexSyntaxError
77+
})

src/dfa.ts

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ function regexToDFA(regex: RE.ExtRegex): DFA {
4848
transitions,
4949
)
5050
worklist.push(targetState)
51-
// console.debug('state count: ', allStates.size)
51+
// if (allStates.size % 100 === 0) {
52+
// console.debug({ stateCount: allStates.size })
53+
// }
5254
} else {
5355
Table.set(
5456
sourceState.hash,
@@ -179,10 +181,10 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
179181
}
180182

181183
// TODO: can this round-trip through DFA construction be avoided?
182-
export function toStdRegex(regex: RE.ExtRegex): RE.StdRegex {
183-
const dfa = regexToDFA(regex)
184-
// console.debug('dfa done')
185-
return dfaToRegex(dfa)
184+
export function toStdRegex(inputRegex: RE.ExtRegex): RE.StdRegex {
185+
const dfa = regexToDFA(inputRegex)
186+
const outputRegex = dfaToRegex(dfa)
187+
return outputRegex
186188
}
187189

188190
// function printTrans(trans: Table.Table<CharSet.CharSet>) {

src/regex-parser.ts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ const codePoint = singleChar.map(char => {
5050
return result
5151
})
5252

53+
/**
 * Thrown by the parser when the input uses regex syntax that is recognized
 * but not supported, e.g. the escape sequences `\b`, `\cX`, `\x`, `\u`,
 * `\p` and `\P`. Being a dedicated subclass of `Error`, it can be told
 * apart from other failures with `instanceof`.
 */
export class UnsupportedSyntaxError extends Error {}
54+
5355
const escapeSequence = P.string('\\').andThen(_ => P.anyChar).map(escapedChar => {
5456
switch (escapedChar) {
5557
case 'w': return CharSet.wordChars
@@ -64,12 +66,12 @@ const escapeSequence = P.string('\\').andThen(_ => P.anyChar).map(escapedChar =>
6466
case 'v': return CharSet.singleton('\v') // vertical tab
6567
case 'f': return CharSet.singleton('\f') // form feed
6668
case '0': return CharSet.singleton('\0') // NUL character
67-
case 'b': throw new Error('\b word-boundary assertion not supported')
68-
case 'c': throw new Error('\cX control characters not supported')
69-
case 'x': throw new Error('\\x not supported')
70-
case 'u': throw new Error('\\u not supported')
71-
case 'p': throw new Error('\\p not supported')
72-
case 'P': throw new Error('\\P not supported')
69+
case 'b': throw new UnsupportedSyntaxError('\b word-boundary assertion not supported')
70+
case 'c': throw new UnsupportedSyntaxError('\cX control characters not supported')
71+
case 'x': throw new UnsupportedSyntaxError('\\x not supported')
72+
case 'u': throw new UnsupportedSyntaxError('\\u not supported')
73+
case 'p': throw new UnsupportedSyntaxError('\\p not supported')
74+
case 'P': throw new UnsupportedSyntaxError('\\P not supported')
7375
default: return CharSet.singleton(escapedChar) // match character literally
7476
}
7577
})

src/regex.ts

Lines changed: 95 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,8 @@ export function isEmpty(regex: ExtRegex): boolean {
471471
return regex.type === 'literal' && CharSet.isEmpty(regex.charset)
472472
}
473473

474+
/**
 * Thrown by the cache-size checks in this file when a memoization cache has
 * reached its limit (currently 10_000 entries). It is raised deliberately so
 * that callers get a catchable error instead of the process running out of
 * memory.
 */
export class CacheOverflowError extends Error {}
475+
474476
export function codePointDerivative(codePoint: number, regex: StdRegex, cache: Table.Table<StdRegex>): StdRegex
475477
export function codePointDerivative(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex
476478
export function codePointDerivative(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex {
@@ -521,6 +523,13 @@ function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table
521523
function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex {
522524
const cachedResult = Table.get(codePoint, regex.hash, cache)
523525
if (cachedResult === undefined) {
526+
// Rather throw an error when cache grows too large than getting OOM killed.
527+
// At least errors can be caught and handled. The limit is somewhat arbitrary.
528+
// TODO: maybe make this user configurable:
529+
if (Table.size(cache) >= 10_000) {
530+
throw new CacheOverflowError('Cache overflow while computing DFA transitions.')
531+
}
532+
524533
const result = codePointDerivative(codePoint, regex, cache)
525534
Table.set(codePoint, regex.hash, result, cache)
526535
return result
@@ -608,6 +617,13 @@ function allNonEmptyIntersections(
608617
return resultCached
609618
}
610619

620+
// Rather throw an error when cache grows too large than getting OOM killed.
621+
// At least errors can be caught and handled. The limit is somewhat arbitrary.
622+
// TODO: maybe make this user configurable:
623+
if (Table.size(cache) >= 10_000) {
624+
throw new CacheOverflowError()
625+
}
626+
611627
const result: CharSet.CharSet[] = []
612628
for (const classA of classesA) {
613629
for (const classB of classesB) {
@@ -668,12 +684,20 @@ export function derivativeClasses(
668684
}
669685
checkedAllCases(regex)
670686
}
687+
671688
function derivativeClassesAux(
672689
regex: ExtRegex,
673690
cache: DerivativeClassesCache
674691
) {
675692
const cachedResult = cache.classes.get(regex.hash)
676693
if (cachedResult === undefined) {
694+
// Rather throw an error when cache grows too large than getting OOM killed.
695+
// At least errors can be caught and handled. The limit is somewhat arbitrary.
696+
// TODO: maybe make this user configurable:
697+
if (cache.classes.size >= 10_000) {
698+
throw new CacheOverflowError()
699+
}
700+
677701
const result = derivativeClasses(regex, cache)
678702
cache.classes.set(regex.hash, result)
679703
return result
@@ -687,6 +711,8 @@ function derivativeClassesAux(
687711
///// exclusive standard regex utils /////
688712
//////////////////////////////////////////////
689713

714+
/**
 * Thrown by `toString` when the syntax tree of the regex to render has more
 * than 1_000_000 nodes (as measured by `nodeCount`), i.e. before any attempt
 * to build the string is made.
 */
export class VeryLargeSyntaxTreeError extends Error {}
715+
690716
/**
691717
* TODO: docs
692718
*
@@ -697,7 +723,20 @@ export function toRegExp(regex: StdRegex): RegExp {
697723
}
698724

699725
export function toString(regex: ExtRegex): string {
700-
return '^(' + astToString(toRegExpAST(regex)) + ')$'
726+
const size = nodeCount(regex)
727+
if (size > 1_000_000) {
728+
throw new VeryLargeSyntaxTreeError(
729+
"Won't try to convert to RegExp. Syntax tree has over 1_000_000 nodes."
730+
)
731+
}
732+
733+
// Render parenthesis as non-capturing groups if there is a large number of them,
734+
// i.e. `/(?:abc)` instead of `/(abc)/`. `new RegExp(...)` throws an error if there
735+
// is a large number of capturing groups. Non-capturing groups are a bit more verbose
736+
// but at large sizes like this it doesn't matter anyway:
737+
const useNonCapturingGroups = size > 10_000
738+
739+
return '^(' + astToString(toRegExpAST(regex), { useNonCapturingGroups }) + ')$'
701740
}
702741

703742
// TODO: information is duplicated in parser:
@@ -786,37 +825,43 @@ function toRegExpAST(regex: ExtRegex): RegExpAST {
786825
checkedAllCases(regex)
787826
}
788827

789-
function astToString(ast: RegExpAST): string {
828+
/**
 * Options threaded through `astToString` / `maybeWithParens` when rendering
 * a syntax tree as a string.
 *
 * `useNonCapturingGroups`: when true, sub-expressions that need grouping are
 * wrapped in `(?:` ... `)` instead of `(` ... `)`.
 */
type RenderOptions = {
829+
useNonCapturingGroups: boolean
830+
}
831+
832+
function astToString(ast: RegExpAST, options: RenderOptions): string {
790833
switch (ast.type) {
791834
case 'epsilon':
792835
return ''
793836
case 'literal':
794837
return CharSet.toString(ast.charset)
795838
case 'concat':
796-
return maybeWithParens(ast.left, ast) + maybeWithParens(ast.right, ast)
839+
return maybeWithParens(ast.left, ast, options) + maybeWithParens(ast.right, ast, options)
797840
case 'union':
798-
return maybeWithParens(ast.left, ast) + '|' + maybeWithParens(ast.right, ast)
841+
return maybeWithParens(ast.left, ast, options) + '|' + maybeWithParens(ast.right, ast, options)
799842
case 'star':
800-
return maybeWithParens(ast.inner, ast) + '*'
843+
return maybeWithParens(ast.inner, ast, options) + '*'
801844
case 'plus':
802-
return maybeWithParens(ast.inner, ast) + '+'
845+
return maybeWithParens(ast.inner, ast, options) + '+'
803846
case 'optional':
804-
return maybeWithParens(ast.inner, ast) + '?'
847+
return maybeWithParens(ast.inner, ast, options) + '?'
805848
case 'boundedQuantifier':
806-
return maybeWithParens(ast.inner, ast) + '{' + ast.count + '}'
849+
return maybeWithParens(ast.inner, ast, options) + '{' + ast.count + '}'
807850
case 'complement':
808-
return '¬' + maybeWithParens(ast.inner, ast)
851+
return '¬' + maybeWithParens(ast.inner, ast, options)
809852
case 'intersection':
810-
return maybeWithParens(ast.left, ast) + '∩' + maybeWithParens(ast.right, ast)
853+
return maybeWithParens(ast.left, ast, options) + '∩' + maybeWithParens(ast.right, ast, options)
811854
}
812855
checkedAllCases(ast)
813856
}
814857

815-
function maybeWithParens(ast: RegExpAST, parent: RegExpAST): string {
858+
function maybeWithParens(ast: RegExpAST, parent: RegExpAST, options: RenderOptions): string {
816859
if (ast.type === parent.type || precLevel(ast.type) > precLevel(parent.type))
817-
return astToString(ast)
860+
return astToString(ast, options)
861+
else if (options.useNonCapturingGroups)
862+
return '(?:' + astToString(ast, options) + ')'
818863
else
819-
return '(' + astToString(ast) + ')'
864+
return '(' + astToString(ast, options) + ')'
820865
}
821866

822867
/**
@@ -938,6 +983,43 @@ function sizeMemoizedAux(
938983
}
939984
}
940985

986+
/**
 * Returns the number of nodes in the syntax tree of `regex`.
 *
 * Every node counts as 1; operator nodes additionally add the counts of
 * their children. Child counts are obtained via `nodeCountAux`, which
 * memoizes them in `cache` keyed by `regex.hash`. A cached count is still
 * added once per occurrence, so sub-expressions that occur several times
 * are counted several times.
 *
 * @param regex - The expression whose nodes are counted.
 * @param cache - Memoization map from node hash to node count. Defaults to
 *   a fresh, empty map.
 * @returns The total node count of the tree rooted at `regex`.
 */
export function nodeCount(
987+
regex: ExtRegex,
988+
cache: Map<number, number> = new Map()
989+
): number {
990+
switch (regex.type) {
991+
// Leaves:
case 'epsilon':
992+
return 1
993+
case 'literal':
994+
return 1
995+
// Binary and unary operators: children plus one for the node itself.
case 'concat':
996+
return nodeCountAux(regex.left, cache) + nodeCountAux(regex.right, cache) + 1
997+
case 'union':
998+
return nodeCountAux(regex.left, cache) + nodeCountAux(regex.right, cache) + 1
999+
case 'star':
1000+
return nodeCountAux(regex.inner, cache) + 1
1001+
case 'intersection':
1002+
return nodeCountAux(regex.left, cache) + nodeCountAux(regex.right, cache) + 1
1003+
case 'complement':
1004+
return nodeCountAux(regex.inner, cache) + 1
1005+
}
1006+
// Reached only if `regex.type` matched none of the cases above.
checkedAllCases(regex)
1007+
}
1008+
1009+
/**
 * Memoizing wrapper around `nodeCount`.
 *
 * Looks up `regex.hash` in `cache`. On a miss it computes the count with
 * `nodeCount`, stores it under that hash and returns it; on a hit it returns
 * the stored count without recomputing.
 *
 * @param regex - The sub-expression to count.
 * @param cache - Memoization map from node hash to node count; mutated on a miss.
 * @returns The node count of the tree rooted at `regex`.
 */
function nodeCountAux(
1010+
regex: ExtRegex,
1011+
cache: Map<number, number>
1012+
): number {
1013+
const cachedResult = cache.get(regex.hash)
1014+
if (cachedResult === undefined) {
1015+
const result = nodeCount(regex, cache)
1016+
cache.set(regex.hash, result)
1017+
return result
1018+
} else {
1019+
return cachedResult
1020+
}
1021+
}
1022+
9411023
// export function equivalent(regex1: ExtRegex, regex2: ExtRegex): boolean {
9421024
// if (equal(regex1, regex2)) {
9431025
// return true

src/table.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { sum } from "./utils"
12

23

34
export type Table<T> = Map<number, Map<number, T>>
@@ -65,3 +66,7 @@ export function fromEntries<A>(items: Iterable<[number, number, A]>): Table<A> {
6566
}
6667
return table
6768
}
69+
70+
/**
 * Returns the total number of entries stored in `table`, i.e. the sizes of
 * all inner maps (rows) combined.
 *
 * NOTE: every call visits each row and builds a temporary array of row
 * sizes, so the cost grows with the number of rows.
 * NOTE(review): `sum` comes from "./utils" and is presumably plain numeric
 * addition over the array; confirm there.
 */
export function size<A>(table: Table<A>): number {
71+
return sum([...table.values()].map(row => row.size))
72+
}

0 commit comments

Comments
 (0)