feat: explicit/catchable errors

gruhn · gruhn · commit 14d330b9955a · 2025-05-27T20:32:56.000+02:00
DFA construction can blow up in multiple ways that users can't handle,
such as OOM kills, non-termination, and call stack overflows. Instead, we
should throw explicit errors in advance.

1) We now throw an error if the caches used during derivative class
computations grow too large. This directly protects against OOM kills but
also implicitly protects against non-termination and stack overflows.
It seems the cache size is a good proxy measure for state space
explosion.

2) Converting a very large regex to `RegExp` can also take very long
and may fail because the `RegExp` constructor rejects inputs with too
many capturing groups. Although this error can already be caught, it can
take a while until it is thrown. Thus, we throw a custom error early if
the syntax tree is very large.
diff --git a/benchmark/regex_handwritten.js b/benchmark/regex_handwritten.js
@@ -0,0 +1,18 @@
+import fs from 'fs'
+
+function* readHandWrittenDataset() {
+  const jsonStr = fs.readFileSync('./benchmark/regex-dataset.json', 'utf-8')
+
+  for (const item of JSON.parse(jsonStr)) {
+    if (item.flavor === "javascript" && item.flags === "") {
+      try {
+        yield new RegExp(item.regex) // , item.flags)
+      } catch (e) {
+        console.warn('regex dataset: skipping invalid regex')
+      }
+    }
+  }
+}
+
+export default [...readHandWrittenDataset()]
+
diff --git a/benchmark/toStdRegex_output_length.js b/benchmark/toStdRegex_output_length.js
@@ -1,57 +1,77 @@
 import fc from 'fast-check'
 import * as RE from '../dist/regex.js'
+import { ParseError } from '../dist/parser.js'
+import { UnsupportedSyntaxError } from '../dist/regex-parser.js'
 import { parse, toStdRegex } from '../dist/low-level-api.js'
-import regexDataset from './regex_random_unique_no-nested-star_1000.js'
+import { regexToDFA } from '../dist/dfa.js'
+import randomRegexDataset from './regex_random_unique_no-nested-star_1000.js'
+import handwrittenRegexDataset from './regex_handwritten.js'
+
+const fullRegexDataset = [
+  ...randomRegexDataset,
+  ...handwrittenRegexDataset,
+] 
+
 
 let avgMult = 0
 let maxMult = -Infinity
 
-const hardInstances = new Set([
-  290, // call-stack overflow
-  556, // takes very long
-  658, // takes very long
-  689, // call-stack overflow
-  724, // takes very long
-])
-
 function run(inputRegExp, index) {
-  // skip some hard early instances:
-  if (hardInstances.has(index)) return
-  // only consider first 800 instances for now:
-  if (index > 750) return
-
   console.log('#' + index, inputRegExp)
+  const startTime = performance.now() // NOTE(review): never read in this function
 
-  const outputRegex = toStdRegex(parse(inputRegExp))
-  try {
-    const outputRegExp = RE.toRegExp(outputRegex)
+  const inputRegex = parse(inputRegExp) // this and the next two calls are no longer wrapped in try/catch here; the caller handles what they throw
+  const outputRegex = toStdRegex(inputRegex)
+  const outputRegExp = RE.toRegExp(outputRegex)
 
-    const inp = inputRegExp.source.length
-    const out = outputRegExp.source.length
-    const mult = out/inp
+  const inp = inputRegExp.source.length
+  const out = outputRegExp.source.length
+  const mult = out/inp // ratio of output source length to input source length
 
-    avgMult = (avgMult*index + mult)/(index+1)
-    if (mult > maxMult) {
-      maxMult = mult
-    }
-
-    console.log(`
-      regex input length  : ${inp}
-      regex ouptut length : ${out}
-      multiplier          : ${mult}
-      avg. multiplier     : ${avgMult}
-      worst multiplier    : ${maxMult}
-    `) 
-  } catch (err) {
-    console.log('too many captures')
+  avgMult = (avgMult*index + mult)/(index+1) // NOTE(review): weights by dataset index, but instances that threw above never reach this line, so the average is skewed after the first failure
+  if (mult > maxMult) {
+    maxMult = mult
   }
+
+  console.log(`
+    regex input length  : ${inp}
+    regex ouptut length : ${out}
+    multiplier          : ${mult}
+    avg. multiplier     : ${avgMult}
+    worst multiplier    : ${maxMult}
+  `) 
 }
 
-const timeStart = performance.now()
+let parseError = 0
+let cacheOverflow = 0
+let veryLargeSyntaTree = 0
+let stackOverflow = 0
+let regexSyntaxError = 0
 
-regexDataset
-  // do short (likely easier) instances first and see how far we get:
-  .sort((a,b) => a.source.length - b.source.length)
-  .forEach(run)
+fullRegexDataset.forEach((regex, i) => {
+  try {
+    run(regex, i)
+  } catch (e) {
+    if (e instanceof ParseError || e instanceof UnsupportedSyntaxError) {
+      parseError++
+    } else if (e instanceof RE.CacheOverflowError) {
+      cacheOverflow++
+    } else if (e instanceof RE.VeryLargeSyntaxTreeError) {
+      veryLargeSyntaTree++
+    } else if (e instanceof RangeError) {
+      stackOverflow++
+    } else if (e instanceof SyntaxError) {
+      regexSyntaxError++
+    } else {
+      throw e
+    }
+  }
+})
 
-console.log('time:', performance.now() - timeStart)
+console.debug('failed instances: ', {
+  parseError,
+  cacheOverflow,
+  veryLargeSyntaTree,
+  stackOverflow,
+  regexSyntaxError
+})
diff --git a/src/dfa.ts b/src/dfa.ts
@@ -48,7 +48,9 @@ function regexToDFA(regex: RE.ExtRegex): DFA {
           transitions,
         )
         worklist.push(targetState)
-        // console.debug('state count: ', allStates.size)
+        // if (allStates.size % 100 === 0) {
+        //   console.debug({ stateCount: allStates.size })
+        // }
       } else {
         Table.set(
           sourceState.hash,
@@ -179,10 +181,10 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
 }
 
 // TODO: can this round-trip through DFA construction be avoided?
-export function toStdRegex(regex: RE.ExtRegex): RE.StdRegex {
-  const dfa = regexToDFA(regex)
-  // console.debug('dfa done')
-  return dfaToRegex(dfa)
+export function toStdRegex(inputRegex: RE.ExtRegex): RE.StdRegex { // converts an extended regex to a standard one by round-tripping through a DFA
+  const dfa = regexToDFA(inputRegex)
+  const outputRegex = dfaToRegex(dfa)
+  return outputRegex
 }
 
 // function printTrans(trans: Table.Table<CharSet.CharSet>) {
diff --git a/src/regex-parser.ts b/src/regex-parser.ts
@@ -50,6 +50,8 @@ const codePoint = singleChar.map(char => {
   return result
 })
 
+/** Thrown by the regex parser for syntax it recognizes but does not support. */
+export class UnsupportedSyntaxError extends Error {
+  constructor(message?: string) {
+    super(message)
+    // Without this, `name` is inherited from `Error` and logs show a generic "Error".
+    this.name = 'UnsupportedSyntaxError'
+  }
+}
+
 const escapeSequence = P.string('\\').andThen(_ => P.anyChar).map(escapedChar => {
   switch (escapedChar) {
     case 'w': return CharSet.wordChars
@@ -64,12 +66,12 @@ const escapeSequence = P.string('\\').andThen(_ => P.anyChar).map(escapedChar =>
     case 'v': return CharSet.singleton('\v') // vertical tab
     case 'f': return CharSet.singleton('\f') // form feed
     case '0': return CharSet.singleton('\0') // NUL character
-    case 'b': throw new Error('\b word-boundary assertion not supported')
-    case 'c': throw new Error('\cX control characters not supported')
-    case 'x': throw new Error('\\x not supported')
-    case 'u': throw new Error('\\u not supported')
-    case 'p': throw new Error('\\p not supported')
-    case 'P': throw new Error('\\P not supported')
+    case 'b': throw new UnsupportedSyntaxError('\\b word-boundary assertion not supported') // backslash escaped so the message shows a literal "\b" instead of a backspace character
+    case 'c': throw new UnsupportedSyntaxError('\\cX control characters not supported') // an unescaped "\c" would collapse to a plain "c"
+    case 'x': throw new UnsupportedSyntaxError('\\x not supported')
+    case 'u': throw new UnsupportedSyntaxError('\\u not supported')
+    case 'p': throw new UnsupportedSyntaxError('\\p not supported')
+    case 'P': throw new UnsupportedSyntaxError('\\P not supported')
     default: return CharSet.singleton(escapedChar) // match character literally
   }
 })
diff --git a/src/regex.ts b/src/regex.ts
@@ -471,6 +471,8 @@ export function isEmpty(regex: ExtRegex): boolean {
   return regex.type === 'literal' && CharSet.isEmpty(regex.charset)
 }
 
+/** Thrown when a cache used while computing derivatives or derivative classes reaches its size limit. */
+export class CacheOverflowError extends Error {
+  constructor(message?: string) {
+    super(message)
+    // Without this, `name` is inherited from `Error` and logs show a generic "Error".
+    this.name = 'CacheOverflowError'
+  }
+}
+
 export function codePointDerivative(codePoint: number, regex: StdRegex, cache: Table.Table<StdRegex>): StdRegex
 export function codePointDerivative(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex
 export function codePointDerivative(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex {
@@ -521,6 +523,13 @@ function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table
 function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex {
   const cachedResult = Table.get(codePoint, regex.hash, cache)
   if (cachedResult === undefined) {
+    // Rather throw an error when cache grows too large than getting OOM killed.
+    // At least errors can be caught and handled. The limit is somewhat arbitrary.
+    // TODO: maybe make this user configurable:
+    if (Table.size(cache) >= 10_000) {
+      throw new CacheOverflowError('Cache overflow while computing DFA transitions.')
+    }
+
     const result = codePointDerivative(codePoint, regex, cache)
     Table.set(codePoint, regex.hash, result, cache)
     return result
@@ -608,6 +617,13 @@ function allNonEmptyIntersections(
     return resultCached
   }
 
+  // Rather throw an error when cache grows too large than getting OOM killed.
+  // At least errors can be caught and handled. The limit is somewhat arbitrary.
+  // TODO: maybe make this user configurable:
+  if (Table.size(cache) >= 10_000) {
+    // Carry a message so a caught or logged error identifies where the limit was hit.
+    throw new CacheOverflowError('Cache overflow while intersecting derivative classes.')
+  }
+
   const result: CharSet.CharSet[] = []
   for (const classA of classesA) {
     for (const classB of classesB) {
@@ -668,12 +684,20 @@ export function derivativeClasses(
   }  
   checkedAllCases(regex)
 }
+
 function derivativeClassesAux(
   regex: ExtRegex,
   cache: DerivativeClassesCache
 ) {
   const cachedResult = cache.classes.get(regex.hash)
   if (cachedResult === undefined) {
+    // Rather throw an error when cache grows too large than getting OOM killed.
+    // At least errors can be caught and handled. The limit is somewhat arbitrary.
+    // TODO: maybe make this user configurable:
+    if (cache.classes.size >= 10_000) {
+      // Carry a message so a caught or logged error identifies where the limit was hit.
+      throw new CacheOverflowError('Cache overflow while computing derivative classes.')
+    }
+
     const result = derivativeClasses(regex, cache)
     cache.classes.set(regex.hash, result)
     return result
@@ -687,6 +711,8 @@ function derivativeClassesAux(
 ///// exclusive standard regex utils     /////
 //////////////////////////////////////////////
 
+/** Thrown when a syntax tree is too large to be rendered as `RegExp` source text. */
+export class VeryLargeSyntaxTreeError extends Error {
+  constructor(message?: string) {
+    super(message)
+    // Without this, `name` is inherited from `Error` and logs show a generic "Error".
+    this.name = 'VeryLargeSyntaxTreeError'
+  }
+}
+
 /**
  * TODO: docs
  * 
@@ -697,7 +723,20 @@ export function toRegExp(regex: StdRegex): RegExp {
 }
 
 export function toString(regex: ExtRegex): string {
-  return '^(' + astToString(toRegExpAST(regex)) + ')$'
+  const size = nodeCount(regex) // node count of the input syntax tree; drives both checks below
+  if (size > 1_000_000) {
+    throw new VeryLargeSyntaxTreeError(
+      "Won't try to convert to RegExp. Syntax tree has over 1_000_000 nodes."
+    )
+  }
+
+  // Render parentheses as non-capturing groups if there is a large number of them,
+  // i.e. `/(?:abc)/` instead of `/(abc)/`. `new RegExp(...)` throws an error if there
+  // is a large number of capturing groups. Non-capturing groups are a bit more verbose
+  // but at large sizes like this it doesn't matter anyway:
+  const useNonCapturingGroups = size > 10_000
+
+  return '^(' + astToString(toRegExpAST(regex), { useNonCapturingGroups }) + ')$'
 }
 
 // TODO: information is duplicated in parser:
@@ -786,37 +825,43 @@ function toRegExpAST(regex: ExtRegex): RegExpAST {
   checkedAllCases(regex)
 }
 
-function astToString(ast: RegExpAST): string {
+type RenderOptions = { // rendering options passed through astToString and maybeWithParens
+  useNonCapturingGroups: boolean // when true, parentheses are emitted as `(?:...)` instead of `(...)`
+}
+
+function astToString(ast: RegExpAST, options: RenderOptions): string { // renders the AST as regex source text; `options` is passed unchanged to every recursive call
   switch (ast.type) {
     case 'epsilon':
       return ''
     case 'literal':
       return CharSet.toString(ast.charset)
     case 'concat':
-      return maybeWithParens(ast.left, ast) + maybeWithParens(ast.right, ast)
+      return maybeWithParens(ast.left, ast, options) + maybeWithParens(ast.right, ast, options)
     case 'union': 
-      return maybeWithParens(ast.left, ast) + '|' + maybeWithParens(ast.right, ast)   
+      return maybeWithParens(ast.left, ast, options) + '|' + maybeWithParens(ast.right, ast, options)   
     case 'star':
-      return maybeWithParens(ast.inner, ast) + '*'
+      return maybeWithParens(ast.inner, ast, options) + '*'
     case 'plus':
-      return maybeWithParens(ast.inner, ast) + '+'
+      return maybeWithParens(ast.inner, ast, options) + '+'
     case 'optional':
-      return maybeWithParens(ast.inner, ast) + '?'
+      return maybeWithParens(ast.inner, ast, options) + '?'
     case 'boundedQuantifier':
-      return maybeWithParens(ast.inner, ast) + '{' + ast.count + '}'
+      return maybeWithParens(ast.inner, ast, options) + '{' + ast.count + '}'
     case 'complement':
-      return '¬' + maybeWithParens(ast.inner, ast)
+      return '¬' + maybeWithParens(ast.inner, ast, options) // NOTE(review): '¬' is not RegExp syntax — presumably only reached for extended regexes; confirm
     case 'intersection':
-      return maybeWithParens(ast.left, ast) + '∩' + maybeWithParens(ast.right, ast)
+      return maybeWithParens(ast.left, ast, options) + '∩' + maybeWithParens(ast.right, ast, options) // NOTE(review): '∩' is not RegExp syntax either — confirm intended use
   }
   checkedAllCases(ast)
 }
 
-function maybeWithParens(ast: RegExpAST, parent: RegExpAST): string {
+function maybeWithParens(ast: RegExpAST, parent: RegExpAST, options: RenderOptions): string { // renders `ast`, parenthesized unless it has the parent's type or a higher precedence level
   if (ast.type === parent.type || precLevel(ast.type) > precLevel(parent.type)) 
-    return astToString(ast)
+    return astToString(ast, options)
+  else if (options.useNonCapturingGroups)
+    return '(?:' + astToString(ast, options) + ')' // non-capturing group
   else
-    return '(' + astToString(ast) + ')'
+    return '(' + astToString(ast, options) + ')' // capturing group
 }
 
 /**
@@ -938,6 +983,43 @@ function sizeMemoizedAux(
   }
 }
 
+/**
+ * Counts the nodes of the syntax tree of `regex`.
+ *
+ * Every node contributes 1 plus the counts of its children. Child counts are
+ * obtained through `nodeCountAux`, which receives the same `cache`.
+ */
+export function nodeCount(
+  regex: ExtRegex,
+  cache: Map<number, number> = new Map()
+): number {
+  const countOf = (child: ExtRegex): number => nodeCountAux(child, cache)
+
+  switch (regex.type) {
+    // leaves
+    case 'epsilon':
+    case 'literal':
+      return 1
+    // binary operators
+    case 'concat':
+    case 'union':
+    case 'intersection':
+      return countOf(regex.left) + countOf(regex.right) + 1
+    // unary operators
+    case 'star':
+    case 'complement':
+      return countOf(regex.inner) + 1
+  }
+  checkedAllCases(regex)
+}
+
+/**
+ * Memoizing wrapper around `nodeCount`: returns the count stored in `cache`
+ * under `regex.hash`, computing and storing it first on a miss.
+ */
+function nodeCountAux(
+  regex: ExtRegex,
+  cache: Map<number, number>
+): number {
+  const known = cache.get(regex.hash)
+  if (known !== undefined) {
+    return known
+  }
+
+  const computed = nodeCount(regex, cache)
+  cache.set(regex.hash, computed)
+  return computed
+}
+
 // export function equivalent(regex1: ExtRegex, regex2: ExtRegex): boolean {
 //   if (equal(regex1, regex2)) {
 //     return true
diff --git a/src/table.ts b/src/table.ts
@@ -1,3 +1,4 @@
+import { sum } from "./utils"
 
 
 export type Table<T> = Map<number, Map<number, T>>
@@ -65,3 +66,7 @@ export function fromEntries<A>(items: Iterable<[number, number, A]>): Table<A> {
   }
   return table
 }
+
+/** Total number of entries in `table`, i.e. the sum of the sizes of all its rows. */
+export function size<A>(table: Table<A>): number {
+  const rowSizes = Array.from(table.values(), row => row.size)
+  return sum(rowSizes)
+}
diff --git a/src/utils.ts b/src/utils.ts

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+import { sum } from "./utils"`
`1`	`2`
`2`	`3`
`3`	`4`	`export type Table<T> = Map<number, Map<number, T>>`
`@@ -65,3 +66,7 @@ export function fromEntries<A>(items: Iterable<[number, number, A]>): Table<A> {`
`65`	`66`	`}`
`66`	`67`	`return table`
`67`	`68`	`}`
	`69`	`+`
	`70`	`+export function size<A>(table: Table<A>): number {`
	`71`	`+ return sum([...table.values()].map(row => row.size))`
	`72`	`+}`