perf(toStdRegex): reduce generated regex size

gruhn · gruhn · commit e82a94fde026 · 2025-05-21T23:10:04.000+02:00
`toStdRegex` can generate very large expressions, even if called on
an instance that is already a `StdRegex`. Created a new benchmark
"toStdRegex_output_length" to track this. The benchmark runs `toStdRegex`
on a fixed dataset of 1000 regular expressions. The current implementation
cannot get through all instances. On many instances there are
call-stack overflows, out-of-memory errors and non-termination.
A run on the first 500 (smaller) instances gives the following result:

    time             : 5973 ms
    avg. multiplier  : 27
    worst multiplier : 3127

Here avg. multiplier means that on average the output regex was 27 times
larger than the input regex. In the worst recorded case it was 3127 times
larger.

After changing the order in which `dfaToRegex` eliminates states from a
DFA, the benchmark on the first 500 instances improves to:

    time             : 1551 ms
    avg. multiplier  : 2
    worst multiplier : 70

That's much better but we still don't get through all 1000 instances.
Result for the first 750 instances:

    time             : 3672 ms
    avg. multiplier  : 7
    worst multiplier : 962
diff --git a/benchmark/regex_random_unique_no-nested-star_1000.js b/benchmark/regex_random_unique_no-nested-star_1000.js
diff --git a/benchmark/toStdRegex_output_length.js b/benchmark/toStdRegex_output_length.js
@@ -0,0 +1,57 @@
+import fc from 'fast-check'
+import * as RE from '../dist/regex.js'
+import { parse, toStdRegex } from '../dist/low-level-api.js'
+import regexDataset from './regex_random_unique_no-nested-star_1000.js'
+
+// Running mean of the size multiplier over all instances measured so far.
+let avgMult = 0
+// Largest size multiplier observed so far.
+let maxMult = -Infinity
+// Number of instances that actually contributed to `avgMult`. Skipped and
+// failed instances never contribute, so the dataset index must not be used
+// as the sample count.
+let measuredCount = 0
+
+const hardInstances = new Set([
+  290, // call-stack overflow
+  556, // takes very long
+  658, // takes very long
+  689, // call-stack overflow
+  724, // takes very long
+])
+
+/**
+ * Runs `toStdRegex` on a single dataset entry and logs how much longer the
+ * source of the output regex is compared to the source of the input regex.
+ * Updates the module-level `avgMult`, `maxMult` and `measuredCount`.
+ *
+ * @param inputRegExp - dataset entry; its `.source` length is the input size.
+ * @param index - position of `inputRegExp` in the iterated dataset.
+ */
+function run(inputRegExp, index) {
+  // skip some hard early instances:
+  if (hardInstances.has(index)) return
+  // only consider instances up to index 750 for now:
+  if (index > 750) return
+
+  console.log('#' + index, inputRegExp)
+
+  const outputRegex = toStdRegex(parse(inputRegExp))
+  try {
+    const outputRegExp = RE.toRegExp(outputRegex)
+
+    const inp = inputRegExp.source.length
+    const out = outputRegExp.source.length
+    const mult = out/inp
+
+    // Incremental mean over the instances measured so far (not over `index`,
+    // which also counts skipped and failed instances):
+    avgMult = (avgMult*measuredCount + mult)/(measuredCount + 1)
+    measuredCount += 1
+    if (mult > maxMult) {
+      maxMult = mult
+    }
+
+    console.log(`
+      regex input length  : ${inp}
+      regex ouptut length : ${out}
+      multiplier          : ${mult}
+      avg. multiplier     : ${avgMult}
+      worst multiplier    : ${maxMult}
+    `) 
+  } catch (err) {
+    // Any error thrown inside the `try` block is reported with this message
+    // and the instance is left out of the statistics.
+    console.log('too many captures')
+  }
+}
+
+// Wall-clock start of the whole benchmark run.
+const startedAt = performance.now()
+
+// Do short (likely easier) instances first and see how far we get.
+// Note: `sort` reorders the imported dataset array in place.
+const bySourceLengthAscending = (left, right) => left.source.length - right.source.length
+
+regexDataset.sort(bySourceLengthAscending).forEach(run)
+
+console.log('time:', performance.now() - startedAt)
diff --git a/src/dfa.ts b/src/dfa.ts
@@ -94,25 +94,65 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
       graph
     )
   }
+ 
+  // All states except `newStartState` and `newFinalState` need to be eliminated.
+  // After that, the only remaining transition is between `newStartState` and
+  // `newFinalState` and is labeled with the result regex.
+  // Thus, we put all these states in a worklist to be iteratively eliminated.
+  // Ripping out states with small in/out-degree earlier can result in smaller expressions.
+  // For example:
+  //                                 b            d
+  //                            +---------(s2)---------+
+  //                 a         /                        \
+  //      (s0) ------------- (s1)                      (s4)
+  //                           \     c            e     /
+  //                            +---------(s3)---------+
+  //                         
+  // Ripping states in the order s2, s3, s1 produces:
+  // 
+  //                           a(bd|ce)
+  //      (s0) --------------------------------------- (s4)
+  // 
+  // Ripping states in the order s1, s2, s3 produces:
+  // 
+  //                           (abd)|(ace)
+  //      (s0) --------------------------------------- (s4)
+  // 
+  // Thus, we sort the worklist by degree. Note, that the degree of nodes changes during
+  // the later iteration so it can still be that nodes with higher degree are sometimes
+  // ripped out first. However, keeping the worklist sorted at the same time also has a  
+  // cost. Maybe this can be improved by choosing some heap structure:
+  const worklist = [...dfa.allStates.keys()]
+    // Avoid constantly re-computing degree during sorting by computing it once in a first pass:
+    .map(state => ({ state, degree: Graph.degree(state, graph)}))
+    // Sort states by degree:
+    .sort((a,b) => a.degree - b.degree)
+    // Throw the degree away again after sorting:
+    .map(({ state }) => state)
 
-  for (const state of dfa.allStates.keys()) {
-    const result = Graph.ripNode(state, graph)
-
-    for (const [pred, predLabel] of result.predecessors) {
-      for (const [succ, succLabel] of result.successors) {
-        const transitiveLabel = RE.seq([
-          predLabel,
-          RE.star(result.selfLoop ?? RE.epsilon),
-          succLabel,
-        ])
-
-        Graph.setEdge(
-          pred, 
-          succ,
-          transitiveLabel,
-          graph, 
-          RE.union,
-        )
+  while (true) {
+    const state = worklist.shift()
+    if (state === undefined) {
+      break
+    } else {
+      const result = Graph.ripNode(state, graph)
+      for (const [pred, predLabel] of result.predecessors) {
+        for (const [succ, succLabel] of result.successors) {
+          const transitiveLabel = RE.seq([
+            predLabel,
+            RE.star(result.selfLoop ?? RE.epsilon),
+            succLabel,
+          ])
+          Graph.setEdge(
+            pred, 
+            succ,
+            transitiveLabel,
+            graph, 
+            // Flipping the arguments avoids that the associativity rewrite rule of `union`
+            // keeps getting triggered. This makes a significant performance difference:
+            (oldValue, newValue) => RE.union(newValue, oldValue),
+          )
+        }
       }
     }
   }
diff --git a/src/graph.ts b/src/graph.ts
@@ -41,6 +41,13 @@ export function outDegree<A>(node: number, graph: Graph<A>): number {
     return succs.size
 }
 
+/**
+ * Total degree of `node` in `graph`: the sum of what `inDegree` and
+ * `outDegree` report for it. Per the original note a self-loop is not
+ * counted; that depends on how those two helpers treat self-loops.
+ */
+export function degree<A>(node: number, graph: Graph<A>): number {
+  const incoming = inDegree(node, graph)
+  const outgoing = outDegree(node, graph)
+  return incoming + outgoing
+}
+
 export type RipNodeResult<A> = {
   predecessors: [number, A][]
   selfLoop: A | undefined
diff --git a/src/utils.ts b/src/utils.ts
@@ -146,3 +146,16 @@ export function hashStr(str: string, seed = 0): number {
 export function xor(a: number, b: number): number {
   return a^b
 }
+
+/**
+ * Returns the item of `iterable` with the smallest score.
+ *
+ * `scoreOf` is called exactly once per item. On ties the first item with the
+ * minimal score wins. Only scores strictly below `Infinity` are considered.
+ *
+ * @param iterable - items to search.
+ * @param scoreOf - maps an item to its numeric score.
+ * @returns the minimal item, or `undefined` if `iterable` is empty or no
+ *   score is strictly below `Infinity` (e.g. all scores are `NaN`).
+ */
+export function minBy<T>(iterable: Iterable<T>, scoreOf: (item: T) => number): T | undefined {
+  let minItem: T | undefined = undefined
+  let minScore = Infinity
+  for (const item of iterable) {
+    const score = scoreOf(item)
+    // Compare the cached score instead of evaluating `scoreOf` a second time.
+    if (score < minScore) {
+      minItem = item
+      minScore = score
+    }
+  }
+  return minItem
+}
diff --git a/test/arbitrary-regex.ts b/test/arbitrary-regex.ts
@@ -56,6 +56,18 @@ export function stdRegexNoStar(size = 100): fc.Arbitrary<RE.StdRegex> {
     )
 }
 
+/**
+ * Arbitrary for standard regexes without nested stars (going by the name and
+ * by the `star` case drawing its operand from `stdRegexNoStar`).
+ *
+ * `size` bounds the recursion: it is halved (rounded down) at every level,
+ * and a plain `literal()` is returned once it is zero or below.
+ */
+export function stdRegexNoNestedStar(size = 100): fc.Arbitrary<RE.StdRegex> {
+  if (size <= 0) {
+    return literal()
+  }
+
+  const childSize = Math.floor(size/2)
+  return fc.oneof(
+    // The operand of a star is generated star-free:
+    star(() => stdRegexNoStar(childSize)),
+    concat(() => stdRegexNoNestedStar(childSize)),
+    union(() => stdRegexNoNestedStar(childSize)),
+    literal(),
+  )
+}
+
 export function stdRegexString(): fc.Arbitrary<string> {
   return stdRegex().map(RE.toString)
 }
diff --git a/test/regex.spec.ts b/test/regex.spec.ts
@@ -199,14 +199,3 @@ describe('rewrite rules', () => {
   })
   
 })
-
-// describe('equivalent', () => {
-//   it('every regex is equivalent to itself', () => {
-//     fc.assert(
-//       fc.property(extRegex(), (tree) => {
-//         expect(RegexTree.equivalent(tree, tree)).toBe(true)
-//       })
-//     )
-//   })
-// })
-

Original file line number	Diff line number	Diff line change
`@@ -56,6 +56,18 @@ export function stdRegexNoStar(size = 100): fc.Arbitrary<RE.StdRegex> {`
`56`	`56`	`)`
`57`	`57`	`}`
`58`	`58`
	`59`	`+export function stdRegexNoNestedStar(size = 100): fc.Arbitrary<RE.StdRegex> {`
	`60`	`+ if (size <= 0)`
	`61`	`+ return literal()`
	`62`	`+ else`
	`63`	`+ return fc.oneof(`
	`64`	`+ star(() => stdRegexNoStar(Math.floor(size/2))),`
	`65`	`+ concat(() => stdRegexNoNestedStar(Math.floor(size/2))),`
	`66`	`+ union(() => stdRegexNoNestedStar(Math.floor(size/2))),`
	`67`	`+ literal(),`
	`68`	`+ )`
	`69`	`+}`
	`70`	`+`
`59`	`71`	`export function stdRegexString(): fc.Arbitrary<string> {`
`60`	`72`	`return stdRegex().map(RE.toString)`
`61`	`73`	`}`