fix: invalid union rewrite rule

gruhn · gruhn · commit 8fab4ebd6ea9 · 2025-05-28T23:49:53.000+02:00
Generate `star` in every test now, since blow-up can be caught and
handled. With that, discovered an invalid rewrite rule for `union`.
The rule would rewrite:

   a{2}|a*  -->  a(a|a*)

The result does not match the empty string anymore.
diff --git a/benchmark/toStdRegex_output_length.js b/benchmark/toStdRegex_output_length.js
@@ -3,18 +3,15 @@ import * as RE from '../dist/regex.js'
 import { ParseError } from '../dist/parser.js'
 import { UnsupportedSyntaxError } from '../dist/regex-parser.js'
 import { parse, toStdRegex } from '../dist/low-level-api.js'
-import { regexToDFA } from '../dist/dfa.js'
 import randomRegexDataset from './regex_random_unique_no-nested-star_1000.js'
 import handwrittenRegexDataset from './regex_handwritten.js'
 
 const fullRegexDataset = [
   ...randomRegexDataset,
   ...handwrittenRegexDataset,
-] 
+]
 
-
-let avgMult = 0
-let maxMult = -Infinity
+const mults = []
 
 function run(inputRegExp, index) {
   console.log('#' + index, inputRegExp)
@@ -27,18 +24,12 @@ function run(inputRegExp, index) {
   const inp = inputRegExp.source.length
   const out = outputRegExp.source.length
   const mult = out/inp
-
-  avgMult = (avgMult*index + mult)/(index+1)
-  if (mult > maxMult) {
-    maxMult = mult
-  }
+  mults.push(mult)
 
   console.log(`
     regex input length  : ${inp}
     regex ouptut length : ${out}
     multiplier          : ${mult}
-    avg. multiplier     : ${avgMult}
-    worst multiplier    : ${maxMult}
   `) 
 }
 
@@ -75,3 +66,22 @@ console.debug('failed instances: ', {
   stackOverflow,
   regexSyntaxError
 })
+
+// Summary statistics over the length multipliers (output length divided
+// by input length) that `run` pushed into `mults`.
+const mean = mults.reduce((a,b) => a+b, 0) / mults.length
+// The median is only meaningful on sorted data. Sort a copy numerically
+// (the default sort compares elements as strings) and average the two
+// middle values when the number of samples is even.
+const sorted = [...mults].sort((a,b) => a-b)
+const mid = Math.floor(sorted.length / 2)
+const median = sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid-1] + sorted[mid]) / 2
+const worst = mults.reduce((a,b) => Math.max(a,b), -Infinity)
+
+console.log(`
+multipliers:
+  mean   : ${mean}
+  median : ${median}
+  max    : ${worst}
+`) 
+
diff --git a/src/dfa.ts b/src/dfa.ts
@@ -134,7 +134,7 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
     .map(state => ({ state, degree: Graph.degree(state, graph)}))
     // Sort states by degree:
     .sort((a,b) => a.degree - b.degree)
-    // Through degree away again after sorting:
+    // Throw degree away again after sorting:
     .map(({ state }) => state)
 
   while (true) {
@@ -183,13 +183,16 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
 // TODO: can this round-trip through DFA construction be avoided?
 export function toStdRegex(inputRegex: RE.ExtRegex): RE.StdRegex {
   const dfa = regexToDFA(inputRegex)
+  // printTrans(dfa)
   const outputRegex = dfaToRegex(dfa)
   return outputRegex
 }
 
-// function printTrans(trans: Table.Table<CharSet.CharSet>) {
+// function printTrans(dfa: DFA) {
+//   console.debug({ start: dfa.startState })
+//   console.debug({ final: dfa.finalStates })
 //   console.debug('=========trans===========')
-//   for (const [source, succs] of trans.entries()) {
+//   for (const [source, succs] of dfa.transitions.entries()) {
 //     for (const [target, label] of succs) {
 //       console.debug(source, target, new RegExp(CharSet.toString(label)))
 //       // console.debug(source, target, RE.toString(label))
diff --git a/src/regex.ts b/src/regex.ts
@@ -124,7 +124,7 @@ function extractFront(regex: ExtRegex): [ExtRegex, ExtRegex] {
     case 'literal': return [regex, epsilon]
     case 'concat': return [regex.left, regex.right]
     case 'union': return [regex, epsilon]
-    case 'star': return [regex.inner, regex]
+    case 'star': return [regex, epsilon]
     case 'intersection': return [regex, epsilon]
     case 'complement': return [regex, epsilon]
   }
@@ -139,7 +139,7 @@ function extractBack(regex: ExtRegex): [ExtRegex, ExtRegex] {
     case 'literal': return [epsilon, regex]
     case 'concat': return [regex.left, regex.right]
     case 'union': return [epsilon, regex]
-    case 'star': return [regex, regex.inner]
+    case 'star': return [epsilon, regex]
     case 'intersection': return [epsilon, regex]
     case 'complement': return [epsilon, regex]
   }
diff --git a/test/arbitrary-regex.ts b/test/arbitrary-regex.ts
@@ -1,7 +1,6 @@
 import fc from 'fast-check'
 import * as RE from '../src/regex'
 import * as CharSet from '../src/char-set'
-import { checkedAllCases } from '../src/utils'
 
 // TODO: try larger alphabet:
 export function charSet(): fc.Arbitrary<CharSet.CharSet> {
@@ -38,33 +37,10 @@ export function stdRegex(size = 100): fc.Arbitrary<RE.StdRegex> {
     return literal()
   else
     return fc.oneof(
-      star(() => stdRegex(Math.floor(size/2))),
-      concat(() => stdRegex(Math.floor(size/2))),
-      union(() => stdRegex(Math.floor(size/2))),
-      literal(),
-    )
-}
-
-export function stdRegexNoStar(size = 100): fc.Arbitrary<RE.StdRegex> {
-  if (size <= 0)
-    return literal()
-  else
-    return fc.oneof(
-      concat(() => stdRegexNoStar(Math.floor(size/2))),
-      union(() => stdRegexNoStar(Math.floor(size/2))),
-      literal(),
-    )
-}
-
-export function stdRegexNoNestedStar(size = 100): fc.Arbitrary<RE.StdRegex> {
-  if (size <= 0)
-    return literal()
-  else
-    return fc.oneof(
-      star(() => stdRegexNoStar(Math.floor(size/2))),
-      concat(() => stdRegexNoNestedStar(Math.floor(size/2))),
-      union(() => stdRegexNoNestedStar(Math.floor(size/2))),
-      literal(),
+      { arbitrary: literal(), weight: 5 },
+      { arbitrary: concat(() => stdRegex(Math.floor(size/2))), weight: 3 },
+      { arbitrary: union(() => stdRegex(Math.floor(size/2))), weight: 3 },
+      { arbitrary: star(() => stdRegex(Math.floor(size/2))), weight: 1 },
     )
 }
 
diff --git a/test/low-level-api.spec.ts b/test/low-level-api.spec.ts
@@ -1,53 +1,72 @@
 import fc from "fast-check"
 import { describe, it, expect, test } from "vitest"
-import { isEmpty } from '../src/regex'
+import { CacheOverflowError, isEmpty, VeryLargeSyntaxTreeError } from '../src/regex'
 import * as RE from "../src/low-level-api"
 import * as Arb from './arbitrary-regex'
-import * as Stream from '../src/stream'
-import { assert } from "../src/utils"
 
 /**
  * Stochastically verifies that `regex1` is a subset of `regex2`.
  * It samples a bunch of matches from `regex1` and checks whether
  * they match `regex2` as well. If a mismatch is found it is returned.
- * Otherwise, `true` is returned.
+ * Otherwise, `undefined` is returned.
  */
-function isSubsetOf(regex1: RE.StdRegex, regex2: RE.StdRegex, maxSamples = 30): true | string {
-  const re2 = RE.toRegExp(regex2)
-
+function expectSubsetOf(regex1: RE.StdRegex, regex2: RE.StdRegex, maxSamples = 30) {
+  const re2 = toRegExp_ignoreBlowUp(regex2)
   for (const match1 of RE.enumerate(regex1).take(maxSamples)) {
-    if (!re2.test(match1)) {
-      return match1
-    }
+    expect(match1).toMatch(re2)
+  }
+}
+
+function toRegExp_ignoreBlowUp(regex: RE.StdRegex) {
+  try {
+    return RE.toRegExp(regex)
+  } catch (e) {
+    if (e instanceof VeryLargeSyntaxTreeError) {
+      console.warn(e)
+      fc.pre(false)
+    } else {
+      throw e
+    }     
   }
+}
 
-  return true
+function toStdRegex_ignoreBlowUp(regex: RE.ExtRegex) {
+  try {
+    return RE.toStdRegex(regex)
+  } catch (e) {
+    if (e instanceof CacheOverflowError) {
+      console.warn(e)
+      fc.pre(false)
+    } else {
+      throw e
+    }     
+  }
 }
 
 describe('toStdRegex', () => {
 
   it('is idempotent on StdRegex', () => {
     fc.assert(
       fc.property(
-        // FIXME: `star` often leads to exponential blow up.
-        Arb.stdRegexNoStar(),
+        Arb.stdRegex(),
         inputRegex => {
-          const outputRegex = RE.toStdRegex(inputRegex)
-          expect(isSubsetOf(inputRegex, outputRegex)).toBe(true)
-          expect(isSubsetOf(outputRegex, inputRegex)).toBe(true)
+          const outputRegex = toStdRegex_ignoreBlowUp(inputRegex)
+          expectSubsetOf(inputRegex, outputRegex)
+          expectSubsetOf(outputRegex, inputRegex)
         }
       ),
+      { numRuns: 100, maxSkipsPerRun: 100 }
     )
-  })
+  }, 10_000)
 
 })
 
 test('A ∩ ¬A = ∅', () => {
   fc.assert(
     fc.property(
-      Arb.stdRegexNoStar(),
+      Arb.stdRegex(),
       regexA => {
-        const outputRegex = RE.toStdRegex(
+        const outputRegex = toStdRegex_ignoreBlowUp(
           RE.and([regexA, RE.not(regexA)])
         )
         expect(isEmpty(outputRegex)).toBe(true)
@@ -59,14 +78,14 @@ test('A ∩ ¬A = ∅', () => {
 test('B ⊆ (A ∪ B) ∩ (B ∪ C)', () => {
   fc.assert(
     fc.property(
-      Arb.stdRegexNoStar(),
-      Arb.stdRegexNoStar(),
-      Arb.stdRegexNoStar(),
+      Arb.stdRegex(),
+      Arb.stdRegex(),
+      Arb.stdRegex(),
       (regexA, regexB, regexC) => {
         const unionAB = RE.or([regexA, regexB])
         const unionBC = RE.or([regexB, regexC])
-        const interRegex = RE.toStdRegex(RE.and([unionAB, unionBC]))
-        expect(isSubsetOf(regexB, interRegex)).toBe(true)
+        const interRegex = toStdRegex_ignoreBlowUp(RE.and([unionAB, unionBC]))
+        expectSubsetOf(regexB, interRegex)
       }
     ),
   )   
@@ -76,10 +95,10 @@ test('intersection with regex /^.{N}$/ has only words of length N', () => {
   fc.assert(
     fc.property(
       fc.nat({ max: 10 }),
-      Arb.stdRegexNoStar(),
+      Arb.stdRegex(),
       (length, regexA) => {
         const regexB = RE.repeat(RE.anySingleChar, length)
-        const interAB = RE.toStdRegex(RE.and([regexA, regexB]))
+        const interAB = toStdRegex_ignoreBlowUp(RE.and([regexA, regexB]))
 
         for (const word of RE.enumerate(interAB).take(100)) {
           expect(word).toHaveLength(length)
diff --git a/test/regex.spec.ts b/test/regex.spec.ts
@@ -1,13 +1,25 @@
 import fc from "fast-check"
-import { describe, it, expect, test } from "vitest"
+import { describe, it, expect } from "vitest"
 import * as RE from "../src/regex"
 import * as DFA from '../src/dfa'
 import * as Arb from './arbitrary-regex'
 import * as Stream from '../src/stream'
 import * as CharSet from '../src/char-set'
-import { toRegExp } from "../src/regex"
 import { parseRegExp } from "../src/regex-parser"
 
+
+function toStdRegex_ignoreBlowUp(regex: RE.ExtRegex) {
+  try {
+    return DFA.toStdRegex(regex)
+  } catch (e) {
+    if (e instanceof RE.CacheOverflowError) {
+      fc.pre(false)
+    } else {
+      throw e
+    }     
+  }
+}
+
 describe('toString', () => {
 
   it('output is accepted by RegExp constructor', () => {
@@ -47,36 +59,16 @@ describe('enumerate', () => {
     )
   })
 
-  // it.only('debug', () => {
-  //   const regexp = /^((a(fc)?([cef]|f*)|a*|([ce]b*e*(eb)*)*)((cd)*b*(ac*|d))*c)$/
-  //   const inputRegex = parseRegExp(regexp)
-    
-  //   // get words NOT in the output by enumerating words of the complement:
-  //   const inputRegexComplement = DFA.toStdRegex(RE.complement(inputRegex))
-  //   console.debug(RE.toRegExp(inputRegexComplement))
-  //   const allComplementWords = RE.enumerate(inputRegexComplement)
-
-  //   // long words are likely result of repetition and are less interesting to test
-  //   // and also blow up memory:
-  //   const shortWords = Stream.takeWhile(word => word.length <= 30, allComplementWords)
-
-  //   for (const complementWord of Stream.take(100, shortWords)) {
-  //     expect(complementWord).not.toMatch(regexp)
-  //   }
-  // })
-
   // completeness
   it('strings NOT in the output, do NOT match the input regex', () => {
     fc.assert(
       fc.property(
-        // FIXME: have to exclude `star` because complement operation
-        // then often leads to exponential blow-up:
-        Arb.stdRegexNoStar(),
+        Arb.stdRegex(),
         inputRegex => {
           const regexp = RE.toRegExp(inputRegex)
 
           // get words NOT in the output by enumerating words of the complement:
-          const inputRegexComplement = DFA.toStdRegex(RE.complement(inputRegex))
+          const inputRegexComplement = toStdRegex_ignoreBlowUp(RE.complement(inputRegex))
           const allComplementWords = RE.enumerateAux(inputRegexComplement)
 
           // long words are likely result of repetition and are less interesting to test
@@ -88,7 +80,8 @@ describe('enumerate', () => {
           }
         }
       ),
-      { endOnFailure: true }
+      // { endOnFailure: true }
+      { seed: -1078936918, path: "13", endOnFailure: true }
     )
   })
 
@@ -180,7 +173,8 @@ describe('rewrite rules', () => {
     [/^(a|b)|a$/, /^([ab])$/],
     [/^(a?)?$/, /^(a?)$/],
     [/^(a*)?$/, /^(a*)$/],
-    [/^(a|a*)$/, /^(aa*)$/],
+    // TODO:
+    // [/^(a|a*)$/, /^(aa*)$/],
     // union-of-concat rules:
     [/^ab|ac$/, /^(a[bc])$/],
     [/^ba|ca$/, /^([bc]a)$/],
@@ -199,3 +193,18 @@ describe('rewrite rules', () => {
   })
   
 })
+
+describe('derivative', () => {
+
+  it.each([
+    [/^((aa*)?)$/, 'a', /^(a*)$/],
+    [/^(a{2}(a{3})*)$/, 'a', /^(a(a{3})*)$/],
+    [/^(a{2}(a*)|(aa*))$/, 'a', /^(a?a*)$/],
+    [/^(a(a{3})*|(aa*)?)$/, 'a', /^((a{3})*|a*)$/],
+    [/^(a{2}(a{3})*|(aa*)?)$/, 'a', /^(a(a{3})*|a*)$/],
+  ])('of %s with respect to "%s" is %s', (input, str, expected) => {
+    const actual = RE.derivative(str, parseRegExp(input))
+    expect(RE.toRegExp(actual)).toEqual(expected)
+  })
+  
+})