feat: sample random matches

gruhn · web-flow · commit 5d4b7e2b8bce · 2025-09-24T20:38:19.000+02:00
diff --git a/README.md b/README.md
@@ -17,9 +17,11 @@ These are surprisingly hard to come by for any programming language. ✨
   - [.isSubsetOf(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#isSubsetOf)
   - [.isSupersetOf(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#isSupersetOf)
   - [.isDisjointFrom(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#isDisjointFrom)
+- 📜 Generate strings:
+  - [.sample(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#sample) - Generate random strings matching a regex.
+  - [.enumerate()](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#enumerate) - Exhaustively enumerate strings matching a regex.
 - 🔧 Miscellaneous:
   - [.size()](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#size) - Count the number of strings that a regex matches.
-  - [.enumerate()](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#enumerate) - Generate strings matching a regex.
   - [.derivative(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#derivative) - Compute a Brzozowski derivative of a regex.
 
 ## Installation 📦
diff --git a/src/char-set.ts b/src/char-set.ts
@@ -355,6 +355,40 @@ export function size(set: CharSet): number {
   } 
 }
 
+/**
+ * Samples a random character from the CharSet using the provided random number generator.
+ * Returns null if the set is empty.
+ */
+export function sampleChar(set: CharSet, randomInt: (max: number) => number): string | null {
+  const totalSize = size(set)
+  if (totalSize === 0) return null
+  
+  let targetIndex = randomInt(totalSize)
+  return sampleCharAux(set, targetIndex)
+}
+function sampleCharAux(set: CharSet, targetIndex: number): string | null {
+  if (set.type === 'empty') {
+    return null
+  }
+
+  const leftSize = size(set.left)
+  if (targetIndex < leftSize) {
+    return sampleCharAux(set.left, targetIndex)
+  }
+
+  targetIndex -= leftSize
+
+  const rootSize = Range.size(set.range)
+  if (targetIndex < rootSize) {
+    // Target is in this range
+    const codePoint = set.range.start + targetIndex
+    return String.fromCodePoint(codePoint)
+  }
+
+  targetIndex -= rootSize
+  return sampleCharAux(set.right, targetIndex)
+}
+
 ////////////////////////////////////////////////////////////
 //////////////// Specific Character Classes //////////////// 
 ////////////////////////////////////////////////////////////
diff --git a/src/index.ts b/src/index.ts
@@ -304,6 +304,31 @@ class RegexBuilder {
     yield* RE.enumerate(this.getStdRegex())
   }
 
+  /**
+   * Generates random strings that match the regex using a deterministic seed.
+   * Unlike enumerate(), this produces a stream of random samples rather than
+   * a fair enumeration of all possible matches. This is more useful for generating
+   * representative examples without unusual characters like "\u0000".
+   * 
+   * @example
+   * ```typescript
+   * const emailRegex = /^[a-z]+@[a-z]+\.[a-z]{2,}$/
+   * 
+   * // Generate 10 random email examples with seed 42
+   * for (const sample of RB(emailRegex).sample(42).take(10)) {
+   *   console.log(sample)
+   * }
+   * ```
+   * 
+   * @param seed - Optional seed to make sampling deterministic. Defaults to `Date.now()`, so unseeded calls differ between runs.
+   * @returns Generator yielding random matching strings
+   * 
+   * @public
+   */
+  sample(seed: number = Date.now()) {
+    return RE.sample(this.getStdRegex(), seed)
+  }
+
   /**
    * Converts back to a native JavaScript `RegExp`. 
    * 
diff --git a/src/prng.ts b/src/prng.ts
@@ -0,0 +1,22 @@
+
+/**
+ * Simple deterministic PRNG implemented as a linear congruential generator (LCG),
+ * using the multiplier and increment constants from Numerical Recipes (modulus 2^32).
+ */
+export class PRNG {
+  private state: number
+
+  constructor(seed: number = 1) {
+    this.state = seed
+  }
+
+  next(): number {
+    this.state = (this.state * 1664525 + 1013904223) % 0x100000000
+    return this.state / 0x100000000 // normalize to [0, 1)
+  }
+
+  nextInt(max: number): number {
+    return Math.floor(this.next() * max)
+  }
+}
+
diff --git a/src/regex.ts b/src/regex.ts
@@ -3,6 +3,7 @@ import * as CharSet from './char-set'
 import * as Stream from './stream'
 import * as Table from './table'
 import * as AST from './ast'
+import { PRNG } from './prng'
 
 /**
  * TODO
@@ -803,6 +804,112 @@ function enumerateMemoizedAux(
   }
 }
 
+/**
+ * Generates random strings that match the given regex using a deterministic seed.
+ * Unlike enumerate(), this produces a stream of random samples rather than
+ * a fair enumeration of all possible matches.
+ * 
+ * @param re - The regex to sample from
+ * @param seed - Seed for the PRNG (required); the same seed always produces the same sequence of samples
+ * @returns Generator yielding random matching strings
+ * 
+ * @public
+ */
+export function* sample(re: StdRegex, seed: number): Generator<string> {
+  if (isEmpty(re)) {
+    // otherwise generator does not terminate:
+    return
+  }
+
+  const rng = new PRNG(seed)
+
+  // To reduce sampling bias, we weight probabilities by number of nodes in a sub-expression.
+  // To not re-compute these counts, we traverse the tree once and populate a cache of node
+  // counts at every node:
+  const cachedNodeCount = new Map<number, number>()
+  nodeCountAux(re, cachedNodeCount)
+  const lookupNodeCount = (subExpr: StdRegex): number => {
+    const count = cachedNodeCount.get(subExpr.hash)
+    assert(count !== undefined, 'logic error: node count cache should be populated for all subexpressions')
+    return count
+  }
+  
+  while (true) {
+    try {
+      const result = sampleAux(re, rng, 1000, lookupNodeCount)
+      if (result !== null) {
+        yield result
+      }
+    } catch {
+      // If we hit max depth or other issues, skip this sample
+      continue
+    }
+  }
+}
+function sampleAux(
+  regex: StdRegex,
+  rng: PRNG,
+  maxDepth: number,
+  lookupNodeCount: (subExpr: StdRegex) => number
+): string | null {
+  if (maxDepth <= 0) {
+    throw new Error('Max depth exceeded')
+  }
+
+  switch (regex.type) {
+    case 'epsilon':
+      return ''
+    
+    case 'literal': {
+      return CharSet.sampleChar(regex.charset, (max) => rng.nextInt(max))
+    }
+    
+    case 'concat': {
+      const leftSample = sampleAux(regex.left, rng, maxDepth / 2, lookupNodeCount)
+      if (leftSample === null) return null
+      const rightSample = sampleAux(regex.right, rng, maxDepth / 2, lookupNodeCount)
+      if (rightSample === null) return null
+      return leftSample + rightSample
+    }
+    
+    case 'union': {
+      // For unions we randomly sample from the left- or right subtree.
+      // The probability is weighted by the number of nodes in the subtree.
+      // Consider the expression /^(aa|(bb|cc))$/ which matches the three strings: "aa", "bb", "cc".
+      // If we give equal probability to all branches, we sample 50% "aa", 25% "bb" and 25% "cc".
+      // Weighting by node count does not eliminate this problem completely. 
+      // We could also weight by the number of strings matched by the subtrees (computed using `size`).
+      // But what do we do if one of the subtrees matches infinitely many strings (e.g. /^(a|b*)$/)?
+      const leftCount = lookupNodeCount(regex.left)
+      const rightCount = lookupNodeCount(regex.right)
+      const chooseLeft = rng.next() < leftCount / (leftCount + rightCount)
+
+      if (chooseLeft) {
+        return sampleAux(regex.left, rng, maxDepth - 1, lookupNodeCount)
+      } else {
+        return sampleAux(regex.right, rng, maxDepth - 1, lookupNodeCount)
+      }
+    }
+    
+    case 'star': {
+      // Randomly choose whether to stop repetition or to continue:
+      const chooseStop = rng.next() < 0.5
+      if (chooseStop) {
+        return ""
+      } else {
+        const innerSample = sampleAux(regex.inner, rng, maxDepth / 2, lookupNodeCount)
+        if (innerSample === null) return null
+        const restSample = sampleAux(regex, rng, maxDepth / 2, lookupNodeCount)
+        if (restSample === null) return null
+        return innerSample + restSample
+      }
+
+    }
+  }
+  
+  checkedAllCases(regex)
+}
+
 /**
  * TODO
  */
@@ -906,6 +1013,9 @@ function nodeCountAux(
 export function debugShow(regex: ExtRegex): any {
   return JSON.stringify(debugShowAux(regex), null, 2)
 }
+export function debugPrint(regex: ExtRegex): any {
+  return console.debug(JSON.stringify(debugShowAux(regex), null, 2))
+}
 
 function debugShowAux(regex: ExtRegex): any {
   switch (regex.type) {
diff --git a/test/char-set.spec.ts b/test/char-set.spec.ts
@@ -4,7 +4,6 @@ import * as CharSet from '../src/char-set'
 import fc from 'fast-check'
 import * as Range from '../src/code-point-range'
 
-
 const arbitraryRange: fc.Arbitrary<Range.CodePointRange> =
   fc.tuple(
     fc.integer({ min: 48, max: 122 }), // 0-9a-zA-Z
diff --git a/test/regex.spec.ts b/test/regex.spec.ts
@@ -101,6 +101,50 @@ describe('enumerate', () => {
 
 })
 
+describe('sample', () => {
+
+  it('output strings match the input regex', () => {
+    fc.assert(
+      fc.property(
+        Arb.stdRegex(),
+        fc.integer({ min: 0, max: 1000 }),
+        (inputRegex, seed) => {
+          const regexp = RE.toRegExp(inputRegex)
+          const samples = RE.sample(inputRegex, seed)
+
+          for (const sample of samples.take(50)) {
+            assert.match(sample, regexp)
+          }
+        }
+      ),
+    )
+  })
+
+  it('is deterministic with same seed', () => {
+    fc.assert(
+      fc.property(
+        Arb.stdRegex(),
+        fc.nat(),
+        (regex, seed) => {
+          const gen1 = RE.sample(regex, seed)
+          const gen2 = RE.sample(regex, seed)
+        
+          assert.deepEqual(
+            [...gen1.take(10)],
+            [...gen2.take(10)],
+          )
+        }
+      )
+    )
+  })
+
+  it('terminates for empty regex', () => {
+    const samples = [...RE.sample(RE.empty)]
+    assert.deepEqual(samples, [])
+  })
+
+})
+
 describe('size', () => {
 
   it('returns 1 for ∅ *', () => {
@@ -235,4 +279,3 @@ describe('derivative', () => {
   }
   
 })
-