Skip to content

Commit 5d4b7e2

Browse files
authored
feat: sample random matches
1 parent 5c1092c commit 5d4b7e2

File tree

7 files changed

+238
-3
lines changed

7 files changed

+238
-3
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@ These are surprisingly hard to come by for any programming language. ✨
1717
- [.isSubsetOf(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#isSubsetOf)
1818
- [.isSupersetOf(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#isSupersetOf)
1919
- [.isDisjointFrom(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#isDisjointFrom)
20+
- 📜 Generate strings:
21+
- [.sample(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#sample) - Generate random strings matching a regex.
22+
- [.enumerate()](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#enumerate) - Exhaustively enumerate strings matching a regex.
2023
- 🔧 Miscellaneous:
2124
- [.size()](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#size) - Count the number of strings that a regex matches.
22-
- [.enumerate()](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#enumerate) - Generate strings matching a regex.
2325
- [.derivative(...)](https://gruhn.github.io/regex-utils/interfaces/RegexBuilder.html#derivative) - Compute a Brzozowski derivative of a regex.
2426

2527
## Installation 📦

src/char-set.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,40 @@ export function size(set: CharSet): number {
355355
}
356356
}
357357

358+
/**
 * Picks one character from `set` at random.
 *
 * The caller-supplied `randomInt` is invoked once with the total number of
 * characters in the set; the value it returns is used as an index into the
 * set's characters.
 *
 * @param set - Character set to draw from.
 * @param randomInt - Random source, called as `randomInt(size(set))`.
 * @returns The selected character, or `null` when the set is empty.
 */
export function sampleChar(set: CharSet, randomInt: (max: number) => number): string | null {
  const charCount = size(set)
  if (charCount === 0) {
    // Nothing to pick from; the random source is not consulted.
    return null
  }

  const pickedIndex = randomInt(charCount)
  return sampleCharAux(set, pickedIndex)
}
369+
/**
 * Looks up the character at position `targetIndex` in `set`, counting the
 * characters of the left subtree first, then the node's own range, then the
 * right subtree.
 *
 * @returns The character at that position, or `null` if the walk reaches an
 *          empty node before the index is resolved.
 */
function sampleCharAux(set: CharSet, targetIndex: number): string | null {
  let node = set
  let remaining = targetIndex

  while (node.type !== 'empty') {
    const leftCount = size(node.left)
    if (remaining < leftCount) {
      // The wanted character lives somewhere in the left subtree.
      node = node.left
      continue
    }
    remaining -= leftCount

    const rangeCount = Range.size(node.range)
    if (remaining < rangeCount) {
      // The wanted character is inside this node's own range.
      return String.fromCodePoint(node.range.start + remaining)
    }
    remaining -= rangeCount

    // Otherwise keep searching in the right subtree.
    node = node.right
  }

  return null
}
391+
358392
////////////////////////////////////////////////////////////
359393
//////////////// Specific Character Classes ////////////////
360394
////////////////////////////////////////////////////////////

src/index.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,31 @@ class RegexBuilder {
304304
yield* RE.enumerate(this.getStdRegex())
305305
}
306306

307+
/**
 * Produces a stream of random strings that match this regex.
 * In contrast to enumerate(), the result is a sequence of random samples,
 * not a fair enumeration of every possible match.
 *
 * @example
 * ```typescript
 * const emailRegex = /^[a-z]+@[a-z]+\.[a-z]{2,}$/
 *
 * // Generate 10 random email examples with seed 42
 * for (const sample of RB(emailRegex).sample(42).take(10)) {
 *   console.log(sample)
 * }
 * ```
 *
 * @param seed - Seed for the random number generator. Passing the same seed
 *   makes the sampling deterministic; when omitted, `Date.now()` is used.
 * @returns Generator yielding random matching strings
 *
 * @public
 */
sample(seed: number = Date.now()) {
  const stdRegex = this.getStdRegex()
  return RE.sample(stdRegex, seed)
}
331+
307332
/**
308333
* Converts back to a native JavaScript `RegExp`.
309334
*

src/prng.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
/**
 * Simple deterministic PRNG using a Linear Congruential Generator (LCG)
 * with the Numerical Recipes parameters:
 * multiplier 1664525, increment 1013904223, modulus 2^32.
 *
 * The internal state is always an integer in [0, 2^32). That invariant keeps
 * `state * 1664525 + 1013904223` below 2^53, so every step is computed exactly
 * in double precision.
 */
export class PRNG {
  private state: number

  /**
   * @param seed - Any number. It is reduced to an unsigned 32-bit integer, so
   *   negative, fractional or very large seeds (such as `Date.now()`) are
   *   accepted. Integer seeds in [0, 2^32) are used as-is.
   */
  constructor(seed: number = 1) {
    // Without this coercion a negative seed can leave the state negative
    // forever (JavaScript `%` keeps the sign of the dividend), which makes
    // `next()` return values below zero, and a seed above 2^32 overflows the
    // exact integer range in the first multiplication.
    this.state = seed >>> 0
  }

  /**
   * Advances the generator by one step.
   *
   * @returns A number in [0, 1).
   */
  next(): number {
    this.state = (this.state * 1664525 + 1013904223) % 0x100000000
    return this.state / 0x100000000 // normalize to [0, 1)
  }

  /**
   * Advances the generator by one step.
   *
   * @param max - Exclusive upper bound.
   * @returns `Math.floor(next() * max)`, i.e. an integer in [0, max) for a
   *   positive integer `max`.
   */
  nextInt(max: number): number {
    return Math.floor(this.next() * max)
  }
}
22+

src/regex.ts

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import * as CharSet from './char-set'
33
import * as Stream from './stream'
44
import * as Table from './table'
55
import * as AST from './ast'
6+
import { PRNG } from './prng'
67

78
/**
89
* TODO
@@ -803,6 +804,112 @@ function enumerateMemoizedAux(
803804
}
804805
}
805806

807+
/**
 * Generates random strings that match the given regex using a deterministic seed.
 * Unlike enumerate(), this produces a stream of random samples rather than
 * a fair enumeration of all possible matches.
 *
 * For a non-empty regex the generator is infinite: it keeps producing samples
 * for as long as the consumer asks for them. For an empty regex it finishes
 * immediately without yielding anything.
 *
 * @param re - The regex to sample from
 * @param seed - Deterministic seed for random generation (default: 42)
 * @returns Generator yielding random matching strings
 * @throws Re-throws any error raised while sampling, except the
 *   "Max depth exceeded" error, which only discards the current attempt.
 *
 * @public
 */
export function* sample(re: StdRegex, seed: number = 42): Generator<string> {
  if (isEmpty(re)) {
    // otherwise generator does not terminate:
    return
  }

  const rng = new PRNG(seed)

  // To reduce sampling bias, we weight probabilities by number of nodes in a sub-expression.
  // To not re-compute these counts, we traverse the tree once and populate a cache of node
  // counts at every node:
  const cachedNodeCount = new Map<number, number>()
  nodeCountAux(re, cachedNodeCount)
  const lookupNodeCount = (subExpr: StdRegex): number => {
    const count = cachedNodeCount.get(subExpr.hash)
    assert(count !== undefined, 'logic error: node count cache should be populated for all subexpressions')
    return count
  }

  while (true) {
    let result: string | null
    try {
      result = sampleAux(re, rng, 1000, lookupNodeCount)
    } catch (error: unknown) {
      // Running out of depth budget only invalidates this attempt, so retry.
      // Anything else (e.g. a failed internal assertion) is a real bug and is
      // propagated instead of being retried forever.
      if (error instanceof Error && error.message === 'Max depth exceeded') {
        continue
      }
      throw error
    }

    // The yield is deliberately outside the try block, so that an error
    // injected by the consumer via `generator.throw(...)` is not swallowed.
    if (result !== null) {
      yield result
    }
  }
}
849+
/**
 * Draws a single random string from `regex`.
 *
 * @param regex - Expression to sample from.
 * @param rng - Random source; consumed as the expression is traversed.
 * @param maxDepth - Remaining recursion budget. Every recursive call consumes
 *   exactly one unit. The budget used to be halved for concat/star and
 *   decremented for unions, so it dropped below 1 after about ten nested
 *   concat/star levels and any union underneath then failed on every attempt.
 * @param lookupNodeCount - Returns the node count of a sub-expression; used to
 *   weight the choice between union branches.
 * @returns A sampled string, or `null` if a sub-expression produced no sample.
 * @throws Error with message 'Max depth exceeded' once the budget is used up.
 */
function sampleAux(
  regex: StdRegex,
  rng: PRNG,
  maxDepth: number,
  lookupNodeCount: (subExpr: StdRegex) => number
): string | null {
  if (maxDepth <= 0) {
    throw new Error('Max depth exceeded')
  }

  // Budget handed to every direct recursive call below.
  const childDepth = maxDepth - 1

  switch (regex.type) {
    case 'epsilon':
      return ''

    case 'literal': {
      return CharSet.sampleChar(regex.charset, (max) => rng.nextInt(max))
    }

    case 'concat': {
      const leftSample = sampleAux(regex.left, rng, childDepth, lookupNodeCount)
      if (leftSample === null) return null
      const rightSample = sampleAux(regex.right, rng, childDepth, lookupNodeCount)
      if (rightSample === null) return null
      return leftSample + rightSample
    }

    case 'union': {
      // For unions we randomly sample from the left- or right subtree.
      // The probability is weighted by the number of nodes in the subtree.
      // Consider the expression /^(aa|(bb|cc))$/ which matches the three strings: "aa", "bb", "cc".
      // If we give equal probability to all branches, we sample 50% "aa", 25% "bb" and 25% "cc".
      // Weighting by node count does not eliminate this problem completely.
      // We could also weight by the number of strings matched by the subtrees (computed using `size`).
      // But what do we do if one of the subtrees matches infinitely many strings (e.g. /^(a|b*)$/)?
      const leftCount = lookupNodeCount(regex.left)
      const rightCount = lookupNodeCount(regex.right)
      const chooseLeft = rng.next() < leftCount / (leftCount + rightCount)

      if (chooseLeft) {
        return sampleAux(regex.left, rng, childDepth, lookupNodeCount)
      } else {
        return sampleAux(regex.right, rng, childDepth, lookupNodeCount)
      }
    }

    case 'star': {
      // Randomly choose whether to stop repetition or to continue:
      const chooseStop = rng.next() < 0.5
      if (chooseStop) {
        return ""
      } else {
        const innerSample = sampleAux(regex.inner, rng, childDepth, lookupNodeCount)
        if (innerSample === null) return null
        // Sample the remaining repetitions from the same star expression.
        const restSample = sampleAux(regex, rng, childDepth, lookupNodeCount)
        if (restSample === null) return null
        return innerSample + restSample
      }
    }
  }

  checkedAllCases(regex)
}
912+
806913
/**
807914
* TODO
808915
*/
@@ -906,6 +1013,9 @@ function nodeCountAux(
9061013
/**
 * Renders the structure produced by `debugShowAux` for `regex` as a JSON
 * string indented with two spaces.
 */
export function debugShow(regex: ExtRegex): any {
  const structure = debugShowAux(regex)
  const indentWidth = 2
  return JSON.stringify(structure, null, indentWidth)
}
1016+
/**
 * Writes the two-space-indented JSON rendering of `debugShowAux(regex)` to
 * `console.debug` and returns whatever `console.debug` returns.
 */
export function debugPrint(regex: ExtRegex): any {
  const structure = debugShowAux(regex)
  const rendered = JSON.stringify(structure, null, 2)
  return console.debug(rendered)
}
9091019

9101020
function debugShowAux(regex: ExtRegex): any {
9111021
switch (regex.type) {

test/char-set.spec.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import * as CharSet from '../src/char-set'
44
import fc from 'fast-check'
55
import * as Range from '../src/code-point-range'
66

7-
87
const arbitraryRange: fc.Arbitrary<Range.CodePointRange> =
98
fc.tuple(
109
fc.integer({ min: 48, max: 122 }), // 0-9a-zA-Z

test/regex.spec.ts

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,50 @@ describe('enumerate', () => {
101101

102102
})
103103

104+
// Tests for RE.sample: sampled strings must match the regex, sampling must be
// reproducible for a fixed seed, and sampling the empty regex must terminate.
describe('sample', () => {

  it('output strings match the input regex', () => {
    fc.assert(
      fc.property(
        Arb.stdRegex(),
        fc.integer({ min: 0, max: 1000 }),
        (inputRegex, seed) => {
          const regexp = RE.toRegExp(inputRegex)
          const samples = RE.sample(inputRegex, seed)

          // The generator is infinite for non-empty regexes, so only look at a prefix.
          for (const sample of samples.take(50)) {
            assert.match(sample, regexp)
          }
        }
      ),
    )
  })

  it('is deterministic with same seed', () => {
    fc.assert(
      fc.property(
        Arb.stdRegex(),
        fc.nat(),
        (regex, seed) => {
          const gen1 = RE.sample(regex, seed)
          const gen2 = RE.sample(regex, seed)

          assert.deepEqual(
            [...gen1.take(10)],
            [...gen2.take(10)],
          )
        }
      )
    )
  })

  it('terminates for empty regex', () => {
    // `seed` is a required parameter of RE.sample, so pass one explicitly.
    // Spreading without `take` is safe here only because the generator must
    // finish on its own for the empty regex.
    const samples = [...RE.sample(RE.empty, 0)]
    assert.deepEqual(samples, [])
  })

})
147+
104148
describe('size', () => {
105149

106150
it('returns 1 for ∅ *', () => {
@@ -235,4 +279,3 @@ describe('derivative', () => {
235279
}
236280

237281
})
238-

0 commit comments

Comments
 (0)