@@ -3,6 +3,7 @@ import * as CharSet from './char-set'
33import * as Stream from './stream'
44import * as Table from './table'
55import * as AST from './ast'
6+ import { PRNG } from './prng'
67
78/**
89 * TODO
@@ -803,6 +804,112 @@ function enumerateMemoizedAux(
803804 }
804805}
805806
/**
 * Generates random strings that match the given regex using a deterministic seed.
 * Unlike enumerate(), this produces a stream of random samples rather than
 * a fair enumeration of all possible matches.
 *
 * Unless `re` is empty, the returned generator never completes on its own,
 * so the caller has to bound how many samples it pulls.
 *
 * @param re - The regex to sample from
 * @param seed - Deterministic seed for random generation (default: 42)
 * @returns Generator yielding random matching strings
 *
 * @public
 */
export function* sample(re: StdRegex, seed = 42): Generator<string> {
  if (isEmpty(re)) {
    // Otherwise the loop below would spin forever without ever yielding:
    return
  }

  const rng = new PRNG(seed)

  // To reduce sampling bias, we weight probabilities by number of nodes in a sub-expression.
  // To not re-compute these counts, we traverse the tree once and populate a cache of node
  // counts (keyed by the sub-expression hash) at every node:
  const cachedNodeCount = new Map<number, number>()
  nodeCountAux(re, cachedNodeCount)
  const lookupNodeCount = (subExpr: StdRegex): number => {
    const count = cachedNodeCount.get(subExpr.hash)
    assert(count !== undefined, 'logic error: node count cache should be populated for all subexpressions')
    return count
  }

  while (true) {
    let result: string | null = null
    try {
      result = sampleAux(re, rng, 1000, lookupNodeCount)
    } catch {
      // Best effort: if we hit max depth or other issues, skip this sample
      // and draw the next one.
    }

    // Yield outside of the try-block so that errors injected by the consumer
    // (via `generator.throw`) are not silently swallowed by the catch above.
    if (result !== null) {
      yield result
    }
  }
}
/**
 * Draws a single random string from the language of `regex`.
 *
 * @param regex - The (sub-)expression to sample from
 * @param rng - Source of randomness; `next()` is compared against probabilities,
 *   `nextInt(max)` is handed to `CharSet.sampleChar` to pick a character
 * @param maxDepth - Remaining recursion depth. Every recursive step consumes
 *   exactly one level, so this is a bound on the nesting depth of the
 *   traversal (star repetitions recurse and therefore count as well)
 * @param lookupNodeCount - Returns the node count of a sub-expression,
 *   used to weight the choice between the branches of a union
 * @returns The sampled string, or `null` if a sub-expression yielded `null`
 * @throws Error with message 'Max depth exceeded' once `maxDepth` is used up
 */
function sampleAux(
  regex: StdRegex,
  rng: PRNG,
  maxDepth: number,
  lookupNodeCount: (subExpr: StdRegex) => number
): string | null {
  if (maxDepth <= 0) {
    throw new Error('Max depth exceeded')
  }

  // NOTE: the budget is decremented by one on every recursive call. Halving it
  // (`maxDepth / 2`) yields fractional budgets that only reach zero by floating
  // point underflow, while dropping below 1 after ~10 nested concats, at which
  // point any union further down would always throw.
  const remainingDepth = maxDepth - 1

  switch (regex.type) {
    case 'epsilon':
      return ''

    case 'literal': {
      return CharSet.sampleChar(regex.charset, (max) => rng.nextInt(max))
    }

    case 'concat': {
      const leftSample = sampleAux(regex.left, rng, remainingDepth, lookupNodeCount)
      if (leftSample === null) return null
      const rightSample = sampleAux(regex.right, rng, remainingDepth, lookupNodeCount)
      if (rightSample === null) return null
      return leftSample + rightSample
    }

    case 'union': {
      // For unions we randomly sample from the left or right subtree.
      // The probability is weighted by the number of nodes in the subtree.
      // Consider the expression /^(aa|(bb|cc))$/ which matches the three strings: "aa", "bb", "cc".
      // If we give equal probability to all branches, we sample 50% "aa", 25% "bb" and 25% "cc".
      // Weighting by node count does not eliminate this problem completely.
      // We could also weight by the number of strings matched by the subtrees (computed using `size`).
      // But what do we do if one of the subtrees matches infinitely many strings (e.g. /^(a|b*)$/)?
      const leftCount = lookupNodeCount(regex.left)
      const rightCount = lookupNodeCount(regex.right)
      const chooseLeft = rng.next() < leftCount / (leftCount + rightCount)

      if (chooseLeft) {
        return sampleAux(regex.left, rng, remainingDepth, lookupNodeCount)
      } else {
        return sampleAux(regex.right, rng, remainingDepth, lookupNodeCount)
      }
    }

    case 'star': {
      // Randomly choose whether to stop repetition or to continue:
      const chooseStop = rng.next() < 0.5
      if (chooseStop) {
        return ''
      } else {
        const innerSample = sampleAux(regex.inner, rng, remainingDepth, lookupNodeCount)
        if (innerSample === null) return null
        // Recurse on the star itself to sample the remaining repetitions:
        const restSample = sampleAux(regex, rng, remainingDepth, lookupNodeCount)
        if (restSample === null) return null
        return innerSample + restSample
      }
    }
  }

  // Exhaustiveness check: only reachable if a new regex type is not handled above.
  checkedAllCases(regex)
}
912+
806913/**
807914 * TODO
808915 */
@@ -906,6 +1013,9 @@ function nodeCountAux(
/**
 * Renders a regex for debugging: the structure produced by `debugShowAux`
 * is serialized as JSON with two-space indentation.
 */
export function debugShow(regex: ExtRegex): any {
  const tree = debugShowAux(regex)
  return JSON.stringify(tree, null, 2)
}
/**
 * Writes the debug rendering of a regex (the output of `debugShowAux`,
 * serialized as JSON with two-space indentation) to `console.debug`.
 * Returns whatever `console.debug` returns.
 */
export function debugPrint(regex: ExtRegex): any {
  const rendered = JSON.stringify(debugShowAux(regex), null, 2)
  return console.debug(rendered)
}
9091019
9101020function debugShowAux ( regex : ExtRegex ) : any {
9111021 switch ( regex . type ) {
0 commit comments