@@ -3,6 +3,7 @@ import * as CharSet from './char-set'
3
3
import * as Stream from './stream'
4
4
import * as Table from './table'
5
5
import * as AST from './ast'
6
+ import { PRNG } from './prng'
6
7
7
8
/**
8
9
* TODO
@@ -803,6 +804,112 @@ function enumerateMemoizedAux(
803
804
}
804
805
}
805
806
807
/**
 * Generates random strings that match the given regex using a deterministic seed.
 * Unlike enumerate(), this produces a stream of random samples rather than
 * a fair enumeration of all possible matches.
 *
 * For a non-empty regex the generator never finishes on its own: callers take
 * as many samples as they need and then stop iterating. Samples are drawn
 * independently, so the same string may be yielded more than once.
 *
 * @param re - The regex to sample from
 * @param seed - Deterministic seed for random generation (default: 42)
 * @returns Generator yielding random matching strings. Yields nothing if
 *   `re` is empty (matches no string at all).
 *
 * @public
 */
export function* sample(re: StdRegex, seed = 42): Generator<string> {
  if (isEmpty(re)) {
    // Nothing can ever be sampled, so the loop below would never yield
    // and the generator would not terminate.
    return
  }

  const rng = new PRNG(seed)

  // To reduce sampling bias, we weight probabilities by number of nodes in a sub-expression.
  // To not re-compute these counts, we traverse the tree once and populate a cache of node
  // counts at every node:
  const cachedNodeCount = new Map<number, number>()
  nodeCountAux(re, cachedNodeCount)
  const lookupNodeCount = (subExpr: StdRegex): number => {
    const count = cachedNodeCount.get(subExpr.hash)
    assert(count !== undefined, 'logic error: node count cache should be populated for all subexpressions')
    return count
  }

  // Depth limit handed to every sampling attempt.
  const maxDepth = 1000

  while (true) {
    let result: string | null = null
    try {
      result = sampleAux(re, rng, maxDepth, lookupNodeCount)
    } catch {
      // Best effort: if this attempt hit the depth limit or failed for any
      // other reason, drop it and try again with the advanced RNG state.
      continue
    }

    // The yield is deliberately outside the try block: an error injected by
    // the consumer via `generator.throw()` surfaces at the yield and must
    // propagate instead of being swallowed by the catch-all above.
    if (result !== null) {
      yield result
    }
  }
}
849
/**
 * Draws a single random string matching `regex` by recursing over the
 * structure of the expression.
 *
 * `maxDepth` is the number of nesting levels that may still be entered. Every
 * recursive call (into either side of a concat, the chosen branch of a union,
 * the body of a star, or the star itself for the next repetition) receives
 * `maxDepth - 1`. Using the same decrement everywhere matters: mixing a
 * halving budget with a `- 1` step makes any union that sits below roughly
 * ten concat levels fail on every attempt, because the halved budget is
 * already below 1 when the union subtracts from it.
 *
 * @param regex - The (sub-)expression to sample from
 * @param rng - Source of randomness; consumed in a fixed order so results are
 *   reproducible for a given RNG state
 * @param maxDepth - Remaining recursion depth
 * @param lookupNodeCount - Returns the node count of a sub-expression; used to
 *   weight the choice between the branches of a union
 * @returns A matching string, or `null` if a sub-expression could not be
 *   sampled. `null` can only originate from `CharSet.sampleChar` (presumably
 *   for an empty charset; confirm against its implementation).
 * @throws Error with message 'Max depth exceeded' when `maxDepth <= 0`
 */
function sampleAux(
  regex: StdRegex,
  rng: PRNG,
  maxDepth: number,
  lookupNodeCount: (subExpr: StdRegex) => number
): string | null {
  if (maxDepth <= 0) {
    throw new Error('Max depth exceeded')
  }

  const childDepth = maxDepth - 1

  switch (regex.type) {
    case 'epsilon':
      return ''

    case 'literal': {
      return CharSet.sampleChar(regex.charset, (max) => rng.nextInt(max))
    }

    case 'concat': {
      const leftSample = sampleAux(regex.left, rng, childDepth, lookupNodeCount)
      if (leftSample === null) return null
      const rightSample = sampleAux(regex.right, rng, childDepth, lookupNodeCount)
      if (rightSample === null) return null
      return leftSample + rightSample
    }

    case 'union': {
      // For unions we randomly sample from the left or right subtree.
      // The probability is weighted by the number of nodes in the subtree.
      // Consider the expression /^(aa|(bb|cc))$/ which matches the three strings: "aa", "bb", "cc".
      // If we give equal probability to all branches, we sample 50% "aa", 25% "bb" and 25% "cc".
      // Weighting by node count does not eliminate this problem completely.
      // We could also weight by the number of strings matched by the subtrees (computed using `size`).
      // But what do we do if one of the subtrees matches infinitely many strings (e.g. /^(a|b*)$/)?
      const leftCount = lookupNodeCount(regex.left)
      const rightCount = lookupNodeCount(regex.right)
      const chooseLeft = rng.next() < leftCount / (leftCount + rightCount)

      return sampleAux(
        chooseLeft ? regex.left : regex.right,
        rng,
        childDepth,
        lookupNodeCount
      )
    }

    case 'star': {
      // Randomly choose whether to stop repetition or to continue:
      const chooseStop = rng.next() < 0.5
      if (chooseStop) {
        return ''
      }

      // One more repetition: sample the body once, then recurse on the star
      // itself to decide about further repetitions.
      const innerSample = sampleAux(regex.inner, rng, childDepth, lookupNodeCount)
      if (innerSample === null) return null
      const restSample = sampleAux(regex, rng, childDepth, lookupNodeCount)
      if (restSample === null) return null
      return innerSample + restSample
    }
  }

  checkedAllCases(regex)
}
912
+
806
913
/**
807
914
* TODO
808
915
*/
@@ -906,6 +1013,9 @@ function nodeCountAux(
906
1013
/**
 * Renders a regex as pretty-printed JSON (2-space indentation) for debugging.
 * The structure that gets serialized is whatever `debugShowAux` produces.
 */
export function debugShow(regex: ExtRegex): any {
  const debugTree = debugShowAux(regex)
  const indentWidth = 2
  return JSON.stringify(debugTree, null, indentWidth)
}
1016
/**
 * Writes the pretty-printed JSON debug representation of a regex (the
 * structure produced by `debugShowAux`, 2-space indentation) to `console.debug`.
 * The return value is whatever `console.debug` returns.
 */
export function debugPrint(regex: ExtRegex): any {
  const rendered = JSON.stringify(debugShowAux(regex), null, 2)
  return console.debug(rendered)
}
909
1019
910
1020
function debugShowAux ( regex : ExtRegex ) : any {
911
1021
switch ( regex . type ) {
0 commit comments