@@ -471,6 +471,8 @@ export function isEmpty(regex: ExtRegex): boolean {
471
471
return regex . type === 'literal' && CharSet . isEmpty ( regex . charset )
472
472
}
473
473
474
/**
 * Thrown by the derivative computations in this file when one of their
 * memoization caches reaches its size limit. Raising a catchable error is
 * preferred over letting the process run out of memory.
 *
 * @param message - Optional description of where the overflow happened.
 *   A generic message is used when omitted, so the error is never blank.
 */
export class CacheOverflowError extends Error {
  constructor(message: string = 'Cache overflow: a memoization cache reached its size limit.') {
    super(message)
    // Without this, `name` would be the generic "Error" in logs and stack traces.
    this.name = 'CacheOverflowError'
    // Keeps `instanceof CacheOverflowError` working when compiled to pre-ES2015 targets.
    Object.setPrototypeOf(this, new.target.prototype)
  }
}
475
+
474
476
export function codePointDerivative ( codePoint : number , regex : StdRegex , cache : Table . Table < StdRegex > ) : StdRegex
475
477
export function codePointDerivative ( codePoint : number , regex : ExtRegex , cache : Table . Table < ExtRegex > ) : ExtRegex
476
478
export function codePointDerivative ( codePoint : number , regex : ExtRegex , cache : Table . Table < ExtRegex > ) : ExtRegex {
@@ -521,6 +523,13 @@ function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table
521
523
function codePointDerivativeAux ( codePoint : number , regex : ExtRegex , cache : Table . Table < ExtRegex > ) : ExtRegex {
522
524
const cachedResult = Table . get ( codePoint , regex . hash , cache )
523
525
if ( cachedResult === undefined ) {
526
+ // Rather throw an error when cache grows too large than getting OOM killed.
527
+ // At least errors can be caught and handled. The limit is somewhat arbitrary.
528
+ // TODO: maybe make this user configurable:
529
+ if ( Table . size ( cache ) >= 10_000 ) {
530
+ throw new CacheOverflowError ( 'Cache overflow while computing DFA transitions.' )
531
+ }
532
+
524
533
const result = codePointDerivative ( codePoint , regex , cache )
525
534
Table . set ( codePoint , regex . hash , result , cache )
526
535
return result
@@ -608,6 +617,13 @@ function allNonEmptyIntersections(
608
617
return resultCached
609
618
}
610
619
620
+ // Rather throw an error when cache grows too large than getting OOM killed.
621
+ // At least errors can be caught and handled. The limit is somewhat arbitrary.
622
+ // TODO: maybe make this user configurable:
623
+ if ( Table . size ( cache ) >= 10_000 ) {
624
+ throw new CacheOverflowError ( )
625
+ }
626
+
611
627
const result : CharSet . CharSet [ ] = [ ]
612
628
for ( const classA of classesA ) {
613
629
for ( const classB of classesB ) {
@@ -668,12 +684,20 @@ export function derivativeClasses(
668
684
}
669
685
checkedAllCases ( regex )
670
686
}
687
+
671
688
function derivativeClassesAux (
672
689
regex : ExtRegex ,
673
690
cache : DerivativeClassesCache
674
691
) {
675
692
const cachedResult = cache . classes . get ( regex . hash )
676
693
if ( cachedResult === undefined ) {
694
+ // Rather throw an error when cache grows too large than getting OOM killed.
695
+ // At least errors can be caught and handled. The limit is somewhat arbitrary.
696
+ // TODO: maybe make this user configurable:
697
+ if ( cache . classes . size >= 10_000 ) {
698
+ throw new CacheOverflowError ( )
699
+ }
700
+
677
701
const result = derivativeClasses ( regex , cache )
678
702
cache . classes . set ( regex . hash , result )
679
703
return result
@@ -687,6 +711,8 @@ function derivativeClassesAux(
687
711
///// exclusive standard regex utils /////
688
712
//////////////////////////////////////////////
689
713
714
/**
 * Thrown by `toString` when the syntax tree of a regex has too many nodes
 * to be rendered.
 *
 * @param message - Optional description of the failure.
 */
export class VeryLargeSyntaxTreeError extends Error {
  constructor(message?: string) {
    super(message)
    // Without this, `name` would be the generic "Error" in logs and stack traces.
    this.name = 'VeryLargeSyntaxTreeError'
    // Keeps `instanceof VeryLargeSyntaxTreeError` working when compiled to pre-ES2015 targets.
    Object.setPrototypeOf(this, new.target.prototype)
  }
}
715
+
690
716
/**
691
717
* TODO: docs
692
718
*
@@ -697,7 +723,20 @@ export function toRegExp(regex: StdRegex): RegExp {
697
723
}
698
724
699
725
/**
 * Renders `regex` as a string of the form `^(...)$`.
 *
 * @param regex - The regex to render.
 * @returns The rendered pattern, anchored and wrapped in one outer group.
 * @throws {VeryLargeSyntaxTreeError} If `nodeCount(regex)` exceeds 1_000_000.
 */
export function toString(regex: ExtRegex): string {
  const maxNodeCount = 1_000_000
  const nonCapturingThreshold = 10_000

  const size = nodeCount(regex)
  if (size > maxNodeCount) {
    throw new VeryLargeSyntaxTreeError(
      "Won't try to convert to RegExp. Syntax tree has over 1_000_000 nodes."
    )
  }

  // For big trees, emit non-capturing groups, i.e. `/(?:abc)/` instead of
  // `/(abc)/`: `new RegExp(...)` throws when there are too many capturing
  // groups. The extra `?:` verbosity is irrelevant at that size.
  const rendered = astToString(toRegExpAST(regex), {
    useNonCapturingGroups: size > nonCapturingThreshold,
  })

  return `^(${rendered})$`
}
702
741
703
742
// TODO: information is duplicated in parser:
@@ -786,37 +825,43 @@ function toRegExpAST(regex: ExtRegex): RegExpAST {
786
825
checkedAllCases ( regex )
787
826
}
788
827
789
/** Options controlling how a `RegExpAST` is rendered to a string. */
type RenderOptions = {
  /** When true, parentheses are emitted as `(?:...)` rather than `(...)`. */
  useNonCapturingGroups: boolean
}

/**
 * Renders a `RegExpAST` node to its textual form.
 *
 * Children are rendered via `maybeWithParens`, which decides whether each
 * one needs to be wrapped in a group relative to this node.
 *
 * @param ast - The node to render.
 * @param options - Rendering options, passed down unchanged to all children.
 * @returns The textual form of `ast`.
 */
function astToString(ast: RegExpAST, options: RenderOptions): string {
  // Every child is rendered relative to the same parent and options.
  const child = (node: RegExpAST): string => maybeWithParens(node, ast, options)

  switch (ast.type) {
    case 'epsilon':
      return ''
    case 'literal':
      return CharSet.toString(ast.charset)
    case 'concat':
      return child(ast.left) + child(ast.right)
    case 'union':
      return child(ast.left) + '|' + child(ast.right)
    case 'star':
      return child(ast.inner) + '*'
    case 'plus':
      return child(ast.inner) + '+'
    case 'optional':
      return child(ast.inner) + '?'
    case 'boundedQuantifier':
      return child(ast.inner) + '{' + ast.count + '}'
    case 'complement':
      return '¬' + child(ast.inner)
    case 'intersection':
      return child(ast.left) + '∩' + child(ast.right)
  }
  checkedAllCases(ast)
}
814
857
815
/**
 * Renders `ast` as a child of `parent`, wrapping it in a group when needed.
 *
 * The child is rendered bare when it has the same node type as its parent,
 * or when `precLevel` of its type is greater than that of the parent's type.
 * Otherwise it is wrapped in `(?:...)` or `(...)`, depending on
 * `options.useNonCapturingGroups`.
 *
 * @param ast - The child node to render.
 * @param parent - The node that directly contains `ast`.
 * @param options - Rendering options.
 * @returns The rendered child, with or without surrounding parentheses.
 */
function maybeWithParens(ast: RegExpAST, parent: RegExpAST, options: RenderOptions): string {
  const needsGroup =
    ast.type !== parent.type && !(precLevel(ast.type) > precLevel(parent.type))

  const rendered = astToString(ast, options)
  if (!needsGroup) {
    return rendered
  }

  const opening = options.useNonCapturingGroups ? '(?:' : '('
  return opening + rendered + ')'
}
821
866
822
867
/**
@@ -938,6 +983,43 @@ function sizeMemoizedAux(
938
983
}
939
984
}
940
985
986
/**
 * Counts the nodes in the syntax tree of `regex`.
 *
 * Every node contributes 1, plus the counts of its children. Child counts
 * are obtained through `nodeCountAux`, which memoizes them in `cache`
 * keyed by `regex.hash`.
 *
 * @param regex - Root of the syntax tree to measure.
 * @param cache - Memoization table from node hash to node count; a fresh
 *   one is created when omitted.
 * @returns The number of nodes in the tree rooted at `regex`.
 */
export function nodeCount(
  regex: ExtRegex,
  cache: Map<number, number> = new Map()
): number {
  switch (regex.type) {
    // Leaves.
    case 'epsilon':
    case 'literal':
      return 1

    // Binary nodes.
    case 'concat':
    case 'union':
    case 'intersection': {
      const leftCount = nodeCountAux(regex.left, cache)
      const rightCount = nodeCountAux(regex.right, cache)
      return leftCount + rightCount + 1
    }

    // Unary nodes.
    case 'star':
    case 'complement':
      return nodeCountAux(regex.inner, cache) + 1
  }
  checkedAllCases(regex)
}
1008
+
1009
/**
 * Memoizing wrapper around `nodeCount`.
 *
 * Looks up `regex.hash` in `cache`; on a miss, the count is computed with
 * `nodeCount` and stored under that hash before being returned.
 *
 * @param regex - Node whose subtree size is requested.
 * @param cache - Memoization table from node hash to node count.
 * @returns The node count for `regex`.
 */
function nodeCountAux(
  regex: ExtRegex,
  cache: Map<number, number>
): number {
  const known = cache.get(regex.hash)
  if (known !== undefined) {
    return known
  }

  const count = nodeCount(regex, cache)
  cache.set(regex.hash, count)
  return count
}
1022
+
941
1023
// export function equivalent(regex1: ExtRegex, regex2: ExtRegex): boolean {
942
1024
// if (equal(regex1, regex2)) {
943
1025
// return true
0 commit comments