fix: throw error on known anchor elimination issues

gruhn · gruhn · commit 5ad0261a3e3f · 2025-10-15T22:29:53.000+02:00
Throw `UnsupportedSyntaxError` for expressions containing start/end anchor
patterns that anchor elimination is known not to support. Failing loudly is
better than processing them incorrectly.
diff --git a/benchmark/toStdRegex_output_length-result.txt b/benchmark/toStdRegex_output_length-result.txt
@@ -1,6 +1,6 @@
 
 failed instances:
-- parseError         : 45
+- parseError         : 49
 - cacheOverflow      : 89
 - veryLargeSyntaTree : 24
 - stackOverflow      : 12
diff --git a/src/ast.ts b/src/ast.ts
diff --git a/src/parser.ts b/src/parser.ts
@@ -4,6 +4,8 @@ export type ParseResult<T> = { value: T, restInput: string }
 
 export class ParseError extends Error {
 
+  name = "ParseError"
+
   constructor(
     message: string,
     public readonly restInput: string
diff --git a/src/regex-parser.ts b/src/regex-parser.ts
@@ -29,7 +29,9 @@ const unescapedCharInsideBrackets = P.satisfy(char => !Range.mustBeEscapedInside
 const unescapedCharOutsideBrackets = P.satisfy(char => !Range.mustBeEscapedOutsideBrackets(char))
   .map(CharSet.singleton)
 
-export class UnsupportedSyntaxError extends Error {}
+export class UnsupportedSyntaxError extends Error {
+  name = "UnsupportedSyntaxError"
+}
 
 const escapeSequence = P.string('\\').andThen(_ => P.anyChar).andThen(escapedChar => {
   switch (escapedChar) {
@@ -45,16 +47,16 @@ const escapeSequence = P.string('\\').andThen(_ => P.anyChar).andThen(escapedCha
     case 'v': return P.pure(CharSet.singleton('\v')) // vertical tab
     case 'f': return P.pure(CharSet.singleton('\f')) // form feed
     case '0': return P.pure(CharSet.singleton('\0')) // NUL character
-    case 'b': throw new UnsupportedSyntaxError('\b word-boundary assertion not supported')
-    case 'c': throw new UnsupportedSyntaxError('\cX control characters not supported')
+    case 'b': throw new UnsupportedSyntaxError('\b word-boundary assertion')
+    case 'c': throw new UnsupportedSyntaxError('\cX control characters')
     case 'x': return P.count(2, P.hexChar).map(chars => 
                 CharSet.fromRange(Range.singleton(parseInt(chars.join(''), 16)))
               )
     case 'u': return P.count(4, P.hexChar).map(chars =>
                 CharSet.fromRange(Range.singleton(parseInt(chars.join(''), 16)))
               )
-    case 'p': throw new UnsupportedSyntaxError('\\p not supported')
-    case 'P': throw new UnsupportedSyntaxError('\\P not supported')
+    case 'p': throw new UnsupportedSyntaxError('\\p')
+    case 'P': throw new UnsupportedSyntaxError('\\P')
     default: return P.pure(CharSet.singleton(escapedChar)) // match character literally
   }
 })
@@ -175,7 +177,7 @@ const lookbehind: P.Parser<AST.RegExpAST> =
     P.string(')'),
     regex(),
   ).map(_ => {
-    throw new UnsupportedSyntaxError('lookbehind assertions are not supported')
+    throw new UnsupportedSyntaxError('lookbehind assertions')
   })
 
 function regexTerm() {
diff --git a/src/regex.ts b/src/regex.ts
@@ -428,7 +428,9 @@ export function isEmpty(regex: ExtRegex): boolean {
   return regex.type === 'literal' && CharSet.isEmpty(regex.charset)
 }
 
-export class CacheOverflowError extends Error {}
+export class CacheOverflowError extends Error {
+  name = "CacheOverflowError"
+}
 
 export function codePointDerivative(codePoint: number, regex: StdRegex, cache: Table.Table<StdRegex>): StdRegex
 export function codePointDerivative(codePoint: number, regex: ExtRegex, cache: Table.Table<ExtRegex>): ExtRegex
@@ -484,7 +486,7 @@ function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table
     // At least errors can be caught and handled. The limit is somewhat arbitrary.
     // TODO: maybe make this user configurable:
     if (Table.size(cache) >= 10_000) {
-      throw new CacheOverflowError('Cache overflow while computing DFA transitions.')
+      throw new CacheOverflowError('while computing DFA transitions.')
     }
 
     const result = codePointDerivative(codePoint, regex, cache)
@@ -668,7 +670,9 @@ function derivativeClassesAux(
 ///// exclusive standard regex utils     /////
 //////////////////////////////////////////////
 
-export class VeryLargeSyntaxTreeError extends Error {}
+export class VeryLargeSyntaxTreeError extends Error {
+  name = "VeryLargeSyntaxTreeError"
+}
 
 /**
  * TODO: docs
diff --git a/test/arbitrary-ast.ts b/test/arbitrary-ast.ts
@@ -78,7 +78,7 @@ function endAnchor(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AS
 
 /**
  * Traverses AST and renames capturing groups if the name already occurs in the expression.
- * `new RegExp(...)` throws an error when capture group names occur multiple times in the 
+ * `new RegExp(...)` throws an error when capture group names occur multiple times in the
  * same expression.
  */
 export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
@@ -94,7 +94,7 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
       return renameIfSeen(newName)
     }
   }
-  
+
   function traverse(node: AST.RegExpAST): AST.RegExpAST {
     switch (node.type) {
       case 'epsilon':
@@ -115,7 +115,7 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
         return AST.repeat(traverse(node.inner), node.bounds)
        case 'capture-group': {
         const innerProcessed = traverse(node.inner)
-        
+
         if (node.name === undefined) {
           return AST.captureGroup(innerProcessed, node.name)
         } else {
@@ -133,7 +133,7 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
         checkedAllCases(node)
     }
   }
-  
+
   return traverse(ast)
 }
 
diff --git a/test/ast.spec.ts b/test/ast.spec.ts
@@ -44,7 +44,7 @@ describe('toExtRegex', () => {
       [/^a^b/, RE.empty],
       // but two ^^ directly in a row are not a contradiction:
       [/(^^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
-      // in fact, as long as anything between two ^ can match epsilon, 
+      // in fact, as long as anything between two ^ can match epsilon,
       // there is no contradiction:
       [/(^(c|)^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
       [/(^c*^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
@@ -60,6 +60,9 @@ describe('toExtRegex', () => {
       // Nullable expressions on the left and right can be ignored:
       [/(a?)$^(b*)/, RE.epsilon],
 
+      // A contradiction inside a lookahead collapses to the empty set, and a lookahead on the empty set can never match:
+      [/(?=a^)/, RE.empty],
+
       [/(^a|)^b/, RE.seq([RE.singleChar('b'), dotStar])],
       [/^a(b^|c)/, RE.seq([RE.string('ac'), dotStar]) ],
       [/(^|a)b/, prefix(RE.concat(RE.optional(suffix(RE.singleChar('a'))), RE.singleChar('b')))],
@@ -89,7 +92,7 @@ describe('toExtRegex', () => {
       // negative lookahead:
       [/^a(?!b)c$/, RE.concat(RE.string('a'), RE.intersection(RE.complement(RE.string('b')), RE.string('c')))],
       // TODO: lookahead + lookbehind
-      // [/^a(?=b)(?<=a)b$/, RE.string('ab')], 
+      // [/^a(?=b)(?<=a)b$/, RE.string('ab')],
       // [/^b(?=ab)a(?<=ba)b$/, RE.string('bab')],
       // [/^a(?=b)(?<=a)(?!a)(?<!b)b$/, RE.string('ab')],
     ] as const
@@ -104,11 +107,11 @@ describe('toExtRegex', () => {
     it('fixme', { todo: true }, () => {
       const actual = AST.toExtRegex(parseRegExp(/^(a(?!b))*$/))
       const expected = RE.star(RE.string('a'))
-      assert.equal(actual.hash, expected.hash) 
+      assert.equal(actual.hash, expected.hash)
     })
 
   })
-  
+
 })
 
 describe('toString', () => {
diff --git a/test/regex-parser.spec.ts b/test/regex-parser.spec.ts
@@ -1,6 +1,6 @@
 import { describe, it, test } from "node:test"
 import assert from "node:assert"
-import { parseRegExp, parseRegExpString } from "../src/regex-parser"
+import { parseRegExp, parseRegExpString, UnsupportedSyntaxError } from "../src/regex-parser"
 import { RB } from "../src/index"
 import { ParseError } from "../src/parser"
 import * as AST from "../src/ast"
@@ -12,7 +12,7 @@ import * as Arbitrary from './arbitrary-ast'
 function char(c: string) {
   return AST.literal(CharSet.singleton(c))
 }
-function str(s: string) { 
+function str(s: string) {
   const chars = [...s].map(char)
   // Use right-associative concatenation: a(bc) not (ab)c
   return chars.reduceRight((acc, curr) => AST.concat(curr, acc))
@@ -44,7 +44,7 @@ describe('parseRegExp', () => {
     [/a{3,5}/, AST.repeat(char('a'), { min: 3, max: 5 })],
     // if curly bracket is not terminated the whole thing is interpreted literally:
     [/a{3,5/, str('a{3,5')],
-    // same if max value is given but min value is missing: 
+    // same if max value is given but min value is missing:
     [/a{,5}/, str('a{,5}')],
     // char classes / escaping:
     [/\w/, AST.literal(CharSet.wordChars)],
@@ -72,21 +72,21 @@ describe('parseRegExp', () => {
     [/^abc$/, AST.startAnchor(undefined, AST.endAnchor(str('abc'), undefined))],
     [/$a^/, AST.startAnchor(AST.endAnchor(undefined, char('a')), undefined)],
     // positive lookahead - now parsed as lookahead AST nodes, not intersections
-    [/(?=a)b/, AST.lookahead(true, char('a'), char('b'))], 
-    [/(?=a)(?:b)/, AST.lookahead(true, char('a'), char('b'))], 
-    [/(?=a)(?=b)c/, AST.lookahead(true, char('a'), AST.lookahead(true, char('b'), char('c')))], 
-    [/a(?=b)c/, AST.concat(char('a'), AST.lookahead(true, char('b'), char('c')))], 
-    [/a(?=b)/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.epsilon))], 
-    [/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.concat(char('c'), AST.lookahead(true, char('d'), char('e')))))], 
-    [/(?=)/, AST.lookahead(true, AST.epsilon, AST.epsilon)], 
+    [/(?=a)b/, AST.lookahead(true, char('a'), char('b'))],
+    [/(?=a)(?:b)/, AST.lookahead(true, char('a'), char('b'))],
+    [/(?=a)(?=b)c/, AST.lookahead(true, char('a'), AST.lookahead(true, char('b'), char('c')))],
+    [/a(?=b)c/, AST.concat(char('a'), AST.lookahead(true, char('b'), char('c')))],
+    [/a(?=b)/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.epsilon))],
+    [/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.concat(char('c'), AST.lookahead(true, char('d'), char('e')))))],
+    [/(?=)/, AST.lookahead(true, AST.epsilon, AST.epsilon)],
     // negative lookahead
-    [/(?!a)b/, AST.lookahead(false, char('a'), char('b'))], 
+    [/(?!a)b/, AST.lookahead(false, char('a'), char('b'))],
     [/(?!a)b|c/, AST.union(AST.lookahead(false, char('a'), char('b')), char('c'))],
     [/(?!)/, AST.lookahead(false, AST.epsilon, AST.epsilon)],
     // TODO: positive lookbehind
-    // [/(?<=a)/, AST.positiveLookbehind(char('a'))], 
+    // [/(?<=a)/, AST.positiveLookbehind(char('a'))],
     // TODO: negative lookbehind
-    // [/(?<!a)/, AST.negativeLookbehind(char('a'))], 
+    // [/(?<!a)/, AST.negativeLookbehind(char('a'))],
     // some special chars don't need escape when inside brackets:
     [/[.^$*+?()[{-|]/, AST.literal(CharSet.fromArray([...'.^$*+?()[{-|']))],
     // other special chars need escape even inside brackets:
@@ -134,12 +134,24 @@ describe('parseRegExp', () => {
 
 })
 
+function parse_skipKnownIssues(re: RegExp) {
+  try {
+    return RB(re)
+  } catch (error) {
+    if (error instanceof UnsupportedSyntaxError) {
+      fc.pre(false)
+    } else {
+      throw error
+    }
+  }
+}
+
 test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => {
   fc.assert(
     fc.property(
       Arbitrary.regexp(),
       (inputRegExp: RegExp) => {
-        const builder = RB(inputRegExp)
+        const builder = parse_skipKnownIssues(inputRegExp)
         const outputRegExp = builder.toRegExp()
 
         for (const str of builder.enumerate().take(10)) {
@@ -148,19 +160,6 @@ test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => {
         }
       },
     ),
-    // { numRuns: 1000 },
+    { numRuns: 100 },
   )
 })
-
-test('fixme 1', { todo: true }, () => {
-  const inputRegExp = /(^)+a/
-  const builder = RB(inputRegExp)
-  const outputRegExp = builder.toRegExp()
-
-  // console.debug(outputRegExp)
-
-  for (const str of builder.enumerate().take(10)) {
-    assert.match(str, outputRegExp)
-    assert.match(str, inputRegExp)
-  }
-})