fix: various start/end anchor edge cases

gruhn · web-flow · commit 16cf82b02561 · 2025-09-14T13:41:47.000+02:00
diff --git a/src/ast.ts b/src/ast.ts
diff --git a/src/index.ts b/src/index.ts
@@ -1,6 +1,6 @@
-import { RepeatBounds } from './ast'
 import { isEquivalent, toStdRegex } from './dfa'
 import * as RE from './regex'
+import * as AST from './ast'
 import { parseRegExp } from './regex-parser'
 
 export { ParseError } from './parser'
@@ -53,7 +53,7 @@ function fromRegexLike(re: RegexLike): RE.ExtRegex {
   else if (typeof re === 'string')
     return RE.string(re)
   else if (re instanceof RegExp)
-    return RE.fromRegExpAST(parseRegExp(re))
+    return AST.toExtRegex(parseRegExp(re))
   else if (re instanceof RegexBuilder)
     return re.regex
   else
@@ -183,7 +183,7 @@ class RegexBuilder {
    * 
    * @public
    */
-  repeat(bounds: RepeatBounds = { min: 0 }): RegexBuilder {
+  repeat(bounds: AST.RepeatBounds = { min: 0 }): RegexBuilder {
     return new RegexBuilder(
       RE.repeat(this.regex, bounds)
     )
diff --git a/src/regex-parser.ts b/src/regex-parser.ts
@@ -246,8 +246,8 @@ function regex(): P.Parser<AST.RegExpAST> {
       { type: 'postfix', op: P.string('?').map(_ => AST.optional) },
       { type: 'infixRight', op: P.string('').map(_ => AST.concat) },
       { type: 'infixRightOptional', op: lookAheadOp() },
-      { type: 'infixRightOptional', op: P.string('$').map(_ => AST.endMarker) },
-      { type: 'infixRightOptional', op: P.string('^').map(_ => AST.startMarker) },
+      { type: 'infixRightOptional', op: P.string('$').map(_ => AST.endAnchor) },
+      { type: 'infixRightOptional', op: P.string('^').map(_ => AST.startAnchor) },
       { type: 'infixRightOptional', op: P.string('|').map(_ => AST.union) },
     ]
   ))
diff --git a/src/regex.ts b/src/regex.ts
@@ -738,74 +738,6 @@ function toRegExpAST(regex: StdRegex): AST.RegExpAST {
   checkedAllCases(regex)
 }
 
-export function fromRegExpAST(ast: AST.RegExpAST): ExtRegex {
-  ast = AST.addImplicitEndMarker(ast)
-  ast = AST.addImplicitStartMarker(ast)
-  return fromRegExpAST_(ast)
-}
-
-function fromRegExpAST_(ast: AST.RegExpAST): ExtRegex {
-  switch (ast.type) {
-    case 'epsilon':
-      return epsilon
-    case 'literal':
-      return literal(ast.charset)
-    case 'concat':
-      return concat(fromRegExpAST_(ast.left), fromRegExpAST_(ast.right))
-    case 'union':
-      return union(fromRegExpAST_(ast.left), fromRegExpAST_(ast.right))
-    case 'star':
-      return star(fromRegExpAST_(ast.inner))
-    case 'plus': 
-      return repeat(fromRegExpAST_(ast.inner), { min: 1 })   
-    case 'optional':
-      return optional(fromRegExpAST_(ast.inner))
-    case 'repeat':
-      return repeat(fromRegExpAST_(ast.inner), ast.bounds)
-    case 'capture-group':
-      return fromRegExpAST_(ast.inner)
-    case 'start-marker': {
-      const left = fromRegExpAST_(ast.left)
-      const right = fromRegExpAST_(ast.right)
-
-      if (isNullable(left)) 
-        // If the sub-expression left of a start marker matches the empty string, 
-        // then it can ONLY match the empty string. E.g. /a*^b/ becomes /^b/.
-        return right
-      else
-        // If it doesn't match the empty string, then both the left and right
-        // sub-expression can only match the empty set. E.g. /(a^b|c)/ becomes /c/.
-        return empty
-    }
-    case 'end-marker': {
-      const left = fromRegExpAST_(ast.left)
-      const right = fromRegExpAST_(ast.right)
-
-      if (isNullable(right)) 
-        // If the sub-expression right of a end marker matches the empty string, 
-        // then it can ONLY match the empty string. E.g. /a$b*/ becomes /a$/.
-        return left
-      else
-        // If it doesn't match the empty string, then both the left and right
-        // sub-expression can only match the empty set. E.g. /(a|b$c)/ becomes /a/.
-        return empty
-    }
-    case 'positive-lookahead': {
-      const inner = fromRegExpAST_(ast.inner)
-      const right = fromRegExpAST_(ast.right)
-      return intersection(inner, right)
-    }
-    case 'negative-lookahead': {
-      const inner = fromRegExpAST_(ast.inner)
-      const right = fromRegExpAST_(ast.right)
-      return intersection(complement(inner), right)
-    }
-  }
-  checkedAllCases(ast)
-}
-
-
-
 /**
  * Rather ad-hoc way to find chains of same regexes, e.g. `[a-z][a-z][a-z]`,
  * to produce more compact representation when converting to string,
diff --git a/src/utils.ts b/src/utils.ts
@@ -187,6 +187,6 @@ export function sum(items: number[]) {
  * Type guard that checks if an unknown value is one of the elements in the provided array.
  * Returns true if the item is found in the array, with proper TypeScript type narrowing.
  */
-export function isOneOf<T>(item: unknown, array: T[]): item is T {
+export function isOneOf<T>(item: unknown, array: readonly T[]): item is T {
   return (array as unknown[]).includes(item)
 }
diff --git a/test/arbitrary-ast.ts b/test/arbitrary-ast.ts
@@ -71,14 +71,14 @@ function negativeLookahead(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbi
     .map(([inner, right]) => AST.negativeLookahead(inner, right))
 }
 
-function startMarker(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
+function startAnchor(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
   return fc.tuple(childArb(), childArb())
-    .map(([left, right]) => AST.startMarker(left, right))
+    .map(([left, right]) => AST.startAnchor(left, right))
 }
 
-function endMarker(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
+function endAnchor(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
   return fc.tuple(childArb(), childArb())
-    .map(([left, right]) => AST.endMarker(left, right))
+    .map(([left, right]) => AST.endAnchor(left, right))
 }
 
 /**
@@ -132,10 +132,10 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
         return AST.positiveLookahead(traverse(node.inner), traverse(node.right))
       case 'negative-lookahead':
         return AST.negativeLookahead(traverse(node.inner), traverse(node.right))
-      case 'start-marker':
-        return AST.startMarker(traverse(node.left), traverse(node.right))
-      case 'end-marker':
-        return AST.endMarker(traverse(node.left), traverse(node.right))
+      case 'start-anchor':
+        return AST.startAnchor(traverse(node.left), traverse(node.right))
+      case 'end-anchor':
+        return AST.endAnchor(traverse(node.left), traverse(node.right))
       default:
         checkedAllCases(node)
     }
@@ -167,8 +167,8 @@ function regexpAST_(size: number): fc.Arbitrary<AST.RegExpAST> {
       { arbitrary: captureGroup(() => regexpAST_(childSize)), weight: 2 },
       { arbitrary: positiveLookahead(() => regexpAST_(childSize)), weight: 1 },
       { arbitrary: negativeLookahead(() => regexpAST_(childSize)), weight: 1 },
-      { arbitrary: startMarker(() => regexpAST_(childSize)), weight: 1 },
-      { arbitrary: endMarker(() => regexpAST_(childSize)), weight: 1 }
+      { arbitrary: startAnchor(() => regexpAST_(childSize)), weight: 1 },
+      { arbitrary: endAnchor(() => regexpAST_(childSize)), weight: 1 }
     )
   }
 }
diff --git a/test/ast.spec.ts b/test/ast.spec.ts
@@ -0,0 +1,112 @@
+import { describe, it } from "node:test"
+import { strict as assert } from "node:assert"
+import * as RE from "../src/regex"
+import * as AST from "../src/ast"
+import { parseRegExp } from "../src/regex-parser"
+
+describe('toExtRegex', () => {
+
+  const dotStar = RE.star(RE.anySingleChar)
+
+  // function infix(regex: RE.ExtRegex) {
+  //   return RE.seq([ dotStar, regex, dotStar ])
+  // }
+
+  function prefix(regex: RE.ExtRegex) {
+    return RE.concat(regex, dotStar)
+  }
+
+  function suffix(regex: RE.ExtRegex) {
+    return RE.concat(dotStar, regex)
+  }
+
+  describe('union with empty members', () => {
+    const testCases = [
+      [/^(|a)$/, RE.optional(RE.singleChar('a'))],
+      [/^(a||)$/, RE.optional(RE.singleChar('a'), )],
+      [/^(|a|)$/, RE.optional(RE.singleChar('a'))],
+      [/^(|)$/, RE.epsilon],
+    ] as const
+
+    for (const [regexp, expected] of testCases) {
+      it(`${regexp}`, () => {
+        const actual = AST.toExtRegex(parseRegExp(regexp))
+        assert.equal(actual.hash, expected.hash)
+      })
+    }
+  })
+
+  describe('start/end anchor elimination', () => {
+    const testCases = [
+      [/^abc/, RE.seq([RE.string('abc'), dotStar])],
+      // start anchor contradictions can only match the empty set:
+      [/a^b/, RE.empty],
+      [/^a^b/, RE.empty],
+      // but two ^^ directly in a row are not a contradiction:
+      [/(^^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
+      // in fact, as long as anything between two ^ can match epsilon, 
+      // there is no contradiction:
+      [/(^(c|)^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
+      [/(^c*^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
+      // Also, a contradiction inside a union does NOT collapse
+      // the whole expression to the empty set:
+      [/(a^b|c)/, RE.seq([dotStar, RE.singleChar('c'), dotStar])],
+      [/^(a^b|c)/, RE.seq([RE.singleChar('c'), dotStar])],
+
+      // An end anchor before a start anchor is contradictory and describes the empty set:
+      [/$.^/, RE.empty],
+      // Can still match epsilon as long as there's nothing between end- and start anchor:
+      [/$^/, RE.epsilon],
+      // Nullable expressions on the left and right can be ignored:
+      [/(a?)$^(b*)/, RE.epsilon],
+
+      [/(^a|)^b/, RE.seq([RE.singleChar('b'), dotStar])],
+      [/^a(b^|c)/, RE.seq([RE.string('ac'), dotStar]) ],
+      [/(^|a)b/, prefix(RE.concat(RE.optional(suffix(RE.singleChar('a'))), RE.singleChar('b')))],
+
+      // FIXME:
+      // [/(^)+a$/, RE.singleChar('a') ],
+      [/(^)*a$/, suffix(RE.singleChar('a')) ],
+      [/(b|^)a$/, RE.concat(RE.optional(suffix(RE.singleChar('b'))), RE.singleChar('a'))],
+      [/a(^)/, RE.empty],
+    ] as const
+
+    for (const [regexp, expected] of testCases) {
+      it(`${regexp}`, () => {
+        const actual = AST.toExtRegex(parseRegExp(regexp))
+        assert.equal(actual.hash, expected.hash)
+      })
+    }
+  })
+
+  describe('lookahead elimination', () => {
+    const testCases = [
+      // positive lookahead:
+      [/^(?=a)a$/, RE.string('a')],
+      [/^a(?=b)b$/, RE.string('ab')],
+      [/^((?=a)a|(?=b)b)$/, RE.union(RE.string('a'), RE.string('b'))],
+      [/^(?=[0-5])(?=[5-9])[3-7]$/, RE.string('5')],
+      // negative lookahead:
+      [/^a(?!b)c$/, RE.concat(RE.string('a'), RE.intersection(RE.complement(RE.string('b')), RE.string('c')))],
+      // TODO: lookahead + lookbehind
+      // [/^a(?=b)(?<=a)b$/, RE.string('ab')], 
+      // [/^b(?=ab)a(?<=ba)b$/, RE.string('bab')],
+      // [/^a(?=b)(?<=a)(?!a)(?<!b)b$/, RE.string('ab')],
+    ] as const
+
+    for (const [regexp, expected] of testCases) {
+      it(`${regexp}`, () => {
+        const actual = AST.toExtRegex(parseRegExp(regexp))
+        assert.equal(actual.hash, expected.hash, RE.debugShow(actual) + '\n\n' + RE.debugShow(expected))
+      })
+    }
+
+    it('fixme', { todo: true }, () => {
+      const actual = AST.toExtRegex(parseRegExp(/^(a(?!b))*$/))
+      const expected = RE.star(RE.string('a'))
+      assert.equal(actual.hash, expected.hash) 
+    })
+
+  })
+  
+})
diff --git a/test/regex-parser.spec.ts b/test/regex-parser.spec.ts
@@ -66,11 +66,11 @@ describe('parseRegExp', () => {
     [/(?<ABC>abc)/, group(str('abc'), 'ABC')],
     [/(?<___>abc)/, group(str('abc'), '___')],
     // start/end marker
-    [/^abc/, AST.startMarker(undefined, str('abc'))],
-    [/a^b/, AST.startMarker(char('a'), str('b'))],
-    [/^a|^b/, AST.union(AST.startMarker(undefined, str('a')), AST.startMarker(undefined, char('b')))],
-    [/^abc$/, AST.startMarker(undefined, AST.endMarker(str('abc'), undefined))],
-    [/$a^/, AST.startMarker(AST.endMarker(undefined, char('a')), undefined)],
+    [/^abc/, AST.startAnchor(undefined, str('abc'))],
+    [/a^b/, AST.startAnchor(char('a'), str('b'))],
+    [/^a|^b/, AST.union(AST.startAnchor(undefined, str('a')), AST.startAnchor(undefined, char('b')))],
+    [/^abc$/, AST.startAnchor(undefined, AST.endAnchor(str('abc'), undefined))],
+    [/$a^/, AST.startAnchor(AST.endAnchor(undefined, char('a')), undefined)],
     // positive lookahead - now parsed as lookahead AST nodes, not intersections
     [/(?=a)b/, AST.positiveLookahead(char('a'), char('b'))], 
     [/(?=a)(?:b)/, AST.positiveLookahead(char('a'), char('b'))], 
@@ -134,24 +134,33 @@ describe('parseRegExp', () => {
 
 })
 
-test('parse/stringify roundtrip preserves equivalence', { todo: true }, () => {
+test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => {
   fc.assert(
     fc.property(
       Arbitrary.regexp(),
-      (inputRegExp) => {
+      (inputRegExp: RegExp) => {
         const builder = RB(inputRegExp)
         const outputRegExp = builder.toRegExp()
 
-        // console.debug(inputRegExp)
-        // console.debug(outputRegExp)
-
         for (const str of builder.enumerate().take(10)) {
           assert.match(str, outputRegExp)
           assert.match(str, inputRegExp)
         }
-      }
+      },
     ),
-    { seed: -1651123632, path: "89:0", endOnFailure: true }
+    // { numRuns: 1000 },
   )
 })
 
+test('fixme 1', { todo: true }, () => {
+  const inputRegExp = /(^)+a/
+  const builder = RB(inputRegExp)
+  const outputRegExp = builder.toRegExp()
+
+  // console.debug(outputRegExp)
+
+  for (const str of builder.enumerate().take(10)) {
+    assert.match(str, outputRegExp)
+    assert.match(str, inputRegExp)
+  }
+})
diff --git a/test/regex.spec.ts b/test/regex.spec.ts

Original file line number	Diff line number	Diff line change
`@@ -187,6 +187,6 @@ export function sum(items: number[]) {`
`187`	`187`	`* Type guard that checks if an unknown value is one of the elements in the provided array.`
`188`	`188`	`* Returns true if the item is found in the array, with proper TypeScript type narrowing.`
`189`	`189`	`*/`
`190`		`-export function isOneOf<T>(item: unknown, array: T[]): item is T {`
	`190`	`+export function isOneOf<T>(item: unknown, array: readonly T[]): item is T {`
`191`	`191`	`return (array as unknown[]).includes(item)`
`192`	`192`	`}`