feat(regex-parser): negative/positive lookAhead

gruhn · gruhn · commit 004226e955f0 · 2025-06-23T01:07:49.000+02:00
diff --git a/README.md b/README.md
@@ -98,24 +98,17 @@ RE.size(
 
 ## Limitations
 
-* Syntax support
-  - The library implements a custom parser for regular expressions,
-    so only a subset of the syntax is supported:
-    - quantifiers: `*`, `+`, `?`, `{3,5}`, ...
-    - alternation: `|`
-    - character classes: `.`, `\w`, `[a-z]`, ...
-    - optional start/end markers: `^` / `$` but only at the start/end
-      (technically they are allowed anywhere in the expression)
-    - escaped meta characters: `\$`, `\.`, ...
-    - capturing groups: `(...)`
-  - regex flags are not supported at all
-* performance of `intersection` and `complement`
-  - These function have worst case exponential complexity.
-    But often the worst case is not realized.
-    - Nested quantifiers are especially dangerous, e.g. `(a*|b)*`.
-  - A bigger problem is: even if computation is fast,
-    the output regex can be extremely large to the point that
-    the `new RegExp(...)` constructor crashes.
+The library implements a custom parser for regular expressions,
+so only a subset of the syntax is supported:
+ - quantifiers: `*`, `+`, `?`, `{3,5}`, ...
+ - alternation: `|`
+ - character classes: `.`, `\w`, `[a-z]`, ...
+ - optional start/end markers: `^` / `$`, but only at the start/end of the expression
+   (in JavaScript regex syntax they are allowed anywhere in the expression)
+ - escaped meta characters: `\$`, `\.`, ...
+ - (non-)capturing groups: `(...)`, `(?:...)`
+ - positive/negative lookahead: `(?=...)`, `(?!...)`
+Regex flags are not supported at all.
 
 ## References
 
diff --git a/src/index.ts b/src/index.ts
@@ -124,7 +124,12 @@ export function complement(re: RegExp): RegExp {
  * ```
  */
 export function* enumerate(re: RegExp): Generator<string> {
-  yield* RE.enumerate(RE.parse(re))
+  const regex = RE.parse(re)
+  if (RE.isStdRegex(regex)) { // already a standard regex: enumerate it directly
+    yield* RE.enumerate(regex)
+  } else { // otherwise convert with RE.toStdRegex before enumerating
+    yield* RE.enumerate(RE.toStdRegex(regex))
+  }
 }
 
 /**
@@ -151,7 +156,12 @@ export function* enumerate(re: RegExp): Generator<string> {
  * > The value should always be an upper bound though.
  */
 export function size(re: RegExp): bigint | undefined {
-  return RE.size(RE.parse(re))
+  const regex = RE.parse(re)
+  if (RE.isStdRegex(regex)) { // already a standard regex: measure it directly
+    return RE.size(regex)
+  } else { // otherwise convert with RE.toStdRegex before measuring
+    return RE.size(RE.toStdRegex(regex))
+  }
 }
 
 /**
@@ -160,5 +170,10 @@ export function size(re: RegExp): bigint | undefined {
  * TODO: examples.
  */
 export function derivative(prefix: string, re: RegExp): RegExp {
-  return RE.toRegExp(RE.derivative(prefix, RE.parse(re)))
+  const regex = RE.derivative(prefix, RE.parse(re))
+  if (RE.isStdRegex(regex)) { // derivative is already a standard regex: render it directly
+    return RE.toRegExp(regex)
+  } else { // otherwise convert with RE.toStdRegex before rendering
+    return RE.toRegExp(RE.toStdRegex(regex))
+  }
 }
diff --git a/src/low-level-api.ts b/src/low-level-api.ts
@@ -24,6 +24,7 @@ export {
   type StdRegex,
   type ExtRegex,
   type RepeatBounds,
+  isStdRegex,
   // constructors:
   and,
   or,
diff --git a/src/regex-parser.ts b/src/regex-parser.ts
@@ -107,7 +107,7 @@ const group = P.between(
   regex(),
 )
 
-const boundedQuantifier: P.Parser<(inner: RE.StdRegex) => RE.StdRegex> = P.between(
+const boundedQuantifier: P.Expr.UnaryOperator<RE.ExtRegex> = P.between(
   P.string('{'),
   P.string('}'),
   P.optional(P.decimal).andThen(min => {
@@ -142,33 +142,51 @@ function regexTerm() {
     charSet.map(RE.literal),
   ])
 }
+
+function lookAhead(): P.Expr.UnaryOperator<RE.ExtRegex> { // prefix operator for `(?=...)` / `(?!...)`
+  return P.between(
+    P.string('(?'),
+    P.string(')'),
+    P.choice([
+      // positive lookahead `(?=...)`: use the parsed body as is
+      P.string('=').andThen(_ => regexWithBounds()),
+      // negative lookahead `(?!...)`: use the complement of the parsed body
+      P.string('!').andThen(_ => regexWithBounds().map(RE.complement)),
+    ]).map(
+      left => right => RE.intersection(left, right) // left: lookahead body, right: operand of the prefix operator
+    )
+  )
+}
  
-function regex(): P.Parser<RE.StdRegex> {
-  return P.lazy(() => P.Expr.makeExprParser<RE.StdRegex>(
+function regex(): P.Parser<RE.ExtRegex> { // ExtRegex: lookaheads build intersection/complement nodes
+  return P.lazy(() => P.Expr.makeExprParser<RE.ExtRegex>(
     regexTerm(),
     [
       { type: 'postfix', op: P.string('*').map(_ => RE.star) },
       { type: 'postfix', op: boundedQuantifier },
       { type: 'postfix', op: P.string('+').map(_ => RE.plus) },
       { type: 'postfix', op: P.string('?').map(_ => RE.optional) },
       { type: 'infixRight', op: P.string('').map(_ => RE.concat) },
+      { type: 'prefix', op: lookAhead() }, // per the parser spec, `(?!^a$)b|c` parses as `((?!^a$)b)|c`
       { type: 'infixRight', op: P.string('|').map(_ => RE.union) },
     ]
   ))
 }
 
 // TODO: start- and end marker are not necessarily at the 
 // beginning/end of the regex:
-const regexWithBounds = P.sequence([
-  startMarker,
-  regex(),
-  endMarker,
-]).map<RE.StdRegex>(RE.seq)
+function regexWithBounds() { // used by parseRegexString and by lookAhead() for lookahead bodies
+  return P.sequence([
+    startMarker,
+    regex(),
+    endMarker,
+  ]).map<RE.ExtRegex>(RE.seq) // presumably concatenates [start, body, end] into one regex — confirm against RE.seq
+}
 
 export function parseRegexString(
   regexStr: string,
-): RE.StdRegex {
-  const { value, restInput } = regexWithBounds.run(regexStr)
+): RE.ExtRegex {
+  const { value, restInput } = regexWithBounds().run(regexStr)
   if (restInput === '') {
     // TODO: parsing should always return stdandard regex instances:
     return value
@@ -182,7 +200,7 @@ export function parseRegexString(
  * 
  * @public
  */
-export function parseRegExp(regexp: RegExp): RE.StdRegex {
+export function parseRegExp(regexp: RegExp): RE.ExtRegex {
   for (const flag of regExpFlags) {
     assert(!regexp[flag], `[regex-utils] RegExp flags not supported`)
   }
diff --git a/src/regex.ts b/src/regex.ts
@@ -6,7 +6,7 @@ import * as Table from './table';
 /**
  * TODO
  */
-type StdRegexWithoutHash = (
+type StdRegexWithoutMetaInfo = (
   | { type: "epsilon" }
   | { type: "literal", charset: CharSet.CharSet }
   | { type: "concat", left: StdRegex, right: StdRegex }
@@ -17,7 +17,7 @@ type StdRegexWithoutHash = (
 /**
  * TODO
  */
-type ExtRegexWithoutHash = (
+type ExtRegexWithoutMetaInfo = (
   | { type: "epsilon" }
   | { type: "literal", charset: CharSet.CharSet }
   | { type: "concat", left: ExtRegex, right: ExtRegex }
@@ -31,40 +31,81 @@ type ExtRegexWithoutHash = (
 /**
  * TODO: docs
  */
-export type StdRegex = StdRegexWithoutHash & { hash: number }
+export type StdRegex = StdRegexWithoutMetaInfo & { hash: number, isStdRegex: true }
 
 /**
  * TODO: docs
  */
-export type ExtRegex = ExtRegexWithoutHash & { hash: number }
+export type ExtRegex = ExtRegexWithoutMetaInfo & { hash: number, isStdRegex: boolean }
 
-export function withHash(regex: StdRegexWithoutHash): StdRegex
-export function withHash(regex: ExtRegexWithoutHash): ExtRegex 
-export function withHash(regex: ExtRegexWithoutHash): ExtRegex {
+export function withMetaInfo(regex: StdRegexWithoutMetaInfo): StdRegex
+export function withMetaInfo(regex: ExtRegexWithoutMetaInfo): ExtRegex 
+export function withMetaInfo(regex: ExtRegexWithoutMetaInfo): ExtRegex { // adds `hash` and `isStdRegex` to a bare regex node
   if (regex.type === 'epsilon')
-    return { ...regex, hash: hashStr(regex.type) }
+    return {
+      ...regex,
+      hash: hashStr(regex.type),
+      isStdRegex: true, // leaf node: always flagged standard
+    }
   else if (regex.type === 'literal')
-    return { ...regex, hash: hashNums([hashStr(regex.type), regex.charset.hash]) }
-  else if (regex.type === 'concat' || regex.type === 'union' || regex.type === 'intersection')
-    return { ...regex, hash: hashNums([
-      hashStr(regex.type),
-      // Need non-commutative hash operator for `concat`, otherwise "ac" and "ca" are the same:
-      regex.left.hash,
-      regex.right.hash,
-    ])}
-  else if (regex.type === 'star' || regex.type === 'complement')
-    return { ...regex, hash: hashNums([hashStr(regex.type), regex.inner.hash]) }
+    return {
+      ...regex,
+      hash: hashNums([hashStr(regex.type), regex.charset.hash]),
+      isStdRegex: true, // leaf node: always flagged standard
+    }
+  else if (regex.type === 'concat' || regex.type === 'union')
+    return {
+      ...regex,
+      hash: hashNums([
+        hashStr(regex.type),
+        // Need non-commutative hash operator for `concat`, otherwise "ac" and "ca" are the same:
+        regex.left.hash,
+        regex.right.hash,
+      ]),
+      isStdRegex: regex.left.isStdRegex && regex.right.isStdRegex, // standard only if both operands are
+    }
+  else if (regex.type === 'intersection')
+    return {
+      ...regex,
+      hash: hashNums([
+        hashStr(regex.type),
+        regex.left.hash,
+        regex.right.hash,
+      ]),
+      isStdRegex: false, // intersection nodes are never flagged standard, whatever their operands
+    }
+  else if (regex.type === 'star')
+    return {
+      ...regex,
+      hash: hashNums([hashStr(regex.type), regex.inner.hash]),
+      isStdRegex: regex.inner.isStdRegex, // inherits the flag from its operand
+    }
+  else if (regex.type === 'complement')
+    return {
+      ...regex,
+      hash: hashNums([hashStr(regex.type), regex.inner.hash]),
+      isStdRegex: false // complement nodes are never flagged standard, whatever their operand
+    }
   checkedAllCases(regex)  
 }
 
+/**
+ * Type guard that narrows an `ExtRegex` to `StdRegex` by reading its `isStdRegex` flag.
+ *
+ * @public
+ */
+export function isStdRegex(regex: ExtRegex): regex is StdRegex {
+  return regex.isStdRegex
+}
+
 //////////////////////////////////////////////
 ///// primitive composite constructors ///////
 //////////////////////////////////////////////
 
-export const epsilon: StdRegex = withHash({ type: 'epsilon'  })
+export const epsilon: StdRegex = withMetaInfo({ type: 'epsilon'  })
 
 export function literal(charset: CharSet.CharSet): StdRegex {
-  return withHash({ type: 'literal', charset })
+  return withMetaInfo({ type: 'literal', charset })
 }
 
 export const empty: StdRegex = literal(CharSet.empty)
@@ -113,7 +154,7 @@ export function concat(left: ExtRegex, right: ExtRegex): ExtRegex {
       return concat(left, right.right)
   }
 
-  return withHash({ type: 'concat', left, right })
+  return withMetaInfo({ type: 'concat', left, right })
 }
 
 function extractFront(regex: StdRegex): [StdRegex, StdRegex]
@@ -212,7 +253,7 @@ export function union(left: ExtRegex, right: ExtRegex): ExtRegex {
     // r       + (s · r) = (s + ε) · r
     return concat(union(leftInit, rightInit), leftLast)
 
-  return withHash({ type: 'union', left, right })
+  return withMetaInfo({ type: 'union', left, right })
 }
 
 export function star(inner: StdRegex): StdRegex
@@ -231,7 +272,7 @@ export function star(inner: ExtRegex): ExtRegex {
     // (r∗ · s∗)∗ = (r + s)∗
     return star(union(inner.left.inner, inner.right.inner))
   else
-    return withHash({ type: "star", inner })
+    return withMetaInfo({ type: "star", inner })
 }
 
 export function intersection(left: ExtRegex, right: ExtRegex): ExtRegex {
@@ -257,7 +298,7 @@ export function intersection(left: ExtRegex, right: ExtRegex): ExtRegex {
     // R & S ≈ R∩S
     return literal(CharSet.intersection(left.charset, right.charset))
 
-  return withHash({ type: "intersection", left, right })
+  return withMetaInfo({ type: "intersection", left, right })
 }
 
 /**
@@ -274,7 +315,7 @@ export function complement(inner: ExtRegex): ExtRegex {
   //   // ¬S ≈ (Σ\S
   //   return literal(CharSet.complement(inner.charset))
   else
-    return withHash({ type: "complement", inner })
+    return withMetaInfo({ type: "complement", inner })
 }
 
 //////////////////////////////////////////////
@@ -733,7 +774,7 @@ export function toString(regex: ExtRegex): string {
   // Render parenthesis as non-capturing groups if there is a large number of them,
   // i.e. `/(?:abc)` instead of `/(abc)/`. `new RegExp(...)` throws an error if there
   // is a large number of capturing groups. Non-capturing groups are a bit more verbose
-  // but at large sizes like this it doesn't matter anyway:
+  // but at sizes like this the extra verbosity hardly hurts readability:
   const useNonCapturingGroups = size > 10_000
 
   return '^(' + astToString(toRegExpAST(regex), { useNonCapturingGroups }) + ')$'
diff --git a/test/regex-parser.spec.ts b/test/regex-parser.spec.ts
@@ -32,6 +32,9 @@ describe('parseRegexString', () => {
     [/^[a-z]$/, RE.literal(CharSet.charRange('a', 'z'))],
     [/^[^abc]$/, RE.literal(CharSet.complement(CharSet.fromArray(['a', 'b', 'c'])))],
     [/^(?:ab)$/, RE.string('ab')], // non-capturing groups
+    [/^(?=^a$)a$/, RE.intersection(RE.string('a'), RE.string('a'))], // positive lookahead
+    [/^(?!^a$)b$/, RE.intersection(RE.complement(RE.string('a')), RE.string('b'))], // negative lookahead
+    [/^(?!^a$)b|c$/, RE.union(RE.intersection(RE.complement(RE.string('a')), RE.string('b')), RE.string('c'))],
   ])('can parse %s', (regexp, expected) => {
     expect(parseRegExp(regexp)).toEqual(expected)
   })