Skip to content

Commit 16cf82b

Browse files
authored
fix: various start/end anchor edge cases
1 parent 4fce65e commit 16cf82b

File tree

9 files changed

+600
-233
lines changed

9 files changed

+600
-233
lines changed

src/ast.ts

Lines changed: 447 additions & 48 deletions
Large diffs are not rendered by default.

src/index.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import { RepeatBounds } from './ast'
21
import { isEquivalent, toStdRegex } from './dfa'
32
import * as RE from './regex'
3+
import * as AST from './ast'
44
import { parseRegExp } from './regex-parser'
55

66
export { ParseError } from './parser'
@@ -53,7 +53,7 @@ function fromRegexLike(re: RegexLike): RE.ExtRegex {
5353
else if (typeof re === 'string')
5454
return RE.string(re)
5555
else if (re instanceof RegExp)
56-
return RE.fromRegExpAST(parseRegExp(re))
56+
return AST.toExtRegex(parseRegExp(re))
5757
else if (re instanceof RegexBuilder)
5858
return re.regex
5959
else
@@ -183,7 +183,7 @@ class RegexBuilder {
183183
*
184184
* @public
185185
*/
186-
repeat(bounds: RepeatBounds = { min: 0 }): RegexBuilder {
186+
repeat(bounds: AST.RepeatBounds = { min: 0 }): RegexBuilder {
187187
return new RegexBuilder(
188188
RE.repeat(this.regex, bounds)
189189
)

src/regex-parser.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,8 @@ function regex(): P.Parser<AST.RegExpAST> {
246246
{ type: 'postfix', op: P.string('?').map(_ => AST.optional) },
247247
{ type: 'infixRight', op: P.string('').map(_ => AST.concat) },
248248
{ type: 'infixRightOptional', op: lookAheadOp() },
249-
{ type: 'infixRightOptional', op: P.string('$').map(_ => AST.endMarker) },
250-
{ type: 'infixRightOptional', op: P.string('^').map(_ => AST.startMarker) },
249+
{ type: 'infixRightOptional', op: P.string('$').map(_ => AST.endAnchor) },
250+
{ type: 'infixRightOptional', op: P.string('^').map(_ => AST.startAnchor) },
251251
{ type: 'infixRightOptional', op: P.string('|').map(_ => AST.union) },
252252
]
253253
))

src/regex.ts

Lines changed: 0 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -738,74 +738,6 @@ function toRegExpAST(regex: StdRegex): AST.RegExpAST {
738738
checkedAllCases(regex)
739739
}
740740

741-
export function fromRegExpAST(ast: AST.RegExpAST): ExtRegex {
742-
ast = AST.addImplicitEndMarker(ast)
743-
ast = AST.addImplicitStartMarker(ast)
744-
return fromRegExpAST_(ast)
745-
}
746-
747-
function fromRegExpAST_(ast: AST.RegExpAST): ExtRegex {
748-
switch (ast.type) {
749-
case 'epsilon':
750-
return epsilon
751-
case 'literal':
752-
return literal(ast.charset)
753-
case 'concat':
754-
return concat(fromRegExpAST_(ast.left), fromRegExpAST_(ast.right))
755-
case 'union':
756-
return union(fromRegExpAST_(ast.left), fromRegExpAST_(ast.right))
757-
case 'star':
758-
return star(fromRegExpAST_(ast.inner))
759-
case 'plus':
760-
return repeat(fromRegExpAST_(ast.inner), { min: 1 })
761-
case 'optional':
762-
return optional(fromRegExpAST_(ast.inner))
763-
case 'repeat':
764-
return repeat(fromRegExpAST_(ast.inner), ast.bounds)
765-
case 'capture-group':
766-
return fromRegExpAST_(ast.inner)
767-
case 'start-marker': {
768-
const left = fromRegExpAST_(ast.left)
769-
const right = fromRegExpAST_(ast.right)
770-
771-
if (isNullable(left))
772-
// If the sub-expression left of a start marker matches the empty string,
773-
// then it can ONLY match the empty string. E.g. /a*^b/ becomes /^b/.
774-
return right
775-
else
776-
// If it doesn't match the empty string, then both the left and right
777-
// sub-expression can only match the empty set. E.g. /(a^b|c)/ becomes /c/.
778-
return empty
779-
}
780-
case 'end-marker': {
781-
const left = fromRegExpAST_(ast.left)
782-
const right = fromRegExpAST_(ast.right)
783-
784-
if (isNullable(right))
785-
// If the sub-expression right of an end marker matches the empty string,
786-
// then it can ONLY match the empty string. E.g. /a$b*/ becomes /a$/.
787-
return left
788-
else
789-
// If it doesn't match the empty string, then both the left and right
790-
// sub-expression can only match the empty set. E.g. /(a|b$c)/ becomes /a/.
791-
return empty
792-
}
793-
case 'positive-lookahead': {
794-
const inner = fromRegExpAST_(ast.inner)
795-
const right = fromRegExpAST_(ast.right)
796-
return intersection(inner, right)
797-
}
798-
case 'negative-lookahead': {
799-
const inner = fromRegExpAST_(ast.inner)
800-
const right = fromRegExpAST_(ast.right)
801-
return intersection(complement(inner), right)
802-
}
803-
}
804-
checkedAllCases(ast)
805-
}
806-
807-
808-
809741
/**
810742
* Rather ad-hoc way to find chains of same regexes, e.g. `[a-z][a-z][a-z]`,
811743
* to produce more compact representation when converting to string,

src/utils.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,6 @@ export function sum(items: number[]) {
187187
* Type guard that checks if an unknown value is one of the elements in the provided array.
188188
* Returns true if the item is found in the array, with proper TypeScript type narrowing.
189189
*/
190-
export function isOneOf<T>(item: unknown, array: T[]): item is T {
190+
export function isOneOf<T>(item: unknown, array: readonly T[]): item is T {
191191
return (array as unknown[]).includes(item)
192192
}

test/arbitrary-ast.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,14 @@ function negativeLookahead(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbi
7171
.map(([inner, right]) => AST.negativeLookahead(inner, right))
7272
}
7373

74-
function startMarker(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
74+
function startAnchor(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
7575
return fc.tuple(childArb(), childArb())
76-
.map(([left, right]) => AST.startMarker(left, right))
76+
.map(([left, right]) => AST.startAnchor(left, right))
7777
}
7878

79-
function endMarker(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
79+
function endAnchor(childArb: () => fc.Arbitrary<AST.RegExpAST>): fc.Arbitrary<AST.RegExpAST> {
8080
return fc.tuple(childArb(), childArb())
81-
.map(([left, right]) => AST.endMarker(left, right))
81+
.map(([left, right]) => AST.endAnchor(left, right))
8282
}
8383

8484
/**
@@ -132,10 +132,10 @@ export function makeCaptureGroupNamesUnique(ast: AST.RegExpAST): AST.RegExpAST {
132132
return AST.positiveLookahead(traverse(node.inner), traverse(node.right))
133133
case 'negative-lookahead':
134134
return AST.negativeLookahead(traverse(node.inner), traverse(node.right))
135-
case 'start-marker':
136-
return AST.startMarker(traverse(node.left), traverse(node.right))
137-
case 'end-marker':
138-
return AST.endMarker(traverse(node.left), traverse(node.right))
135+
case 'start-anchor':
136+
return AST.startAnchor(traverse(node.left), traverse(node.right))
137+
case 'end-anchor':
138+
return AST.endAnchor(traverse(node.left), traverse(node.right))
139139
default:
140140
checkedAllCases(node)
141141
}
@@ -167,8 +167,8 @@ function regexpAST_(size: number): fc.Arbitrary<AST.RegExpAST> {
167167
{ arbitrary: captureGroup(() => regexpAST_(childSize)), weight: 2 },
168168
{ arbitrary: positiveLookahead(() => regexpAST_(childSize)), weight: 1 },
169169
{ arbitrary: negativeLookahead(() => regexpAST_(childSize)), weight: 1 },
170-
{ arbitrary: startMarker(() => regexpAST_(childSize)), weight: 1 },
171-
{ arbitrary: endMarker(() => regexpAST_(childSize)), weight: 1 }
170+
{ arbitrary: startAnchor(() => regexpAST_(childSize)), weight: 1 },
171+
{ arbitrary: endAnchor(() => regexpAST_(childSize)), weight: 1 }
172172
)
173173
}
174174
}

test/ast.spec.ts

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import { describe, it } from "node:test"
2+
import { strict as assert } from "node:assert"
3+
import * as RE from "../src/regex"
4+
import * as AST from "../src/ast"
5+
import { parseRegExp } from "../src/regex-parser"
6+
7+
describe('toExtRegex', () => {
8+
9+
const dotStar = RE.star(RE.anySingleChar)
10+
11+
// function infix(regex: RE.ExtRegex) {
12+
// return RE.seq([ dotStar, regex, dotStar ])
13+
// }
14+
15+
function prefix(regex: RE.ExtRegex) {
16+
return RE.concat(regex, dotStar)
17+
}
18+
19+
function suffix(regex: RE.ExtRegex) {
20+
return RE.concat(dotStar, regex)
21+
}
22+
23+
describe('union with empty members', () => {
24+
const testCases = [
25+
[/^(|a)$/, RE.optional(RE.singleChar('a'))],
26+
[/^(a||)$/, RE.optional(RE.singleChar('a'), )],
27+
[/^(|a|)$/, RE.optional(RE.singleChar('a'))],
28+
[/^(|)$/, RE.epsilon],
29+
] as const
30+
31+
for (const [regexp, expected] of testCases) {
32+
it(`${regexp}`, () => {
33+
const actual = AST.toExtRegex(parseRegExp(regexp))
34+
assert.equal(actual.hash, expected.hash)
35+
})
36+
}
37+
})
38+
39+
describe('start/end anchor elimination', () => {
40+
const testCases = [
41+
[/^abc/, RE.seq([RE.string('abc'), dotStar])],
42+
// start anchor contradictions can only match the empty set:
43+
[/a^b/, RE.empty],
44+
[/^a^b/, RE.empty],
45+
// but two ^^ directly in a row are not a contradiction:
46+
[/(^^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
47+
// in fact, as long as anything between two ^ can match epsilon,
48+
// there is no contradiction:
49+
[/(^(c|)^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
50+
[/(^c*^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))],
51+
// Also, a contradiction inside a union does NOT collapse
52+
// the whole expression to the empty set:
53+
[/(a^b|c)/, RE.seq([dotStar, RE.singleChar('c'), dotStar])],
54+
[/^(a^b|c)/, RE.seq([RE.singleChar('c'), dotStar])],
55+
56+
// An end anchor before a start anchor is contradictory and describes the empty set:
57+
[/$.^/, RE.empty],
58+
// Can still match epsilon as long as there's nothing between end- and start anchor:
59+
[/$^/, RE.epsilon],
60+
// Nullable expressions on the left and right can be ignored:
61+
[/(a?)$^(b*)/, RE.epsilon],
62+
63+
[/(^a|)^b/, RE.seq([RE.singleChar('b'), dotStar])],
64+
[/^a(b^|c)/, RE.seq([RE.string('ac'), dotStar]) ],
65+
[/(^|a)b/, prefix(RE.concat(RE.optional(suffix(RE.singleChar('a'))), RE.singleChar('b')))],
66+
67+
// FIXME:
68+
// [/(^)+a$/, RE.singleChar('a') ],
69+
[/(^)*a$/, suffix(RE.singleChar('a')) ],
70+
[/(b|^)a$/, RE.concat(RE.optional(suffix(RE.singleChar('b'))), RE.singleChar('a'))],
71+
[/a(^)/, RE.empty],
72+
] as const
73+
74+
for (const [regexp, expected] of testCases) {
75+
it(`${regexp}`, () => {
76+
const actual = AST.toExtRegex(parseRegExp(regexp))
77+
assert.equal(actual.hash, expected.hash)
78+
})
79+
}
80+
})
81+
82+
describe('lookahead elimination', () => {
83+
const testCases = [
84+
// positive lookahead:
85+
[/^(?=a)a$/, RE.string('a')],
86+
[/^a(?=b)b$/, RE.string('ab')],
87+
[/^((?=a)a|(?=b)b)$/, RE.union(RE.string('a'), RE.string('b'))],
88+
[/^(?=[0-5])(?=[5-9])[3-7]$/, RE.string('5')],
89+
// negative lookahead:
90+
[/^a(?!b)c$/, RE.concat(RE.string('a'), RE.intersection(RE.complement(RE.string('b')), RE.string('c')))],
91+
// TODO: lookahead + lookbehind
92+
// [/^a(?=b)(?<=a)b$/, RE.string('ab')],
93+
// [/^b(?=ab)a(?<=ba)b$/, RE.string('bab')],
94+
// [/^a(?=b)(?<=a)(?!a)(?<!b)b$/, RE.string('ab')],
95+
] as const
96+
97+
for (const [regexp, expected] of testCases) {
98+
it(`${regexp}`, () => {
99+
const actual = AST.toExtRegex(parseRegExp(regexp))
100+
assert.equal(actual.hash, expected.hash, RE.debugShow(actual) + '\n\n' + RE.debugShow(expected))
101+
})
102+
}
103+
104+
it('fixme', { todo: true }, () => {
105+
const actual = AST.toExtRegex(parseRegExp(/^(a(?!b))*$/))
106+
const expected = RE.star(RE.string('a'))
107+
assert.equal(actual.hash, expected.hash)
108+
})
109+
110+
})
111+
112+
})

test/regex-parser.spec.ts

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,11 @@ describe('parseRegExp', () => {
6666
[/(?<ABC>abc)/, group(str('abc'), 'ABC')],
6767
[/(?<___>abc)/, group(str('abc'), '___')],
6868
// start/end anchor
69-
[/^abc/, AST.startMarker(undefined, str('abc'))],
70-
[/a^b/, AST.startMarker(char('a'), str('b'))],
71-
[/^a|^b/, AST.union(AST.startMarker(undefined, str('a')), AST.startMarker(undefined, char('b')))],
72-
[/^abc$/, AST.startMarker(undefined, AST.endMarker(str('abc'), undefined))],
73-
[/$a^/, AST.startMarker(AST.endMarker(undefined, char('a')), undefined)],
69+
[/^abc/, AST.startAnchor(undefined, str('abc'))],
70+
[/a^b/, AST.startAnchor(char('a'), str('b'))],
71+
[/^a|^b/, AST.union(AST.startAnchor(undefined, str('a')), AST.startAnchor(undefined, char('b')))],
72+
[/^abc$/, AST.startAnchor(undefined, AST.endAnchor(str('abc'), undefined))],
73+
[/$a^/, AST.startAnchor(AST.endAnchor(undefined, char('a')), undefined)],
7474
// positive lookahead - now parsed as lookahead AST nodes, not intersections
7575
[/(?=a)b/, AST.positiveLookahead(char('a'), char('b'))],
7676
[/(?=a)(?:b)/, AST.positiveLookahead(char('a'), char('b'))],
@@ -134,24 +134,33 @@ describe('parseRegExp', () => {
134134

135135
})
136136

137-
test('parse/stringify roundtrip preserves equivalence', { todo: true }, () => {
137+
test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => {
138138
fc.assert(
139139
fc.property(
140140
Arbitrary.regexp(),
141-
(inputRegExp) => {
141+
(inputRegExp: RegExp) => {
142142
const builder = RB(inputRegExp)
143143
const outputRegExp = builder.toRegExp()
144144

145-
// console.debug(inputRegExp)
146-
// console.debug(outputRegExp)
147-
148145
for (const str of builder.enumerate().take(10)) {
149146
assert.match(str, outputRegExp)
150147
assert.match(str, inputRegExp)
151148
}
152-
}
149+
},
153150
),
154-
{ seed: -1651123632, path: "89:0", endOnFailure: true }
151+
// { numRuns: 1000 },
155152
)
156153
})
157154

155+
test('fixme 1', { todo: true }, () => {
156+
const inputRegExp = /(^)+a/
157+
const builder = RB(inputRegExp)
158+
const outputRegExp = builder.toRegExp()
159+
160+
// console.debug(outputRegExp)
161+
162+
for (const str of builder.enumerate().take(10)) {
163+
assert.match(str, outputRegExp)
164+
assert.match(str, inputRegExp)
165+
}
166+
})

0 commit comments

Comments
 (0)