Skip to content

Commit 004226e

Browse files
committed
feat(regex-parser): negative/positive lookAhead
1 parent 8fab4eb commit 004226e

File tree

6 files changed

+129
-58
lines changed

6 files changed

+129
-58
lines changed

README.md

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -98,24 +98,17 @@ RE.size(
9898

9999
## Limitations
100100

101-
* Syntax support
102-
- The library implements a custom parser for regular expressions,
103-
so only a subset of the syntax is supported:
104-
- quantifiers: `*`, `+`, `?`, `{3,5}`, ...
105-
- alternation: `|`
106-
- character classes: `.`, `\w`, `[a-z]`, ...
107-
- optional start/end markers: `^` / `$` but only at the start/end
108-
(technically they are allowed anywhere in the expression)
109-
- escaped meta characters: `\$`, `\.`, ...
110-
- capturing groups: `(...)`
111-
- regex flags are not supported at all
112-
* performance of `intersection` and `complement`
113-
- These function have worst case exponential complexity.
114-
But often the worst case is not realized.
115-
- Nested quantifiers are especially dangerous, e.g. `(a*|b)*`.
116-
- A bigger problem is: even if computation is fast,
117-
the output regex can be extremely large to the point that
118-
the `new RegExp(...)` constructor crashes.
101+
The library implements a custom parser for regular expressions,
102+
so only a subset of the syntax is supported:
103+
- quantifiers: `*`, `+`, `?`, `{3,5}`, ...
104+
- alternation: `|`
105+
- character classes: `.`, `\w`, `[a-z]`, ...
106+
- optional start/end markers: `^` / `$` but only at the start/end
107+
(technically they are allowed anywhere in the expression)
108+
- escaped meta characters: `\$`, `\.`, ...
109+
- (non-)capturing groups: `(...)`, `(?:...)`
110+
- positive/negative lookahead: `(?!...)`, `(?=...)`
111+
Regex flags are not supported at all.
119112

120113
## References
121114

src/index.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,12 @@ export function complement(re: RegExp): RegExp {
124124
* ```
125125
*/
126126
export function* enumerate(re: RegExp): Generator<string> {
127-
yield* RE.enumerate(RE.parse(re))
127+
const regex = RE.parse(re)
128+
if (RE.isStdRegex(regex)) {
129+
yield* RE.enumerate(regex)
130+
} else {
131+
yield* RE.enumerate(RE.toStdRegex(regex))
132+
}
128133
}
129134

130135
/**
@@ -151,7 +156,12 @@ export function* enumerate(re: RegExp): Generator<string> {
151156
* > The value should always be an upper bound though.
152157
*/
153158
export function size(re: RegExp): bigint | undefined {
154-
return RE.size(RE.parse(re))
159+
const regex = RE.parse(re)
160+
if (RE.isStdRegex(regex)) {
161+
return RE.size(regex)
162+
} else {
163+
return RE.size(RE.toStdRegex(regex))
164+
}
155165
}
156166

157167
/**
@@ -160,5 +170,10 @@ export function size(re: RegExp): bigint | undefined {
160170
* TODO: examples.
161171
*/
162172
export function derivative(prefix: string, re: RegExp): RegExp {
163-
return RE.toRegExp(RE.derivative(prefix, RE.parse(re)))
173+
const regex = RE.derivative(prefix, RE.parse(re))
174+
if (RE.isStdRegex(regex)) {
175+
return RE.toRegExp(regex)
176+
} else {
177+
return RE.toRegExp(RE.toStdRegex(regex))
178+
}
164179
}

src/low-level-api.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export {
2424
type StdRegex,
2525
type ExtRegex,
2626
type RepeatBounds,
27+
isStdRegex,
2728
// constructors:
2829
and,
2930
or,

src/regex-parser.ts

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ const group = P.between(
107107
regex(),
108108
)
109109

110-
const boundedQuantifier: P.Parser<(inner: RE.StdRegex) => RE.StdRegex> = P.between(
110+
const boundedQuantifier: P.Expr.UnaryOperator<RE.ExtRegex> = P.between(
111111
P.string('{'),
112112
P.string('}'),
113113
P.optional(P.decimal).andThen(min => {
@@ -142,33 +142,51 @@ function regexTerm() {
142142
charSet.map(RE.literal),
143143
])
144144
}
145+
146+
function lookAhead(): P.Expr.UnaryOperator<RE.ExtRegex> {
147+
return P.between(
148+
P.string('(?'),
149+
P.string(')'),
150+
P.choice([
151+
// positive lookahead
152+
P.string('=').andThen(_ => regexWithBounds()),
153+
// negative lookahead
154+
P.string('!').andThen(_ => regexWithBounds().map(RE.complement)),
155+
]).map(
156+
left => right => RE.intersection(left, right)
157+
)
158+
)
159+
}
145160

146-
function regex(): P.Parser<RE.StdRegex> {
147-
return P.lazy(() => P.Expr.makeExprParser<RE.StdRegex>(
161+
function regex(): P.Parser<RE.ExtRegex> {
162+
return P.lazy(() => P.Expr.makeExprParser<RE.ExtRegex>(
148163
regexTerm(),
149164
[
150165
{ type: 'postfix', op: P.string('*').map(_ => RE.star) },
151166
{ type: 'postfix', op: boundedQuantifier },
152167
{ type: 'postfix', op: P.string('+').map(_ => RE.plus) },
153168
{ type: 'postfix', op: P.string('?').map(_ => RE.optional) },
154169
{ type: 'infixRight', op: P.string('').map(_ => RE.concat) },
170+
{ type: 'prefix', op: lookAhead() },
155171
{ type: 'infixRight', op: P.string('|').map(_ => RE.union) },
156172
]
157173
))
158174
}
159175

160176
// TODO: start and end markers are not necessarily at the
161177
// beginning/end of the regex:
162-
const regexWithBounds = P.sequence([
163-
startMarker,
164-
regex(),
165-
endMarker,
166-
]).map<RE.StdRegex>(RE.seq)
178+
function regexWithBounds() {
179+
return P.sequence([
180+
startMarker,
181+
regex(),
182+
endMarker,
183+
]).map<RE.ExtRegex>(RE.seq)
184+
}
167185

168186
export function parseRegexString(
169187
regexStr: string,
170-
): RE.StdRegex {
171-
const { value, restInput } = regexWithBounds.run(regexStr)
188+
): RE.ExtRegex {
189+
const { value, restInput } = regexWithBounds().run(regexStr)
172190
if (restInput === '') {
173191
// TODO: parsing should always return standard regex instances:
174192
return value
@@ -182,7 +200,7 @@ export function parseRegexString(
182200
*
183201
* @public
184202
*/
185-
export function parseRegExp(regexp: RegExp): RE.StdRegex {
203+
export function parseRegExp(regexp: RegExp): RE.ExtRegex {
186204
for (const flag of regExpFlags) {
187205
assert(!regexp[flag], `[regex-utils] RegExp flags not supported`)
188206
}

src/regex.ts

Lines changed: 67 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import * as Table from './table';
66
/**
77
* TODO
88
*/
9-
type StdRegexWithoutHash = (
9+
type StdRegexWithoutMetaInfo = (
1010
| { type: "epsilon" }
1111
| { type: "literal", charset: CharSet.CharSet }
1212
| { type: "concat", left: StdRegex, right: StdRegex }
@@ -17,7 +17,7 @@ type StdRegexWithoutHash = (
1717
/**
1818
* TODO
1919
*/
20-
type ExtRegexWithoutHash = (
20+
type ExtRegexWithoutMetaInfo = (
2121
| { type: "epsilon" }
2222
| { type: "literal", charset: CharSet.CharSet }
2323
| { type: "concat", left: ExtRegex, right: ExtRegex }
@@ -31,40 +31,81 @@ type ExtRegexWithoutHash = (
3131
/**
3232
* TODO: docs
3333
*/
34-
export type StdRegex = StdRegexWithoutHash & { hash: number }
34+
export type StdRegex = StdRegexWithoutMetaInfo & { hash: number, isStdRegex: true }
3535

3636
/**
3737
* TODO: docs
3838
*/
39-
export type ExtRegex = ExtRegexWithoutHash & { hash: number }
39+
export type ExtRegex = ExtRegexWithoutMetaInfo & { hash: number, isStdRegex: boolean }
4040

41-
export function withHash(regex: StdRegexWithoutHash): StdRegex
42-
export function withHash(regex: ExtRegexWithoutHash): ExtRegex
43-
export function withHash(regex: ExtRegexWithoutHash): ExtRegex {
41+
export function withMetaInfo(regex: StdRegexWithoutMetaInfo): StdRegex
42+
export function withMetaInfo(regex: ExtRegexWithoutMetaInfo): ExtRegex
43+
export function withMetaInfo(regex: ExtRegexWithoutMetaInfo): ExtRegex {
4444
if (regex.type === 'epsilon')
45-
return { ...regex, hash: hashStr(regex.type) }
45+
return {
46+
...regex,
47+
hash: hashStr(regex.type),
48+
isStdRegex: true,
49+
}
4650
else if (regex.type === 'literal')
47-
return { ...regex, hash: hashNums([hashStr(regex.type), regex.charset.hash]) }
48-
else if (regex.type === 'concat' || regex.type === 'union' || regex.type === 'intersection')
49-
return { ...regex, hash: hashNums([
50-
hashStr(regex.type),
51-
// Need non-commutative hash operator for `concat`, otherwise "ac" and "ca" are the same:
52-
regex.left.hash,
53-
regex.right.hash,
54-
])}
55-
else if (regex.type === 'star' || regex.type === 'complement')
56-
return { ...regex, hash: hashNums([hashStr(regex.type), regex.inner.hash]) }
51+
return {
52+
...regex,
53+
hash: hashNums([hashStr(regex.type), regex.charset.hash]),
54+
isStdRegex: true,
55+
}
56+
else if (regex.type === 'concat' || regex.type === 'union')
57+
return {
58+
...regex,
59+
hash: hashNums([
60+
hashStr(regex.type),
61+
// Need non-commutative hash operator for `concat`, otherwise "ac" and "ca" are the same:
62+
regex.left.hash,
63+
regex.right.hash,
64+
]),
65+
isStdRegex: regex.left.isStdRegex && regex.right.isStdRegex,
66+
}
67+
else if (regex.type === 'intersection')
68+
return {
69+
...regex,
70+
hash: hashNums([
71+
hashStr(regex.type),
72+
regex.left.hash,
73+
regex.right.hash,
74+
]),
75+
isStdRegex: false,
76+
}
77+
else if (regex.type === 'star')
78+
return {
79+
...regex,
80+
hash: hashNums([hashStr(regex.type), regex.inner.hash]),
81+
isStdRegex: regex.inner.isStdRegex,
82+
}
83+
else if (regex.type === 'complement')
84+
return {
85+
...regex,
86+
hash: hashNums([hashStr(regex.type), regex.inner.hash]),
87+
isStdRegex: false
88+
}
5789
checkedAllCases(regex)
5890
}
5991

92+
/**
93+
* TODO
94+
*
95+
* @public
96+
*/
97+
export function isStdRegex(regex: ExtRegex): regex is StdRegex {
98+
return regex.isStdRegex
99+
}
100+
60101
//////////////////////////////////////////////
61102
///// primitive composite constructors ///////
62103
//////////////////////////////////////////////
63104

64-
export const epsilon: StdRegex = withHash({ type: 'epsilon' })
105+
export const epsilon: StdRegex = withMetaInfo({ type: 'epsilon' })
65106

66107
export function literal(charset: CharSet.CharSet): StdRegex {
67-
return withHash({ type: 'literal', charset })
108+
return withMetaInfo({ type: 'literal', charset })
68109
}
69110

70111
export const empty: StdRegex = literal(CharSet.empty)
@@ -113,7 +154,7 @@ export function concat(left: ExtRegex, right: ExtRegex): ExtRegex {
113154
return concat(left, right.right)
114155
}
115156

116-
return withHash({ type: 'concat', left, right })
157+
return withMetaInfo({ type: 'concat', left, right })
117158
}
118159

119160
function extractFront(regex: StdRegex): [StdRegex, StdRegex]
@@ -212,7 +253,7 @@ export function union(left: ExtRegex, right: ExtRegex): ExtRegex {
212253
// r + (s · r) = (s + ε) · r
213254
return concat(union(leftInit, rightInit), leftLast)
214255

215-
return withHash({ type: 'union', left, right })
256+
return withMetaInfo({ type: 'union', left, right })
216257
}
217258

218259
export function star(inner: StdRegex): StdRegex
@@ -231,7 +272,7 @@ export function star(inner: ExtRegex): ExtRegex {
231272
// (r∗ · s∗)∗ = (r + s)∗
232273
return star(union(inner.left.inner, inner.right.inner))
233274
else
234-
return withHash({ type: "star", inner })
275+
return withMetaInfo({ type: "star", inner })
235276
}
236277

237278
export function intersection(left: ExtRegex, right: ExtRegex): ExtRegex {
@@ -257,7 +298,7 @@ export function intersection(left: ExtRegex, right: ExtRegex): ExtRegex {
257298
// R & S ≈ R∩S
258299
return literal(CharSet.intersection(left.charset, right.charset))
259300

260-
return withHash({ type: "intersection", left, right })
301+
return withMetaInfo({ type: "intersection", left, right })
261302
}
262303

263304
/**
@@ -274,7 +315,7 @@ export function complement(inner: ExtRegex): ExtRegex {
274315
// // ¬S ≈ (Σ\S
275316
// return literal(CharSet.complement(inner.charset))
276317
else
277-
return withHash({ type: "complement", inner })
318+
return withMetaInfo({ type: "complement", inner })
278319
}
279320

280321
//////////////////////////////////////////////
@@ -733,7 +774,7 @@ export function toString(regex: ExtRegex): string {
733774
// Render parenthesis as non-capturing groups if there is a large number of them,
734775
// i.e. `/(?:abc)/` instead of `/(abc)/`. `new RegExp(...)` throws an error if there
735776
// is a large number of capturing groups. Non-capturing groups are a bit more verbose
736-
// but at large sizes like this it doesn't matter anyway:
777+
// but at sizes this large the extra verbosity hardly hurts readability:
737778
const useNonCapturingGroups = size > 10_000
738779

739780
return '^(' + astToString(toRegExpAST(regex), { useNonCapturingGroups }) + ')$'

test/regex-parser.spec.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ describe('parseRegexString', () => {
3232
[/^[a-z]$/, RE.literal(CharSet.charRange('a', 'z'))],
3333
[/^[^abc]$/, RE.literal(CharSet.complement(CharSet.fromArray(['a', 'b', 'c'])))],
3434
[/^(?:ab)$/, RE.string('ab')], // non-capturing groups
35+
[/^(?=^a$)a$/, RE.intersection(RE.string('a'), RE.string('a'))], // positive lookahead
36+
[/^(?!^a$)b$/, RE.intersection(RE.complement(RE.string('a')), RE.string('b'))], // negative lookahead
37+
[/^(?!^a$)b|c$/, RE.union(RE.intersection(RE.complement(RE.string('a')), RE.string('b')), RE.string('c'))],
3538
])('can parse %s', (regexp, expected) => {
3639
expect(parseRegExp(regexp)).toEqual(expected)
3740
})

0 commit comments

Comments
 (0)