Skip to content

Commit d736290

Browse files
committed
fix(parser): some special chars unescaped in brackets
Some special chars that should appear literally in a regex don't need to be escaped when they occur inside square brackets, e.g. "$" or ".". Previously, the parser failed whenever it encountered such an unescaped char inside brackets. This change brings the number of parse errors in the parser benchmark down to 304.
1 parent f8981c4 commit d736290

File tree

4 files changed

+56
-26
lines changed

4 files changed

+56
-26
lines changed

benchmark/parser-bench.js

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,27 @@ export function* readDataset() {
1919

2020
let hasError = 0
2121
let noError = 0
22+
let totalParseTime = 0
2223

2324
for (const { regex, flags } of readDataset()) {
2425
try {
25-
const time = performance.now()
2626
// parseRegexString(regex)
27-
const regexp = RB(new RegExp(regex, flags))
27+
const regexp = new RegExp(regex, flags)
2828
console.log('====', regexp, '====')
29-
for (const word of RB(regexp).enumerate().take(10)) {
30-
console.log(JSON.stringify(word))
31-
}
32-
console.log(`time: ${Math.round(performance.now() - time)}ms`)
29+
30+
const timeStart = performance.now()
31+
const parsed = RB(regexp)
32+
const timeEnd = performance.now()
33+
34+
console.log(`time: ${Math.round(timeEnd - timeStart)}ms`)
35+
totalParseTime += timeEnd - timeStart
3336
noError++
3437
} catch (e) {
3538
// console.error(new RegExp(regex, flags))
3639
hasError++
3740
}
3841
}
3942

40-
console.debug(hasError, '/', hasError + noError)
43+
console.log('error ratio:', hasError, '/', hasError + noError)
44+
console.log('total parse time:', Math.round(totalParseTime), 'ms')
45+

src/code-point-range.ts

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
import { isSingleton } from './char-set'
2-
import { assert, checkedAllCases } from './utils'
1+
import { assert } from './utils'
32

43
export type CodePointRange = { start: number, end: number }
54

@@ -106,14 +105,34 @@ export function difference(rangeA: CodePointRange, rangeB: CodePointRange): [] |
106105
return union(before, after)
107106
}
108107

109-
export function isMetaChar(char: string): boolean {
110-
return /^[.^$*+?()[\]{\|\\\/]$/.test(char)
108+
/**
 * Returns true iff the given char must always be escaped to occur literally
 * in a regular expression, even when it appears inside square brackets.
 * That is the case for `\`, `]` and `/`. Other special chars like `$` may
 * appear unescaped inside brackets (e.g. `/[$]/`) and are not covered here.
 *
 * @param char - The character to classify; expected to be a single char.
 * @returns `true` only if `char` is exactly one of `\`, `]` or `/`.
 */
export function mustAlwaysBeEscaped(char: string): boolean {
  // `String.prototype.includes` performs substring matching, so without the
  // length guard the empty string and multi-char input such as `\]` would
  // wrongly be classified as a char that needs escaping.
  return char.length === 1 && '\\]/'.includes(char)
}
117+
118+
/**
 * Returns true iff the given char must be escaped to occur literally
 * in a regular expression, unless it is within square brackets. That's true
 * for special chars like `$`: outside brackets we have to write `\$`, while
 * inside brackets `[$]` is allowed.
 *
 * NOTE(review): the list also contains `-`, which is normally only special
 * inside brackets (as a range separator) — confirm that this is intended.
 *
 * @param char - The character to classify; expected to be a single char.
 * @returns `true` only if `char` is exactly one of `.^$*+?()[{-|`.
 */
export function mustBeEscapedOrInBrackets(char: string): boolean {
  // `String.prototype.includes` performs substring matching, so without the
  // length guard the empty string and multi-char input such as `$*` would
  // wrongly be classified as a special char.
  return char.length === 1 && '.^$*+?()[{-|'.includes(char)
}
127+
128+
/**
 * Returns true iff the given char falls into neither escape category, i.e.
 * both `mustAlwaysBeEscaped` and `mustBeEscapedOrInBrackets` reject it.
 */
export function neverMustBeEscaped(char: string) {
  const needsEscapeSomewhere =
    mustAlwaysBeEscaped(char) || mustBeEscapedOrInBrackets(char)

  return !needsEscapeSomewhere
}
112131

113132
function codePointToString(codePoint: number): string {
114133
const char = String.fromCharCode(codePoint)
115134

116-
if (isMetaChar(char))
135+
if (mustAlwaysBeEscaped(char) || mustBeEscapedOrInBrackets(char))
117136
// e.g. \$ \+ \.
118137
return '\\' + char
119138
else if (codePoint > 126)

src/regex-parser.ts

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,10 @@ const wildcard = P.string('.').map(
4242
() => RE.literal(CharSet.wildcard({ dotAll: false }))
4343
)
4444

45-
const singleChar = P.satisfy(char => !Range.isMetaChar(char))
45+
// Parser built from `P.satisfy` with the predicate `Range.neverMustBeEscaped`,
// i.e. it accepts chars that are in neither escape category.
// NOTE(review): presumably `P.satisfy` consumes exactly one input char — confirm.
const unescapedChar = P.satisfy(char => Range.neverMustBeEscaped(char))
4646

47-
const codePoint = singleChar.map(char => {
48-
const result = char.codePointAt(0)!
49-
assert(result !== undefined)
50-
return result
51-
})
47+
// Parser for the special chars accepted by `Range.mustBeEscapedOrInBrackets`
// (e.g. `$`), with the matched char mapped through `CharSet.singleton`.
// It is used as an alternative inside the bracket expression of `charSet`.
const unescapedCharInsideBrackets = P
  .satisfy(char => Range.mustBeEscapedOrInBrackets(char))
  .map(CharSet.singleton)
5249

5350
export class UnsupportedSyntaxError extends Error {}
5451

@@ -77,9 +74,9 @@ const escapeSequence = P.string('\\').andThen(_ => P.anyChar).map(escapedChar =>
7774
})
7875

7976
const codePointRange: P.Parser<CharSet.CharSet> =
80-
codePoint.andThen(start =>
81-
P.optional(P.string('-').andThen(_ => codePoint))
82-
.map(end => CharSet.fromRange({ start, end: end ?? start }))
77+
unescapedChar.andThen(start =>
78+
P.optional(P.string('-').andThen(_ => unescapedChar))
79+
.map(end => CharSet.charRange(start, end ?? start))
8380
)
8481

8582
const charSet = P.choice([
@@ -88,7 +85,7 @@ const charSet = P.choice([
8885
P.string('['),
8986
P.string(']'),
9087
P.optional(P.string('^')).andThen(negated =>
91-
P.many(P.choice([escapeSequence, codePointRange])).map(
88+
P.many(P.choice([escapeSequence, codePointRange, unescapedCharInsideBrackets])).map(
9289
sets => {
9390
if (negated === undefined)
9491
return sets.reduce(CharSet.union, CharSet.empty)
@@ -98,7 +95,7 @@ const charSet = P.choice([
9895
)
9996
)
10097
),
101-
singleChar.map(CharSet.singleton),
98+
unescapedChar.map(CharSet.singleton),
10299
])
103100

104101
const group = P.between(

test/regex-parser.spec.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,21 @@ describe('parseRegexString', () => {
2929
[/^\W$/, RE.literal(CharSet.nonWordChars)],
3030
[/^\n$/, RE.literal(CharSet.singleton('\n'))],
3131
[/^\.$/, RE.literal(CharSet.singleton('.'))],
32+
// char class from range:
3233
[/^[a-z]$/, RE.literal(CharSet.charRange('a', 'z'))],
34+
// negative char class:
3335
[/^[^abc]$/, RE.literal(CharSet.complement(CharSet.fromArray(['a', 'b', 'c'])))],
34-
[/^(?:ab)$/, RE.string('ab')], // non-capturing groups
35-
[/^(?=^a$)a$/, RE.intersection(RE.string('a'), RE.string('a'))], // positive lookahead
36-
[/^(?!^a$)b$/, RE.intersection(RE.complement(RE.string('a')), RE.string('b'))], // negative lookahead
36+
// non-capturing groups
37+
[/^(?:ab)$/, RE.string('ab')],
38+
// positive lookahead
39+
[/^(?=^a$)a$/, RE.intersection(RE.string('a'), RE.string('a'))],
40+
// negative lookahead
41+
[/^(?!^a$)b$/, RE.intersection(RE.complement(RE.string('a')), RE.string('b'))],
3742
[/^(?!^a$)b|c$/, RE.union(RE.intersection(RE.complement(RE.string('a')), RE.string('b')), RE.string('c'))],
43+
// some special chars don't need escape when inside brackets:
44+
[/^[.^$*+?()[{-|]$/, RE.literal(CharSet.fromArray([...'.^$*+?()[{-|']))],
45+
// other special chars need to be escaped even inside brackets:
46+
[/^[\\\]\/]$/, RE.literal(CharSet.fromArray([...'\\]/']))],
3847
])('can parse %s', (regexp, expected) => {
3948
expect(parseRegExp(regexp)).toEqual(expected)
4049
})

0 commit comments

Comments
 (0)