Skip to content

Commit 7a27f90

Browse files
committed
feat: builder API
BREAKING CHANGE: removed low-level and high-level API.
1 parent 004226e commit 7a27f90

13 files changed

+517
-497
lines changed

README.md

Lines changed: 7 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,19 @@ Zero-dependency TypeScript library for regex intersection, complement and other
44
These are surprisingly hard to come by for any programming language.
55

66
```typescript
7-
import { intersection, size, enumerate } from '@gruhn/regex-utils'
7+
import { RB } from '@gruhn/regex-utils'
88

9-
// `intersection` combines multiple regex into one:
10-
const passwordRegex = intersection(
11-
/^[a-zA-Z0-9]{12,32}$/, // 12-32 alphanumeric characters
12-
/[0-9]/, // at least one number
13-
/[A-Z]/, // at least one upper case letter
14-
/[a-z]/, // at least one lower case letter
15-
)
9+
const passwordRegex = RB(/^[a-zA-Z0-9]{12,32}$/) // 12-32 alphanumeric characters
10+
.and(/[0-9]/) // at least one number
11+
.and(/[A-Z]/) // at least one upper case letter
12+
.and(/[a-z]/) // at least one lower case letter
1613

1714
// `size` calculates the number of strings matching the regex:
18-
console.log(size(passwordRegex))
15+
console.log(passwordRegex.size())
1916
// 2301586451429392354821768871006991487961066695735482449920n
2017

2118
// `enumerate` returns a stream of strings matching the regex:
22-
for (const sample of enumerate(passwordRegex).take(10)) {
19+
for (const sample of passwordRegex.enumerate().take(10)) {
2320
console.log(sample)
2421
}
2522
// aaaaaaaaaaA0
@@ -40,76 +37,6 @@ for (const sample of enumerate(passwordRegex).take(10)) {
4037
npm install @gruhn/regex-utils
4138
```
4239

43-
## High- vs. Low-Level API
44-
45-
There is a high-level API and a low-level API:
46-
47-
- [high-level API documentation](https://gruhn.github.io/regex-utils/modules/High-level_API.html)
48-
- [low-level API documentation](https://gruhn.github.io/regex-utils/modules/Low-Level_API.html)
49-
50-
The high-level API operates directly on native JavaScript `RegExp` instances,
51-
which is more convenient but also requires parsing the regular expression.
52-
The low-level API operates on an internal representation
53-
which skips the parsing step and is more efficient when combining multiple functions.
54-
For example, say you want to know how many strings match the intersection
55-
of two regular expressions:
56-
57-
```typescript
58-
import { size, intersection } from '@gruhn/regex-utils'
59-
60-
size(intersection(regex1, regex2))
61-
```
62-
63-
This:
64-
1. parses the two input `RegExp`
65-
2. computes the intersection
66-
3. converts the result back to `RegExp`
67-
4. parses that again
68-
5. computes the size
69-
70-
Step (1) should be fast for small handwritten regex.
71-
But the intersection of two regex can be quite large,
72-
which can make steps (3) and (4) quite costly.
73-
With the low-level API, step (3) and step (4) can be eliminated:
74-
75-
```typescript
76-
import * as RE from '@gruhn/regex-utils/low-level-api'
77-
78-
RE.size(
79-
RE.toStdRegex(
80-
RE.and(
81-
RE.parse(regex1),
82-
RE.parse(regex2)
83-
)
84-
)
85-
)
86-
```
87-
88-
<!--
89-
90-
## Todo Utilities
91-
92-
* recognize regex prone to catastrophic backtracking
93-
- https://www.regular-expressions.info/catastrophic.html
94-
- https://www.youtube.com/watch?v=DDe-S3uef2w
95-
* check equivalence of two regex or find counterexample string
96-
97-
-->
98-
99-
## Limitations
100-
101-
The library implements a custom parser for regular expressions,
102-
so only a subset of the syntax is supported:
103-
- quantifiers: `*`, `+`, `?`, `{3,5}`, ...
104-
- alternation: `|`
105-
- character classes: `.`, `\w`, `[a-z]`, ...
106-
- optional start/end markers: `^` / `$` but only at the start/end
107-
(technically they are allowed anywhere in the expression)
108-
- escaped meta characters: `\$`, `\.`, ...
109-
- (non-)capturing groups: `(...)`, `(?...)`
110-
- positive/negative lookahead: `(?!...)`, `(?=...)`
111-
Regex flags are not supported at all.
112-
11340
## References
11441

11542
Heavily informed by these papers:

benchmark/aoc2023-day12.js

Lines changed: 37 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,59 @@
11
import fs from 'fs'
2-
import * as RE from '../dist/low-level-api.js'
2+
import { RB } from '../dist/index.js'
3+
import { assert } from '../dist/utils.js'
34

45
const input = fs.readFileSync('./benchmark/aoc2023-day12_input.txt', 'utf-8')
56
.trim()
67
.split('\n')
78
.map(line => line.split(' '))
89

910
/**
10-
* Maps pattern like "#?...##?#" to regex like `#(.|#)...##(.|#)#`
11+
* Maps pattern like "#?...##?#" to regex like /^#(o|#)ooo##(o|#)#$/
1112
*/
1213
function leftToRegex(pattern) {
1314
const inner = [...pattern].map(char => {
1415
switch (char) {
15-
case '.': return RE.singleChar('.')
16-
case '#': return RE.singleChar('#')
17-
case '?': return RE.or([RE.singleChar('.'), RE.singleChar('#')])
16+
case '.': return RB('o')
17+
case '#': return RB('#')
18+
case '?': return RB('o').or('#')
1819
}
1920
throw 'unknown symbol: ' + char
2021
})
21-
return RE.seq(inner)
22-
}
23-
24-
function interleave(array, sep) {
25-
if (array.length <= 1) {
26-
return array
27-
} else {
28-
const [ head, ...tail ] = array
29-
return [head, sep, ...interleave(tail, sep) ]
30-
}
22+
return inner.reduce((acc, re) => acc.concat(re))
3123
}
3224

3325
/**
34-
* Maps pattern like "2,4,3" to regex like `.*##.+####.+###.*`
26+
* Maps pattern like "2,4,3" to regex like /^o*##o+####o+###o*$/
3527
*/
3628
function rightToRegex(pattern) {
37-
const regexStartEnd = RE.repeat(RE.singleChar('.')) // .*
38-
const regexBetween = RE.repeat(RE.singleChar('.'), { min: 1 }) // .+
39-
40-
const inner = pattern.split(',')
29+
const [first, ...rest] = pattern.split(',')
4130
.map(digit => parseInt(digit))
42-
.map(count => RE.repeat(RE.singleChar('#'), count))
31+
.map(count => RB('#').repeat(count))
32+
33+
const start = RB('o').repeat() // o*
34+
const end = RB('o').repeat() // o*
35+
const separator = RB('o').repeat({ min: 1 }) // o+
4336

44-
return RE.seq([
45-
regexStartEnd,
46-
RE.seq(interleave(inner, regexBetween)),
47-
regexStartEnd,
48-
])
37+
let result = start.concat(first)
38+
for (const item of rest) {
39+
result = result.concat(separator).concat(item)
40+
}
41+
result = result.concat(end)
42+
43+
return result
4944
}
5045

51-
function part1() {
46+
function solve(patternPairs) {
5247
const startTime = performance.now()
5348
let totalCount = 0n
5449

55-
input.forEach(([left, right], i) => {
50+
patternPairs.forEach(([left, right], i) => {
5651
const leftRegex = leftToRegex(left)
5752
const rightRegex = rightToRegex(right)
5853

59-
// Compute intersection of the two regex:
60-
const intersection = RE.toStdRegex(RE.and([leftRegex, rightRegex]))
61-
// And count the number of matching strings using `size`:
62-
const count = RE.size(intersection)
54+
// Compute intersection of the two regex and
55+
// count the number of matching strings using `size`:
56+
const count = RB(leftRegex).and(rightRegex).size()
6357

6458
console.log(i, ':', count)
6559
totalCount += count
@@ -69,29 +63,17 @@ function part1() {
6963
return { totalCount, time }
7064
}
7165

72-
function part2() {
73-
const startTime = performance.now()
74-
let totalCount = 0n
66+
const part1 = solve(input)
67+
const part2 = solve(input.map(([left, right]) => [
68+
Array(5).fill(left).join('?'),
69+
Array(5).fill(right).join(',')
70+
]))
7571

76-
input.forEach(([left, right], i) => {
77-
const leftRegex = leftToRegex(Array(5).fill(left).join('?'))
78-
const rightRegex = rightToRegex(Array(5).fill(right).join(','))
79-
80-
// Compute intersection of the two regex:
81-
const intersection = RE.toStdRegex(RE.and([leftRegex, rightRegex]))
82-
// And count the number of matching strings using `size`:
83-
const count = RE.size(intersection)
84-
85-
console.log(i, ':', count)
86-
totalCount += count
87-
})
88-
89-
const time = performance.now() - startTime
90-
return { time, totalCount }
91-
}
72+
// best time: 992ms
73+
console.log('Part 1:', part1.totalCount, `(time: ${Math.ceil(part1.time)}ms)`)
9274

93-
const sol1 = part1() // best time: 992ms
94-
const sol2 = part2() // best time: 13182ms
75+
// best time: 11950ms
76+
console.log('Part 2:', part2.totalCount, `(time: ${Math.ceil(part2.time)}ms)`)
9577

96-
console.log('Part 1:', sol1.totalCount, `(time: ${Math.ceil(sol1.time)}ms)`)
97-
console.log('Part 2:', sol2.totalCount, `(time: ${Math.ceil(sol2.time)}ms)`)
78+
assert(part1.totalCount === 7191n)
79+
assert(part2.totalCount === 6512849198636n)

benchmark/demo.js

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import { RB } from '../dist/index.js'
2+
3+
const passwordRegex = RB(/^[a-zA-Z0-9]{12,32}$/) // 12-32 alphanumeric characters
4+
.and(/[0-9]/) // at least one number
5+
.and(/[A-Z]/) // at least one upper case letter
6+
.and(/[a-z]/) // at least one lower case letter
7+
8+
// `size` calculates the number of strings matching the regex:
9+
console.log(passwordRegex.size())
10+
// 2301586451429392354821768871006991487961066695735482449920n
11+
12+
// `enumerate` returns a stream of strings matching the regex:
13+
for (const sample of passwordRegex.enumerate().take(10)) {
14+
console.log(sample)
15+
}
16+
// aaaaaaaaaaA0
17+
// aaaaaaaaaa0A
18+
// aaaaaaaaaAA0
19+
// aaaaaaaaaA00
20+
// aaaaaaaaaaA1
21+
// aaaaaaaaa00A
22+
// baaaaaaaaaA0
23+
// AAAAAAAAAA0a
24+
// aaaaaaaaaAA1
25+
// aaaaaaaaaa0B

benchmark/filter-vs-intersection.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import fc from 'fast-check'
2-
import { intersection } from '../dist/index.js'
2+
import { RB } from '../dist/index.js'
33

44
const emailRegex = /^[\w\-\.]+@([\w-]+\.)+[\w-]{2,}$/
55

66
function runIntersection(sampleCount) {
77
const startTime = performance.now()
8-
fc.sample(fc.stringMatching(intersection(/^.{3,10}$/, emailRegex)), sampleCount)
8+
const intersection = RB(emailRegex).and(/^.{3,10}$/).toRegExp()
9+
fc.sample(fc.stringMatching(intersection), sampleCount)
910
return performance.now() - startTime
1011
}
1112

benchmark/parser-bench.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import fs from 'fs'
22
import { parseRegexString } from '../dist/regex-parser.js'
3-
import { enumerate } from '../dist/index.js'
3+
import { RB } from '../dist/index.js'
44

55
export function* readDataset() {
66
const jsonStr = fs.readFileSync('./benchmark/regex-dataset.json', 'utf-8')
@@ -24,9 +24,9 @@ for (const { regex, flags } of readDataset()) {
2424
try {
2525
const time = performance.now()
2626
// parseRegexString(regex)
27-
const regexp = new RegExp(regex, flags)
27+
const regexp = RB(new RegExp(regex, flags))
2828
console.log('====', regexp, '====')
29-
for (const word of enumerate(regexp).take(10)) {
29+
for (const word of RB(regexp).enumerate().take(10)) {
3030
console.log(JSON.stringify(word))
3131
}
3232
console.log(`time: ${Math.round(performance.now() - time)}ms`)

benchmark/toStdRegex_output_length.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import fc from 'fast-check'
22
import * as RE from '../dist/regex.js'
33
import { ParseError } from '../dist/parser.js'
4-
import { UnsupportedSyntaxError } from '../dist/regex-parser.js'
5-
import { parse, toStdRegex } from '../dist/low-level-api.js'
4+
import { UnsupportedSyntaxError, parseRegExp } from '../dist/regex-parser.js'
5+
import { toStdRegex } from '../dist/dfa.js'
66
import randomRegexDataset from './regex_random_unique_no-nested-star_1000.js'
77
import handwrittenRegexDataset from './regex_handwritten.js'
88

@@ -17,7 +17,7 @@ function run(inputRegExp, index) {
1717
console.log('#' + index, inputRegExp)
1818
const startTime = performance.now()
1919

20-
const inputRegex = parse(inputRegExp)
20+
const inputRegex = parseRegExp(inputRegExp)
2121
const outputRegex = toStdRegex(inputRegex)
2222
const outputRegExp = RE.toRegExp(outputRegex)
2323

package.json

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@
1717
".": {
1818
"import": "./dist/index.js",
1919
"types": "./dist/index.d.ts"
20-
},
21-
"./low-level-api": {
22-
"import": "./dist/low-level-api.js",
23-
"types": "./dist/low-level-api.d.ts"
2420
}
2521
},
2622
"files": [

src/char-set.ts

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
import { adjacentPairs, assert, checkedAllCases, hashNums, hashStr, xor, zip } from './utils'
1+
import { assert, checkedAllCases, hashStr, xor } from './utils'
22
import * as Range from './code-point-range'
33
import * as Stream from './stream'
44

55
type WithHash<T> = T & { hash: number }
66

7-
type EmptyCharSet = WithHash<{ type: 'empty' }>
8-
97
// TODO: ensure tree is balanced
108
type CharSetWithoutHash =
119
| { type: 'empty' }
@@ -202,7 +200,7 @@ export function deleteRange(set: CharSet, range: Range.CodePointRange): CharSet
202200
} else if (set.type === 'empty') {
203201
return empty
204202
} else if (set.type === 'node') {
205-
const [rangeBeforeStart, rangeRest1] = Range.splitAt(set.range.start-1, range)
203+
const [rangeBeforeStart, _rangeRest1] = Range.splitAt(set.range.start-1, range)
206204
const [rangeRest2, rangeAfterEnd] = Range.splitAt(set.range.end, range)
207205

208206
const newLeft = deleteRange(set.left, rangeBeforeStart)

0 commit comments

Comments
 (0)