Skip to content

Commit e82a94f

Browse files
committed
perf(toStdRegex): reduce generated regex size
`toStdRegex` can generate very large expressions, even if called on an instance that is already a `StdRegex`. Created a new benchmark "toStdRegex_output_length" to track this. The benchmark runs `toStdRegex` on a fixed dataset of 1000 regular expressions. The current implementation cannot get through all instances: on many of them there are call-stack overflows, out-of-memory errors and non-termination. A run on the first 500 (smaller) instances gives the following result: time : 5973 ms avg. multiplier : 27 worst multiplier : 3127 Here avg. multiplier means that on average the output regex was 27 times larger than the input regex. In the worst recorded case it was 3127 times larger. After changing the order in which `dfaToRegex` eliminates states from a DFA, the benchmark on the first 500 instances improves to: time : 1551 ms avg. multiplier : 2 worst multiplier : 70 That's much better but we still don't get through all 1000 instances. Result for the first 750 instances: time : 3672 ms avg. multiplier : 7 worst multiplier : 962
1 parent f6a357e commit e82a94f

File tree

7 files changed

+1152
-29
lines changed

7 files changed

+1152
-29
lines changed

benchmark/regex_random_unique_no-nested-star_1000.js

Lines changed: 1005 additions & 0 deletions
Large diffs are not rendered by default.

benchmark/toStdRegex_output_length.js

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import fc from 'fast-check'
2+
import * as RE from '../dist/regex.js'
3+
import { parse, toStdRegex } from '../dist/low-level-api.js'
4+
import regexDataset from './regex_random_unique_no-nested-star_1000.js'
5+
6+
// Running statistics over all instances that were actually measured.
let avgMult = 0
let maxMult = -Infinity
// Number of samples folded into `avgMult` so far. This is tracked separately
// from the dataset index because skipped instances and instances that fail in
// `RE.toRegExp` contribute no sample; using the index as the sample count
// would skew the running average.
let sampleCount = 0

// Dataset positions (after sorting by source length) that are skipped by hand.
const hardInstances = new Set([
  290, // call-stack overflow
  556, // takes very long
  658, // takes very long
  689, // call-stack overflow
  724, // takes very long
])

/**
 * Runs `toStdRegex` on a single dataset entry and logs how much larger the
 * output regex is compared to the input regex.
 *
 * @param inputRegExp - the `RegExp` taken from the dataset.
 * @param index - position of `inputRegExp` in the sorted dataset.
 */
function run(inputRegExp, index) {
  // skip some hard early instances:
  if (hardInstances.has(index)) return
  // only consider instances up to index 750 for now:
  if (index > 750) return

  console.log('#' + index, inputRegExp)

  // Deliberately outside the `try`: failures of `toStdRegex` itself should
  // abort the benchmark instead of being reported as "too many captures".
  const outputRegex = toStdRegex(parse(inputRegExp))
  try {
    const outputRegExp = RE.toRegExp(outputRegex)

    const inp = inputRegExp.source.length
    const out = outputRegExp.source.length
    const mult = out/inp

    // Incremental mean over the measured samples only:
    avgMult = (avgMult*sampleCount + mult)/(sampleCount + 1)
    sampleCount += 1
    if (mult > maxMult) {
      maxMult = mult
    }

    console.log(`
      regex input length : ${inp}
      regex ouptut length : ${out}
      multiplier : ${mult}
      avg. multiplier : ${avgMult}
      worst multiplier : ${maxMult}
    `)
  } catch (err) {
    // NOTE(review): every error from `RE.toRegExp` is assumed to be caused by
    // too many capture groups — confirm that no other failure mode is hidden here.
    console.log('too many captures')
  }
}

const timeStart = performance.now()

// Copy before sorting so the imported dataset array is not mutated in place.
// Do short (likely easier) instances first and see how far we get:
const instancesByLength = [...regexDataset]
  .sort((a, b) => a.source.length - b.source.length)

instancesByLength.forEach(run)

console.log('time:', performance.now() - timeStart)

src/dfa.ts

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -94,25 +94,65 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
9494
graph
9595
)
9696
}
97+
98+
// All states except `newStartState` and `newFinalState` need to be eliminated.
99+
// After that, the only remaining transition is between `newStartState` and
100+
// `newFinalState` and is labeled with the result regex.
101+
  // Thus, we put all these states in a worklist to be iteratively eliminated.
102+
// Ripping out states with small in/out-degree earlier can result in smaller expressions.
103+
// For example:
104+
// b d
105+
// +---------(s2)---------+
106+
// a / \
107+
// (s0) ------------- (s1) (s4)
108+
// \ c e /
109+
// +---------(s3)---------+
110+
//
111+
// Ripping states in the order s2, s3, s1 produces:
112+
//
113+
// a(bd|ce)
114+
// (s0) --------------------------------------- (s4)
115+
//
116+
// Ripping states in the order s1, s2, s3 produces:
117+
//
118+
// (abd)|(ace)
119+
// (s0) --------------------------------------- (s4)
120+
//
121+
// Thus, we sort the worklist by degree. Note, that the degree of nodes changes during
122+
// the later iteration so it can still be that nodes with higher degree are sometimes
123+
// ripped out first. However, keeping the worklist sorted at the same time also has a
124+
// cost. Maybe this can be improved by choosing some heap structure:
125+
const worklist = [...dfa.allStates.keys()]
126+
// Avoid constantly re-computing degree during sorting by computing it once in a first pass:
127+
.map(state => ({ state, degree: Graph.degree(state, graph)}))
128+
// Sort states by degree:
129+
.sort((a,b) => a.degree - b.degree)
130+
    // Throw degree away again after sorting:
131+
.map(({ state }) => state)
97132

98-
for (const state of dfa.allStates.keys()) {
99-
const result = Graph.ripNode(state, graph)
100-
101-
for (const [pred, predLabel] of result.predecessors) {
102-
for (const [succ, succLabel] of result.successors) {
103-
const transitiveLabel = RE.seq([
104-
predLabel,
105-
RE.star(result.selfLoop ?? RE.epsilon),
106-
succLabel,
107-
])
108-
109-
Graph.setEdge(
110-
pred,
111-
succ,
112-
transitiveLabel,
113-
graph,
114-
RE.union,
115-
)
133+
while (true) {
134+
const state = worklist.shift()
135+
if (state === undefined) {
136+
break
137+
} else {
138+
const result = Graph.ripNode(state, graph)
139+
for (const [pred, predLabel] of result.predecessors) {
140+
for (const [succ, succLabel] of result.successors) {
141+
const transitiveLabel = RE.seq([
142+
predLabel,
143+
RE.star(result.selfLoop ?? RE.epsilon),
144+
succLabel,
145+
])
146+
Graph.setEdge(
147+
pred,
148+
succ,
149+
transitiveLabel,
150+
graph,
151+
// Flipping the arguments avoids that the associativity rewrite rule of `union`
152+
            // keeps getting triggered. This makes a significant performance difference:
153+
(oldValue, newValue) => RE.union(newValue, oldValue),
154+
)
155+
}
116156
}
117157
}
118158
}

src/graph.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ export function outDegree<A>(node: number, graph: Graph<A>): number {
4141
return succs.size
4242
}
4343

44+
/**
 * Total degree of `node`: the sum of its in-degree and its out-degree
 * (a self-loop is not counted).
 */
export function degree<A>(node: number, graph: Graph<A>): number {
  const incoming = inDegree(node, graph)
  const outgoing = outDegree(node, graph)
  return incoming + outgoing
}
50+
4451
export type RipNodeResult<A> = {
4552
predecessors: [number, A][]
4653
selfLoop: A | undefined

src/utils.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,16 @@ export function hashStr(str: string, seed = 0): number {
146146
export function xor(a: number, b: number): number {
147147
return a^b
148148
}
149+
150+
/**
 * Returns the item of `iterable` with the smallest score.
 *
 * `scoreOf` is called exactly once per item. When several items share the
 * smallest score, the first of them (in iteration order) is returned. Items
 * whose score is `NaN` are never selected.
 *
 * @param iterable - the items to search.
 * @param scoreOf - maps an item to the numeric score to minimize.
 * @returns the item with the minimal score, or `undefined` if no item could
 *   be selected (e.g. `iterable` is empty).
 */
export function minBy<T>(iterable: Iterable<T>, scoreOf: (item: T) => number): T | undefined {
  let minItem: T | undefined = undefined
  let minScore = Infinity
  // Tracks whether any item was selected yet, so that an item scoring
  // `Infinity` can still win when nothing smaller exists.
  let found = false
  for (const item of iterable) {
    const score = scoreOf(item)
    if (score < minScore || (!found && score === minScore)) {
      minItem = item
      minScore = score
      found = true
    }
  }
  return minItem
}

test/arbitrary-regex.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,18 @@ export function stdRegexNoStar(size = 100): fc.Arbitrary<RE.StdRegex> {
5656
)
5757
}
5858

59+
/**
 * Arbitrary for standard regular expressions in which the operand of a star
 * is generated by `stdRegexNoStar`.
 *
 * @param size - generation budget; each recursive level receives half of it
 *   (rounded down), and a budget of zero or less yields a plain literal.
 */
export function stdRegexNoNestedStar(size = 100): fc.Arbitrary<RE.StdRegex> {
  // Budget exhausted: stop recursing.
  if (size <= 0) {
    return literal()
  }

  const halfSize = Math.floor(size/2)
  // Operands under a star come from the star-free generator:
  const starred = star(() => stdRegexNoStar(halfSize))
  const sequenced = concat(() => stdRegexNoNestedStar(halfSize))
  const alternated = union(() => stdRegexNoNestedStar(halfSize))

  return fc.oneof(starred, sequenced, alternated, literal())
}
70+
5971
export function stdRegexString(): fc.Arbitrary<string> {
6072
return stdRegex().map(RE.toString)
6173
}

test/regex.spec.ts

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,3 @@ describe('rewrite rules', () => {
199199
})
200200

201201
})
202-
203-
// describe('equivalent', () => {
204-
// it('every regex is equivalent to itself', () => {
205-
// fc.assert(
206-
// fc.property(extRegex(), (tree) => {
207-
// expect(RegexTree.equivalent(tree, tree)).toBe(true)
208-
// })
209-
// )
210-
// })
211-
// })
212-

0 commit comments

Comments
 (0)