Skip to content

Commit 8fab4eb

Browse files
committed
fix: invalid union rewrite rule
Generate `star` in every test now, since blow-up can be caught and handled. Doing so uncovered an invalid rewrite rule for `union`. The rule would rewrite: a{2}|a* --> a(a|a*). The result no longer matches the empty string.
1 parent 50e0afc commit 8fab4eb

File tree

6 files changed

+107
-97
lines changed

6 files changed

+107
-97
lines changed

benchmark/toStdRegex_output_length.js

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,15 @@ import * as RE from '../dist/regex.js'
33
import { ParseError } from '../dist/parser.js'
44
import { UnsupportedSyntaxError } from '../dist/regex-parser.js'
55
import { parse, toStdRegex } from '../dist/low-level-api.js'
6-
import { regexToDFA } from '../dist/dfa.js'
76
import randomRegexDataset from './regex_random_unique_no-nested-star_1000.js'
87
import handwrittenRegexDataset from './regex_handwritten.js'
98

109
const fullRegexDataset = [
1110
...randomRegexDataset,
1211
...handwrittenRegexDataset,
13-
]
12+
]
1413

15-
16-
let avgMult = 0
17-
let maxMult = -Infinity
14+
const mults = []
1815

1916
function run(inputRegExp, index) {
2017
console.log('#' + index, inputRegExp)
@@ -27,18 +24,12 @@ function run(inputRegExp, index) {
2724
const inp = inputRegExp.source.length
2825
const out = outputRegExp.source.length
2926
const mult = out/inp
30-
31-
avgMult = (avgMult*index + mult)/(index+1)
32-
if (mult > maxMult) {
33-
maxMult = mult
34-
}
27+
mults.push(mult)
3528

3629
console.log(`
3730
regex input length : ${inp}
3831
regex ouptut length : ${out}
3932
multiplier : ${mult}
40-
avg. multiplier : ${avgMult}
41-
worst multiplier : ${maxMult}
4233
`)
4334
}
4435

@@ -75,3 +66,15 @@ console.debug('failed instances: ', {
7566
stackOverflow,
7667
regexSyntaxError
7768
})
69+
70+
const mean = mults.reduce((a,b) => a+b, 0) / mults.length
71+
const median = [...mults].sort((a, b) => a - b)[Math.floor(mults.length / 2)] // sort a copy numerically (mults stays in run order); picks the upper median for even lengths, undefined when mults is empty
72+
const worst = mults.reduce((a,b) => Math.max(a,b), -Infinity)
73+
74+
console.log(`
75+
multipliers:
76+
mean : ${mean}
77+
median : ${median}
78+
max : ${worst}
79+
`)
80+

src/dfa.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
134134
.map(state => ({ state, degree: Graph.degree(state, graph)}))
135135
// Sort states by degree:
136136
.sort((a,b) => a.degree - b.degree)
137-
// Through degree away again after sorting:
137+
// Throw degree away again after sorting:
138138
.map(({ state }) => state)
139139

140140
while (true) {
@@ -183,13 +183,16 @@ export function dfaToRegex(dfa: DFA): RE.StdRegex {
183183
// TODO: can this round-trip through DFA construction be avoided?
184184
export function toStdRegex(inputRegex: RE.ExtRegex): RE.StdRegex {
185185
const dfa = regexToDFA(inputRegex)
186+
// printTrans(dfa)
186187
const outputRegex = dfaToRegex(dfa)
187188
return outputRegex
188189
}
189190

190-
// function printTrans(trans: Table.Table<CharSet.CharSet>) {
191+
// function printTrans(dfa: DFA) {
192+
// console.debug({ start: dfa.startState })
193+
// console.debug({ final: dfa.finalStates })
191194
// console.debug('=========trans===========')
192-
// for (const [source, succs] of trans.entries()) {
195+
// for (const [source, succs] of dfa.transitions.entries()) {
193196
// for (const [target, label] of succs) {
194197
// console.debug(source, target, new RegExp(CharSet.toString(label)))
195198
// // console.debug(source, target, RE.toString(label))

src/regex.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ function extractFront(regex: ExtRegex): [ExtRegex, ExtRegex] {
124124
case 'literal': return [regex, epsilon]
125125
case 'concat': return [regex.left, regex.right]
126126
case 'union': return [regex, epsilon]
127-
case 'star': return [regex.inner, regex]
127+
case 'star': return [regex, epsilon]
128128
case 'intersection': return [regex, epsilon]
129129
case 'complement': return [regex, epsilon]
130130
}
@@ -139,7 +139,7 @@ function extractBack(regex: ExtRegex): [ExtRegex, ExtRegex] {
139139
case 'literal': return [epsilon, regex]
140140
case 'concat': return [regex.left, regex.right]
141141
case 'union': return [epsilon, regex]
142-
case 'star': return [regex, regex.inner]
142+
case 'star': return [epsilon, regex]
143143
case 'intersection': return [epsilon, regex]
144144
case 'complement': return [epsilon, regex]
145145
}

test/arbitrary-regex.ts

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import fc from 'fast-check'
22
import * as RE from '../src/regex'
33
import * as CharSet from '../src/char-set'
4-
import { checkedAllCases } from '../src/utils'
54

65
// TODO: try larger alphabet:
76
export function charSet(): fc.Arbitrary<CharSet.CharSet> {
@@ -38,33 +37,10 @@ export function stdRegex(size = 100): fc.Arbitrary<RE.StdRegex> {
3837
return literal()
3938
else
4039
return fc.oneof(
41-
star(() => stdRegex(Math.floor(size/2))),
42-
concat(() => stdRegex(Math.floor(size/2))),
43-
union(() => stdRegex(Math.floor(size/2))),
44-
literal(),
45-
)
46-
}
47-
48-
export function stdRegexNoStar(size = 100): fc.Arbitrary<RE.StdRegex> {
49-
if (size <= 0)
50-
return literal()
51-
else
52-
return fc.oneof(
53-
concat(() => stdRegexNoStar(Math.floor(size/2))),
54-
union(() => stdRegexNoStar(Math.floor(size/2))),
55-
literal(),
56-
)
57-
}
58-
59-
export function stdRegexNoNestedStar(size = 100): fc.Arbitrary<RE.StdRegex> {
60-
if (size <= 0)
61-
return literal()
62-
else
63-
return fc.oneof(
64-
star(() => stdRegexNoStar(Math.floor(size/2))),
65-
concat(() => stdRegexNoNestedStar(Math.floor(size/2))),
66-
union(() => stdRegexNoNestedStar(Math.floor(size/2))),
67-
literal(),
40+
{ arbitrary: literal(), weight: 5 },
41+
{ arbitrary: concat(() => stdRegex(Math.floor(size/2))), weight: 3 },
42+
{ arbitrary: union(() => stdRegex(Math.floor(size/2))), weight: 3 },
43+
{ arbitrary: star(() => stdRegex(Math.floor(size/2))), weight: 1 },
6844
)
6945
}
7046

test/low-level-api.spec.ts

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,72 @@
11
import fc from "fast-check"
22
import { describe, it, expect, test } from "vitest"
3-
import { isEmpty } from '../src/regex'
3+
import { CacheOverflowError, isEmpty, VeryLargeSyntaxTreeError } from '../src/regex'
44
import * as RE from "../src/low-level-api"
55
import * as Arb from './arbitrary-regex'
6-
import * as Stream from '../src/stream'
7-
import { assert } from "../src/utils"
86

97
/**
108
* Stochastically verifies that `regex1` is a subset of `regex2`.
119
* It samples a bunch of matches from `regex1` and checks whether
1210
* they match `regex2` as well. If a mismatch is found it is returned.
13-
* Otherwise, `true` is returned.
11+
* Otherwise, `undefined` is returned.
1412
*/
15-
function isSubsetOf(regex1: RE.StdRegex, regex2: RE.StdRegex, maxSamples = 30): true | string {
16-
const re2 = RE.toRegExp(regex2)
17-
13+
function expectSubsetOf(regex1: RE.StdRegex, regex2: RE.StdRegex, maxSamples = 30) {
14+
const re2 = toRegExp_ignoreBlowUp(regex2)
1815
for (const match1 of RE.enumerate(regex1).take(maxSamples)) {
19-
if (!re2.test(match1)) {
20-
return match1
21-
}
16+
expect(match1).toMatch(re2)
17+
}
18+
}
19+
20+
function toRegExp_ignoreBlowUp(regex: RE.StdRegex) {
21+
try {
22+
return RE.toRegExp(regex)
23+
} catch (e) {
24+
if (e instanceof VeryLargeSyntaxTreeError) {
25+
console.warn(e)
26+
fc.pre(false)
27+
} else {
28+
throw e
29+
}
2230
}
31+
}
2332

24-
return true
33+
function toStdRegex_ignoreBlowUp(regex: RE.ExtRegex) {
34+
try {
35+
return RE.toStdRegex(regex)
36+
} catch (e) {
37+
if (e instanceof CacheOverflowError) {
38+
console.warn(e)
39+
fc.pre(false)
40+
} else {
41+
throw e
42+
}
43+
}
2544
}
2645

2746
describe('toStdRegex', () => {
2847

2948
it('is idempotent on StdRegex', () => {
3049
fc.assert(
3150
fc.property(
32-
// FIXME: `star` often leads to exponential blow up.
33-
Arb.stdRegexNoStar(),
51+
Arb.stdRegex(),
3452
inputRegex => {
35-
const outputRegex = RE.toStdRegex(inputRegex)
36-
expect(isSubsetOf(inputRegex, outputRegex)).toBe(true)
37-
expect(isSubsetOf(outputRegex, inputRegex)).toBe(true)
53+
const outputRegex = toStdRegex_ignoreBlowUp(inputRegex)
54+
expectSubsetOf(inputRegex, outputRegex)
55+
expectSubsetOf(outputRegex, inputRegex)
3856
}
3957
),
58+
{ numRuns: 100, maxSkipsPerRun: 100 }
4059
)
41-
})
60+
}, 10_000)
4261

4362
})
4463

4564
test('A ∩ ¬A = ∅', () => {
4665
fc.assert(
4766
fc.property(
48-
Arb.stdRegexNoStar(),
67+
Arb.stdRegex(),
4968
regexA => {
50-
const outputRegex = RE.toStdRegex(
69+
const outputRegex = toStdRegex_ignoreBlowUp(
5170
RE.and([regexA, RE.not(regexA)])
5271
)
5372
expect(isEmpty(outputRegex)).toBe(true)
@@ -59,14 +78,14 @@ test('A ∩ ¬A = ∅', () => {
5978
test('B ⊆ (A ∪ B) ∩ (B ∪ C)', () => {
6079
fc.assert(
6180
fc.property(
62-
Arb.stdRegexNoStar(),
63-
Arb.stdRegexNoStar(),
64-
Arb.stdRegexNoStar(),
81+
Arb.stdRegex(),
82+
Arb.stdRegex(),
83+
Arb.stdRegex(),
6584
(regexA, regexB, regexC) => {
6685
const unionAB = RE.or([regexA, regexB])
6786
const unionBC = RE.or([regexB, regexC])
68-
const interRegex = RE.toStdRegex(RE.and([unionAB, unionBC]))
69-
expect(isSubsetOf(regexB, interRegex)).toBe(true)
87+
const interRegex = toStdRegex_ignoreBlowUp(RE.and([unionAB, unionBC]))
88+
expectSubsetOf(regexB, interRegex)
7089
}
7190
),
7291
)
@@ -76,10 +95,10 @@ test('intersection with regex /^.{N}$/ has only words of length N', () => {
7695
fc.assert(
7796
fc.property(
7897
fc.nat({ max: 10 }),
79-
Arb.stdRegexNoStar(),
98+
Arb.stdRegex(),
8099
(length, regexA) => {
81100
const regexB = RE.repeat(RE.anySingleChar, length)
82-
const interAB = RE.toStdRegex(RE.and([regexA, regexB]))
101+
const interAB = toStdRegex_ignoreBlowUp(RE.and([regexA, regexB]))
83102

84103
for (const word of RE.enumerate(interAB).take(100)) {
85104
expect(word).toHaveLength(length)

test/regex.spec.ts

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,25 @@
11
import fc from "fast-check"
2-
import { describe, it, expect, test } from "vitest"
2+
import { describe, it, expect } from "vitest"
33
import * as RE from "../src/regex"
44
import * as DFA from '../src/dfa'
55
import * as Arb from './arbitrary-regex'
66
import * as Stream from '../src/stream'
77
import * as CharSet from '../src/char-set'
8-
import { toRegExp } from "../src/regex"
98
import { parseRegExp } from "../src/regex-parser"
109

10+
11+
function toStdRegex_ignoreBlowUp(regex: RE.ExtRegex) {
12+
try {
13+
return DFA.toStdRegex(regex)
14+
} catch (e) {
15+
if (e instanceof RE.CacheOverflowError) {
16+
fc.pre(false)
17+
} else {
18+
throw e
19+
}
20+
}
21+
}
22+
1123
describe('toString', () => {
1224

1325
it('output is accepted by RegExp constructor', () => {
@@ -47,36 +59,16 @@ describe('enumerate', () => {
4759
)
4860
})
4961

50-
// it.only('debug', () => {
51-
// const regexp = /^((a(fc)?([cef]|f*)|a*|([ce]b*e*(eb)*)*)((cd)*b*(ac*|d))*c)$/
52-
// const inputRegex = parseRegExp(regexp)
53-
54-
// // get words NOT in the output by enumerating words of the complement:
55-
// const inputRegexComplement = DFA.toStdRegex(RE.complement(inputRegex))
56-
// console.debug(RE.toRegExp(inputRegexComplement))
57-
// const allComplementWords = RE.enumerate(inputRegexComplement)
58-
59-
// // long words are likely result of repetition and are less interesting to test
60-
// // and also blow up memory:
61-
// const shortWords = Stream.takeWhile(word => word.length <= 30, allComplementWords)
62-
63-
// for (const complementWord of Stream.take(100, shortWords)) {
64-
// expect(complementWord).not.toMatch(regexp)
65-
// }
66-
// })
67-
6862
// completeness
6963
it('strings NOT in the output, do NOT match the input regex', () => {
7064
fc.assert(
7165
fc.property(
72-
// FIXME: have to exclude `star` because complement operation
73-
// then often leads to exponential blow-up:
74-
Arb.stdRegexNoStar(),
66+
Arb.stdRegex(),
7567
inputRegex => {
7668
const regexp = RE.toRegExp(inputRegex)
7769

7870
// get words NOT in the output by enumerating words of the complement:
79-
const inputRegexComplement = DFA.toStdRegex(RE.complement(inputRegex))
71+
const inputRegexComplement = toStdRegex_ignoreBlowUp(RE.complement(inputRegex))
8072
const allComplementWords = RE.enumerateAux(inputRegexComplement)
8173

8274
// long words are likely result of repetition and are less interesting to test
@@ -88,7 +80,8 @@ describe('enumerate', () => {
8880
}
8981
}
9082
),
91-
{ endOnFailure: true }
83+
// { endOnFailure: true }
84+
{ seed: -1078936918, path: "13", endOnFailure: true }
9285
)
9386
})
9487

@@ -180,7 +173,8 @@ describe('rewrite rules', () => {
180173
[/^(a|b)|a$/, /^([ab])$/],
181174
[/^(a?)?$/, /^(a?)$/],
182175
[/^(a*)?$/, /^(a*)$/],
183-
[/^(a|a*)$/, /^(aa*)$/],
176+
// TODO:
177+
// [/^(a|a*)$/, /^(aa*)$/],
184178
// union-of-concat rules:
185179
[/^ab|ac$/, /^(a[bc])$/],
186180
[/^ba|ca$/, /^([bc]a)$/],
@@ -199,3 +193,18 @@ describe('rewrite rules', () => {
199193
})
200194

201195
})
196+
197+
describe('derivative', () => {
198+
199+
it.each([
200+
[/^((aa*)?)$/, 'a', /^(a*)$/],
201+
[/^(a{2}(a{3})*)$/, 'a', /^(a(a{3})*)$/],
202+
[/^(a{2}(a*)|(aa*))$/, 'a', /^(a?a*)$/],
203+
[/^(a(a{3})*|(aa*)?)$/, 'a', /^((a{3})*|a*)$/],
204+
[/^(a{2}(a{3})*|(aa*)?)$/, 'a', /^(a(a{3})*|a*)$/],
205+
])('of %s with respect to "%s" is %s', (input, str, expected) => {
206+
const actual = RE.derivative(str, parseRegExp(input))
207+
expect(RE.toRegExp(actual)).toEqual(expected)
208+
})
209+
210+
})

0 commit comments

Comments
 (0)