diff --git a/src/__tests__/compiler.test.tsx b/src/__tests__/encoder.test.tsx similarity index 89% rename from src/__tests__/compiler.test.tsx rename to src/__tests__/encoder.test.tsx index 17308ce..56fc975 100644 --- a/src/__tests__/compiler.test.tsx +++ b/src/__tests__/encoder.test.tsx @@ -1,4 +1,4 @@ -import { buildPattern, buildRegex } from '../compiler'; +import { buildPattern, buildRegex } from '..'; import { one, oneOrMore, optionally, zeroOrMore } from '../quantifiers/base'; import { repeat } from '../quantifiers/repeat'; @@ -56,3 +56,9 @@ test('buildRegex throws error on unknown element', () => { buildRegex({ type: 'unknown' }) ).toThrowErrorMatchingInlineSnapshot(`"Unknown elements type unknown"`); }); + +test('buildPattern throws on empty text', () => { + expect(() => buildPattern('')).toThrowErrorMatchingInlineSnapshot( + `"\`encodeText\`: received text should not be empty"` + ); +}); diff --git a/src/character-classes/__tests__/any-of.test.ts b/src/character-classes/__tests__/any-of.test.ts index 0876929..2b412ab 100644 --- a/src/character-classes/__tests__/any-of.test.ts +++ b/src/character-classes/__tests__/any-of.test.ts @@ -1,23 +1,28 @@ -import { buildPattern as p } from '../../compiler'; +import { buildPattern } from '../..'; import { oneOrMore } from '../../quantifiers/base'; import { anyOf } from '../any-of'; test('"anyOf" base cases', () => { - expect(p(anyOf(''))).toBe(''); - expect(p(anyOf('a'))).toBe('a'); - expect(p(anyOf('abc'))).toBe('[abc]'); + expect(buildPattern(anyOf('a'))).toBe('a'); + expect(buildPattern(anyOf('abc'))).toBe('[abc]'); }); test('"anyOf" in context', () => { - expect(p('x', anyOf('a'), 'x')).toBe('xax'); - expect(p('x', anyOf('abc'), 'x')).toBe('x[abc]x'); - expect(p('x', oneOrMore(anyOf('abc')), 'x')).toBe('x(?:[abc])+x'); + expect(buildPattern('x', anyOf('a'), 'x')).toBe('xax'); + expect(buildPattern('x', anyOf('abc'), 'x')).toBe('x[abc]x'); + expect(buildPattern('x', oneOrMore(anyOf('abc')), 'x')).toBe('x[abc]+x'); }); test('"anyOf" escapes special characters', () => { - expect(p(anyOf('abc-+.'))).toBe('[-abc\\+\\.]'); + expect(buildPattern(anyOf('abc-+.'))).toBe('[-abc\\+\\.]'); }); test('"anyOf" moves hyphen to the first position', () => { - expect(p(anyOf('a-bc'))).toBe('[-abc]'); + expect(buildPattern(anyOf('a-bc'))).toBe('[-abc]'); +}); + +test('`anyOf` throws on empty text', () => { + expect(() => anyOf('')).toThrowErrorMatchingInlineSnapshot( + `"\`anyOf\` should received at least one character"` + ); }); diff --git a/src/character-classes/__tests__/base.test.ts b/src/character-classes/__tests__/base.test.ts index 0638662..37b5861 100644 --- a/src/character-classes/__tests__/base.test.ts +++ b/src/character-classes/__tests__/base.test.ts @@ -1,6 +1,6 @@ -import { any, digit, whitespace, word } from '../base'; -import { buildPattern } from '../../compiler'; +import { buildPattern } from '../..'; import { one } from '../../quantifiers/base'; +import { any, digit, whitespace, word } from '../base'; test('"whitespace" character class', () => { expect(buildPattern(whitespace)).toEqual(`\\s`); diff --git a/src/character-classes/__tests__/encoder.test.ts b/src/character-classes/__tests__/encoder.test.ts new file mode 100644 index 0000000..6b7ae77 --- /dev/null +++ b/src/character-classes/__tests__/encoder.test.ts @@ -0,0 +1,12 @@ +import { encodeCharacterClass } from '../encoder'; + +test('buildPattern throws on empty text', () => { + expect(() => + encodeCharacterClass({ + type: 'characterClass', + characters: [], + }) + ).toThrowErrorMatchingInlineSnapshot( + `"Character class should contain at least one character"` + ); +}); diff --git a/src/character-classes/any-of.ts b/src/character-classes/any-of.ts index 6776b38..23c0458 100644 --- a/src/character-classes/any-of.ts +++ b/src/character-classes/any-of.ts @@ -2,8 +2,13 @@ import type { CharacterClass } from '../types'; import { escapeText } from '../utils'; export function anyOf(characters: string): CharacterClass { + const charactersArray = characters.split('').map(escapeText); + if (charactersArray.length === 0) { + throw new Error('`anyOf` should received at least one character'); + } + return { type: 'characterClass', - characters: characters.split('').map(escapeText), + characters: charactersArray, }; } diff --git a/src/character-classes/base.ts b/src/character-classes/base.ts index 6042cff..e620ac6 100644 --- a/src/character-classes/base.ts +++ b/src/character-classes/base.ts @@ -1,5 +1,10 @@ import type { CharacterClass } from '../types'; +export const any: CharacterClass = { + type: 'characterClass', + characters: ['.'], +}; + export const whitespace: CharacterClass = { type: 'characterClass', characters: ['\\s'], @@ -14,8 +19,3 @@ export const word: CharacterClass = { type: 'characterClass', characters: ['\\w'], }; - -export const any: CharacterClass = { - type: 'characterClass', - characters: ['.'], -}; diff --git a/src/character-classes/compiler.ts b/src/character-classes/encoder.ts similarity index 51% rename from src/character-classes/compiler.ts rename to src/character-classes/encoder.ts index 5ce0a46..62a3ff3 100644 --- a/src/character-classes/compiler.ts +++ b/src/character-classes/encoder.ts @@ -1,21 +1,30 @@ import type { CharacterClass } from '../types'; +import { EncoderPriority, type EncoderNode } from '../types-internal'; -export function compileCharacterClass({ characters }: CharacterClass): string { +export function encodeCharacterClass({ + characters, +}: CharacterClass): EncoderNode { if (characters.length === 0) { - return ''; + throw new Error('Character class should contain at least one character'); } if (characters.length === 1) { - return characters[0]!; + return { + priority: EncoderPriority.Atom, + pattern: characters[0]!, + }; } - return `[${escapeHyphen(characters).join('')}]`; + return { + priority: EncoderPriority.Atom, + pattern: `[${reorderHyphen(characters).join('')}]`, + }; } // If passed characters includes hyphen (`-`) it need to be moved to // first (or last) place in order to treat it as hyphen character and not a range. // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types -function escapeHyphen(characters: string[]) { +function reorderHyphen(characters: string[]) { if (characters.includes('-')) { return ['-', ...characters.filter((c) => c !== '-')]; } diff --git a/src/compiler.ts b/src/compiler.ts deleted file mode 100644 index c3cbf82..0000000 --- a/src/compiler.ts +++ /dev/null @@ -1,60 +0,0 @@ -import type { RegexElement } from './types'; -import { compileChoiceOf } from './components/choiceOf'; -import { compileCharacterClass } from './character-classes/compiler'; -import { baseQuantifiers, isBaseQuantifier } from './quantifiers/base'; -import { compileRepeat } from './quantifiers/repeat'; -import { escapeText } from './utils'; - -/** - * Generate RegExp object for elements. - * - * @param elements - * @returns - */ -export function buildRegex(...elements: RegexElement[]): RegExp { - const pattern = compileList(elements); - return new RegExp(pattern); -} - -/** - * Generate regex pattern for elements. - * @param elements - * @returns - */ -export function buildPattern(...elements: RegexElement[]): string { - return compileList(elements); -} - -// Recursive compilation - -function compileList(elements: RegexElement[]): string { - return elements.map((c) => compileSingle(c)).join(''); -} - -function compileSingle(element: RegexElement): string { - if (typeof element === 'string') { - return escapeText(element); - } - - if (element.type === 'characterClass') { - return compileCharacterClass(element); - } - - if (element.type === 'choiceOf') { - return compileChoiceOf(element, compileSingle); - } - - if (element.type === 'repeat') { - const compiledChildren = compileList(element.children); - return compileRepeat(element.config, compiledChildren); - } - - if (isBaseQuantifier(element)) { - const compiledChildren = compileList(element.children); - const compiler = baseQuantifiers[element.type]; - return compiler(compiledChildren); - } - - // @ts-expect-error User passed incorrect type - throw new Error(`Unknown elements type ${element.type}`); -} diff --git a/src/components/__tests__/choiceOf.test.ts b/src/components/__tests__/choiceOf.test.ts index e5a765b..2ff500b 100644 --- a/src/components/__tests__/choiceOf.test.ts +++ b/src/components/__tests__/choiceOf.test.ts @@ -1,23 +1,38 @@ -import { buildPattern } from '../../compiler'; +import { buildPattern } from '../..'; import { oneOrMore, zeroOrMore } from '../../quantifiers/base'; import { repeat } from '../../quantifiers/repeat'; import { choiceOf } from '../choiceOf'; test('"choiceOf" using basic strings', () => { expect(buildPattern(choiceOf('a'))).toEqual('a'); - expect(buildPattern(choiceOf('a', 'b'))).toEqual('(?:a|b)'); - expect(buildPattern(choiceOf('a', 'b', 'c'))).toEqual('(?:a|b|c)'); + expect(buildPattern(choiceOf('a', 'b'))).toEqual('a|b'); + expect(buildPattern(choiceOf('a', 'b', 'c'))).toEqual('a|b|c'); + expect(buildPattern(choiceOf('aaa', 'bbb'))).toEqual('aaa|bbb'); +}); + +test('"choiceOf" used in sequence', () => { + expect(buildPattern('x', choiceOf('a'), 'x')).toEqual('xax'); + expect(buildPattern(choiceOf('a', 'b'), 'x')).toEqual('(?:a|b)x'); + expect(buildPattern('x', choiceOf('a', 'b'))).toEqual('x(?:a|b)'); - expect(buildPattern(choiceOf('aaa', 'bbb'))).toEqual('(?:aaa|bbb)'); + expect(buildPattern(choiceOf('a', 'b', 'c'))).toEqual('a|b|c'); + expect(buildPattern('x', choiceOf('a', 'b', 'c'))).toEqual('x(?:a|b|c)'); + expect(buildPattern(choiceOf('a', 'b', 'c'), 'x')).toEqual('(?:a|b|c)x'); + + expect(buildPattern(choiceOf('aaa', 'bbb'))).toEqual('aaa|bbb'); }); test('"choiceOf" using nested regex', () => { - expect(buildPattern(choiceOf(oneOrMore('a'), zeroOrMore('b')))).toBe( - '(?:a+|b*)' - ); + expect(buildPattern(choiceOf(oneOrMore('a'), zeroOrMore('b')))).toBe('a+|b*'); expect( buildPattern( choiceOf(repeat({ min: 1, max: 3 }, 'a'), repeat({ count: 5 }, 'bx')) ) - ).toBe('(?:a{1,3}|(?:bx){5})'); + ).toBe('a{1,3}|(?:bx){5}'); +}); + +test('`anyOf` throws on empty options', () => { + expect(() => choiceOf()).toThrowErrorMatchingInlineSnapshot( + `"\`choiceOf\` should receive at least one option"` + ); }); diff --git a/src/components/choiceOf.ts b/src/components/choiceOf.ts index 62d0cac..0d1afa8 100644 --- a/src/components/choiceOf.ts +++ b/src/components/choiceOf.ts @@ -1,18 +1,32 @@ import type { ChoiceOf, RegexElement } from '../types'; -import type { CompileSingle } from '../types-internal'; -import { wrapGroup } from '../utils'; +import { + EncoderPriority, + type EncodeElement, + type EncoderNode, +} from '../types-internal'; export function choiceOf(...children: RegexElement[]): ChoiceOf { + if (children.length === 0) { + throw new Error('`choiceOf` should receive at least one option'); + } + return { type: 'choiceOf', children, }; } -export function compileChoiceOf( +export function encodeChoiceOf( element: ChoiceOf, - compileSingle: CompileSingle -): string { - const compiledChildren = element.children.map(compileSingle); - return wrapGroup(compiledChildren.join('|')); + encodeElement: EncodeElement +): EncoderNode { + const encodedNodes = element.children.map(encodeElement); + if (encodedNodes.length === 1) { + return encodedNodes[0]!; + } + + return { + priority: EncoderPriority.Alternation, + pattern: encodedNodes.map((n) => n.pattern).join('|'), + }; } diff --git a/src/encoder.ts b/src/encoder.ts new file mode 100644 index 0000000..5438656 --- /dev/null +++ b/src/encoder.ts @@ -0,0 +1,71 @@ +import type { RegexElement } from './types'; +import { EncoderPriority, type EncoderNode } from './types-internal'; +import { encodeChoiceOf } from './components/choiceOf'; +import { encodeCharacterClass } from './character-classes/encoder'; +import { + encodeOne, + encodeOneOrMore, + encodeOptionally, + encodeZeroOrMore, +} from './quantifiers/base'; +import { encodeRepeat } from './quantifiers/repeat'; +import { concatNodes, escapeText } from './utils'; + +export function encodeSequence(elements: RegexElement[]): EncoderNode { + return concatNodes(elements.map((c) => encodeElement(c))); +} + +export function encodeElement(element: RegexElement): EncoderNode { + if (typeof element === 'string') { + return encodeText(element); + } + + if (element.type === 'characterClass') { + return encodeCharacterClass(element); + } + + if (element.type === 'choiceOf') { + return encodeChoiceOf(element, encodeElement); + } + + if (element.type === 'repeat') { + return encodeRepeat(element.config, encodeSequence(element.children)); + } + + if (element.type === 'one') { + return encodeOne(encodeSequence(element.children)); + } + + if (element.type === 'oneOrMore') { + return encodeOneOrMore(encodeSequence(element.children)); + } + + if (element.type === 'optionally') { + return encodeOptionally(encodeSequence(element.children)); + } + + if (element.type === 'zeroOrMore') { + return encodeZeroOrMore(encodeSequence(element.children)); + } + + // @ts-expect-error User passed incorrect type + throw new Error(`Unknown elements type ${element.type}`); +} + +function encodeText(text: string): EncoderNode { + if (text.length === 0) { + throw new Error('`encodeText`: received text should not be empty'); + } + + if (text.length === 1) { + return { + priority: EncoderPriority.Atom, + pattern: escapeText(text), + }; + } + + return { + priority: EncoderPriority.Sequence, + pattern: escapeText(text), + }; +} diff --git a/src/index.ts b/src/index.ts index 9e536e0..2d9c1ad 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,9 +1,30 @@ -export type * from './types'; +import type { RegexElement } from './types'; +import { encodeSequence } from './encoder'; -export { buildRegex, buildPattern } from './compiler'; +export type * from './types'; export { any, digit, whitespace, word } from './character-classes/base'; export { anyOf } from './character-classes/any-of'; export { one, oneOrMore, optionally, zeroOrMore } from './quantifiers/base'; export { repeat } from './quantifiers/repeat'; export { choiceOf } from './components/choiceOf'; + +/** + * Generate RegExp object for elements. + * + * @param elements + * @returns + */ +export function buildRegex(...elements: RegexElement[]): RegExp { + const pattern = encodeSequence(elements).pattern; + return new RegExp(pattern); +} + +/** + * Generate regex pattern for elements. + * @param elements + * @returns + */ +export function buildPattern(...elements: RegexElement[]): string { + return encodeSequence(elements).pattern; +} diff --git a/src/index.tsx b/src/index.tsx deleted file mode 100644 index abc20f1..0000000 --- a/src/index.tsx +++ /dev/null @@ -1,5 +0,0 @@ -export type * from './types'; - -export { whitespace } from './character-classes/base'; -export { buildRegex, buildPattern } from './compiler'; -export { oneOrMore, optionally } from './quantifiers/base'; diff --git a/src/quantifiers/__tests__/base.test.tsx b/src/quantifiers/__tests__/base.test.tsx index 05cd750..de1d3a3 100644 --- a/src/quantifiers/__tests__/base.test.tsx +++ b/src/quantifiers/__tests__/base.test.tsx @@ -1,5 +1,6 @@ +import { buildPattern, buildRegex } from '../..'; +import { digit } from '../../character-classes/base'; import { one, oneOrMore, optionally, zeroOrMore } from '../base'; -import { buildPattern, buildRegex } from '../../compiler'; test('"oneOrMore" quantifier', () => { expect(buildPattern(oneOrMore('a'))).toEqual('a+'); @@ -44,3 +45,10 @@ test('zeroOrMore does not generate capture when grouping', () => { const groups = [...'aa'.match(regex)!]; expect(groups).toEqual(['aa']); }); + +test('base quantifiers optimize grouping for atoms', () => { + expect(buildPattern(one(digit))).toBe('\\d'); + expect(buildPattern(oneOrMore(digit))).toBe('\\d+'); + expect(buildPattern(optionally(digit))).toBe('\\d?'); + expect(buildPattern(zeroOrMore(digit))).toBe('\\d*'); +}); diff --git a/src/quantifiers/__tests__/repeat.test.tsx b/src/quantifiers/__tests__/repeat.test.tsx index d0015c7..9340ab5 100644 --- a/src/quantifiers/__tests__/repeat.test.tsx +++ b/src/quantifiers/__tests__/repeat.test.tsx @@ -1,4 +1,5 @@ -import { buildPattern } from '../../compiler'; +import { buildPattern } from '../..'; +import { digit } from '../../character-classes/base'; import { zeroOrMore, oneOrMore } from '../base'; import { repeat } from '../repeat'; @@ -14,3 +15,15 @@ test('"repeat" quantifier', () => { buildPattern(repeat({ count: 5 }, 'text', ' ', oneOrMore('d'))) ).toEqual('(?:text d+){5}'); }); + +test('"repeat"" optimizes grouping for atoms', () => { + expect(buildPattern(repeat({ count: 2 }, digit))).toBe('\\d{2}'); + expect(buildPattern(repeat({ min: 2 }, digit))).toBe('\\d{2,}'); + expect(buildPattern(repeat({ min: 1, max: 5 }, digit))).toBe('\\d{1,5}'); +}); + +test('`repeat` throws on no children', () => { + expect(() => repeat({ count: 1 })).toThrowErrorMatchingInlineSnapshot( + `"\`repeat\` should receive at least one element"` + ); +}); diff --git a/src/quantifiers/base.ts b/src/quantifiers/base.ts index 7f12221..7dda93a 100644 --- a/src/quantifiers/base.ts +++ b/src/quantifiers/base.ts @@ -2,11 +2,11 @@ import type { One, OneOrMore, Optionally, - Quantifier, RegexElement, ZeroOrMore, } from '../types'; -import { wrapGroup } from '../utils'; +import { EncoderPriority, type EncoderNode } from '../types-internal'; +import { toAtom } from '../utils'; export function one(...children: RegexElement[]): One { return { @@ -36,15 +36,27 @@ export function zeroOrMore(...children: RegexElement[]): ZeroOrMore { }; } -export const baseQuantifiers = { - one: (compiledChildren) => compiledChildren, - oneOrMore: (compiledChildren) => `${wrapGroup(compiledChildren)}+`, - optionally: (compiledChildren) => `${wrapGroup(compiledChildren)}?`, - zeroOrMore: (compiledChildren) => `${wrapGroup(compiledChildren)}*`, -} as const satisfies Record string>; +export function encodeOne(node: EncoderNode) { + return node; +} + +export function encodeOneOrMore(node: EncoderNode): EncoderNode { + return { + priority: EncoderPriority.Sequence, + pattern: `${toAtom(node)}+`, + }; +} -export function isBaseQuantifier( - element: Exclude -): element is Quantifier { - return element.type in baseQuantifiers; +export function encodeOptionally(node: EncoderNode): EncoderNode { + return { + priority: EncoderPriority.Sequence, + pattern: `${toAtom(node)}?`, + }; +} + +export function encodeZeroOrMore(node: EncoderNode): EncoderNode { + return { + priority: EncoderPriority.Sequence, + pattern: `${toAtom(node)}*`, + }; } diff --git a/src/quantifiers/repeat.ts b/src/quantifiers/repeat.ts index c6c6288..76a17de 100644 --- a/src/quantifiers/repeat.ts +++ b/src/quantifiers/repeat.ts @@ -1,10 +1,15 @@ import type { RegexElement, Repeat, RepeatConfig } from '../types'; -import { wrapGroup } from '../utils'; +import { EncoderPriority, type EncoderNode } from '../types-internal'; +import { toAtom } from '../utils'; export function repeat( config: RepeatConfig, ...children: RegexElement[] ): Repeat { + if (children.length === 0) { + throw new Error('`repeat` should receive at least one element'); + } + return { type: 'repeat', children, @@ -12,13 +17,19 @@ export function repeat( }; } -export function compileRepeat( +export function encodeRepeat( config: RepeatConfig, - compiledChildren: string -): string { + node: EncoderNode +): EncoderNode { if ('count' in config) { - return `${wrapGroup(compiledChildren)}{${config.count}}`; + return { + priority: EncoderPriority.Sequence, + pattern: `${toAtom(node)}{${config.count}}`, + }; } - return `${wrapGroup(compiledChildren)}{${config.min},${config?.max ?? ''}}`; + return { + priority: EncoderPriority.Sequence, + pattern: `${toAtom(node)}{${config.min},${config?.max ?? ''}}`, + }; } diff --git a/src/types-internal.ts b/src/types-internal.ts index ab54caf..6496e82 100644 --- a/src/types-internal.ts +++ b/src/types-internal.ts @@ -1,3 +1,29 @@ import type { RegexElement } from './types'; -export type CompileSingle = (element: RegexElement) => string; +/** + * Encoded regex pattern with information about its type (atom, sequence) + */ +export interface EncoderNode { + pattern: string; + priority: EncoderPriority; +} + +/** + * Higher is more important. + */ +export const EncoderPriority = { + // Atoms: single characters, character classes (`\d`, `[a-z]`), + // capturing and non-capturing groups (`()`) + Atom: 3, + + // Sequence of atoms, e.g., `abc` + Sequence: 2, + + // Alteration (OR, `|`) expression, e.g., `a|b` + Alternation: 1, +} as const; + +type ValueOf = T[keyof T]; +type EncoderPriority = ValueOf; + +export type EncodeElement = (element: RegexElement) => EncoderNode; diff --git a/src/types.ts b/src/types.ts index 32b4ed6..c65ccd3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -29,11 +29,10 @@ export type Optionally = { children: RegexElement[]; }; -export type RepeatConfig = - | { min: number; max?: number } - | { - count: number; - }; +export type ZeroOrMore = { + type: 'zeroOrMore'; + children: RegexElement[]; +}; export type Repeat = { type: 'repeat'; @@ -41,7 +40,4 @@ export type Repeat = { config: RepeatConfig; }; -export type ZeroOrMore = { - type: 'zeroOrMore'; - children: RegexElement[]; -}; +export type RepeatConfig = { count: number } | { min: number; max?: number }; diff --git a/src/utils.ts b/src/utils.ts index 6b48f1f..4383111 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,11 +1,32 @@ +import { EncoderPriority, type EncoderNode } from './types-internal'; + /** - * Wraps regex string in a non-capturing group if it is more than one character long. + * Returns atomic pattern for given node. * - * @param regex + * @param node * @returns */ -export function wrapGroup(regex: string): string { - return regex.length === 1 ? regex : `(?:${regex})`; +export function toAtom(node: EncoderNode): string { + if (node.priority === EncoderPriority.Atom) { + return node.pattern; + } + + return `(?:${node.pattern})`; +} + +export function concatNodes(nodes: EncoderNode[]): EncoderNode { + if (nodes.length === 1) { + return nodes[0]!; + } + + return { + priority: EncoderPriority.Sequence, + pattern: nodes + .map((n) => + n.priority < EncoderPriority.Sequence ? toAtom(n) : n.pattern + ) + .join(''), + }; } // Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping