From e0aaddd05ab6f4f28aecaee6dd7c7967f9f65323 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Tue, 30 Apr 2024 16:17:04 +0200 Subject: [PATCH 01/18] refactor: simplify anchors --- src/constructs/anchors.ts | 41 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/src/constructs/anchors.ts b/src/constructs/anchors.ts index 718e7d6..15606fc 100644 --- a/src/constructs/anchors.ts +++ b/src/constructs/anchors.ts @@ -1,43 +1,26 @@ import type { EncodeResult } from '../encoder/types'; -import type { RegexConstruct } from '../types'; -export interface Anchor extends RegexConstruct { - type: 'anchor'; - symbol: string; -} - -export const startOfString: Anchor = { - type: 'anchor', - symbol: '^', - encode: encodeAnchor, +export const startOfString: EncodeResult = { + precedence: 'atom', + pattern: '^', }; -export const endOfString: Anchor = { - type: 'anchor', - symbol: '$', - encode: encodeAnchor, +export const endOfString: EncodeResult = { + precedence: 'atom', + pattern: '$', }; -export const wordBoundary: Anchor = { - type: 'anchor', - symbol: '\\b', - encode: encodeAnchor, +export const wordBoundary: EncodeResult = { + precedence: 'atom', + pattern: '\\b', }; -export const nonWordBoundary: Anchor = { - type: 'anchor', - symbol: '\\B', - encode: encodeAnchor, +export const nonWordBoundary: EncodeResult = { + precedence: 'atom', + pattern: '\\B', }; /** * @deprecated Renamed to `nonWordBoundary`. 
*/ export const notWordBoundary = nonWordBoundary; - -function encodeAnchor(this: Anchor): EncodeResult { - return { - precedence: 'sequence', - pattern: this.symbol, - }; -} From 22bf39b7aa1787faa9606f6b45f24b79ddd438cf Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Tue, 30 Apr 2024 16:23:30 +0200 Subject: [PATCH 02/18] refactor: qualifiers --- src/constructs/quantifiers.ts | 59 +++++------------------------------ src/encoder/encoder.ts | 10 +++--- 2 files changed, 13 insertions(+), 56 deletions(-) diff --git a/src/constructs/quantifiers.ts b/src/constructs/quantifiers.ts index bf3d575..d3058d3 100644 --- a/src/constructs/quantifiers.ts +++ b/src/constructs/quantifiers.ts @@ -1,73 +1,28 @@ import { encodeAtom } from '../encoder/encoder'; import type { EncodeResult } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import type { RegexSequence } from '../types'; export interface QuantifierOptions { greedy?: boolean; } -export interface ZeroOrMore extends RegexConstruct { - type: 'zeroOrMore'; - children: RegexElement[]; - options?: QuantifierOptions; -} - -export interface OneOrMore extends RegexConstruct { - type: 'oneOrMore'; - children: RegexElement[]; - options?: QuantifierOptions; -} - -export interface Optional extends RegexConstruct { - type: 'optional'; - children: RegexElement[]; - options?: QuantifierOptions; -} - -export function zeroOrMore(sequence: RegexSequence, options?: QuantifierOptions): ZeroOrMore { - return { - type: 'zeroOrMore', - children: ensureArray(sequence), - options, - encode: encodeZeroOrMore, - }; -} - -export function oneOrMore(sequence: RegexSequence, options?: QuantifierOptions): OneOrMore { - return { - type: 'oneOrMore', - children: ensureArray(sequence), - options, - encode: encodeOneOrMore, - }; -} - -export function optional(sequence: RegexSequence, options?: QuantifierOptions): Optional { - return { - type: 
'optional', - children: ensureArray(sequence), - options, - encode: encodeOptional, - }; -} -function encodeZeroOrMore(this: ZeroOrMore): EncodeResult { +export function zeroOrMore(sequence: RegexSequence, options?: QuantifierOptions): EncodeResult { return { precedence: 'sequence', - pattern: `${encodeAtom(this.children).pattern}*${this.options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtom(sequence).pattern}*${options?.greedy === false ? '?' : ''}`, }; } -function encodeOneOrMore(this: OneOrMore): EncodeResult { +export function oneOrMore(sequence: RegexSequence, options?: QuantifierOptions): EncodeResult { return { precedence: 'sequence', - pattern: `${encodeAtom(this.children).pattern}+${this.options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtom(sequence).pattern}+${options?.greedy === false ? '?' : ''}`, }; } -function encodeOptional(this: Optional): EncodeResult { +export function optional(sequence: RegexSequence, options?: QuantifierOptions): EncodeResult { return { precedence: 'sequence', - pattern: `${encodeAtom(this.children).pattern}?${this.options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtom(sequence).pattern}?${options?.greedy === false ? '?' 
: ''}`, }; } diff --git a/src/encoder/encoder.ts b/src/encoder/encoder.ts index bd61dad..a8718dd 100644 --- a/src/encoder/encoder.ts +++ b/src/encoder/encoder.ts @@ -1,14 +1,16 @@ -import type { RegexElement } from '../types'; +import type { RegexElement, RegexSequence } from '../types'; +import { ensureArray } from '../utils/elements'; import { escapeText } from '../utils/text'; import type { EncodeResult } from './types'; -export function encodeSequence(elements: RegexElement[]): EncodeResult { +export function encodeSequence(sequence: RegexSequence): EncodeResult { + const elements = ensureArray(sequence); const encodedNodes = elements.map((n) => encodeNode(n)); return concatSequence(encodedNodes); } -export function encodeAtom(elements: RegexElement[]): EncodeResult { - return wrapAtom(encodeSequence(elements)); +export function encodeAtom(sequence: RegexSequence): EncodeResult { + return wrapAtom(encodeSequence(sequence)); } function encodeNode(element: RegexElement): EncodeResult { From b0476ceed87f6b2e9a6f14d582c77c5fa32e5cf6 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 09:24:10 +0200 Subject: [PATCH 03/18] refactor: rename and capture --- src/constructs/anchors.ts | 10 ++--- src/constructs/capture.ts | 56 ++++++++------------------- src/constructs/char-class.ts | 6 +-- src/constructs/char-escape.ts | 6 +-- src/constructs/choice-of.ts | 4 +- src/constructs/lookahead.ts | 4 +- src/constructs/lookbehind.ts | 4 +- src/constructs/negative-lookahead.ts | 4 +- src/constructs/negative-lookbehind.ts | 4 +- src/constructs/quantifiers.ts | 8 ++-- src/constructs/regex.ts | 4 +- src/constructs/repeat.ts | 4 +- src/encoder/encoder.ts | 16 ++++---- src/encoder/types.ts | 2 +- src/types.ts | 6 +-- 15 files changed, 57 insertions(+), 81 deletions(-) diff --git a/src/constructs/anchors.ts b/src/constructs/anchors.ts index 15606fc..20b0b94 100644 --- a/src/constructs/anchors.ts +++ b/src/constructs/anchors.ts @@ -1,21 +1,21 @@ -import type { 
EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; -export const startOfString: EncodeResult = { +export const startOfString: EncodedRegex = { precedence: 'atom', pattern: '^', }; -export const endOfString: EncodeResult = { +export const endOfString: EncodedRegex = { precedence: 'atom', pattern: '$', }; -export const wordBoundary: EncodeResult = { +export const wordBoundary: EncodedRegex = { precedence: 'atom', pattern: '\\b', }; -export const nonWordBoundary: EncodeResult = { +export const nonWordBoundary: EncodedRegex = { precedence: 'atom', pattern: '\\B', }; diff --git a/src/constructs/capture.ts b/src/constructs/capture.ts index 471c463..799d4fb 100644 --- a/src/constructs/capture.ts +++ b/src/constructs/capture.ts @@ -1,13 +1,6 @@ import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; - -export interface Capture extends RegexConstruct { - type: 'capture'; - children: RegexElement[]; - options?: CaptureOptions; -} +import type { EncodedRegex } from '../encoder/types'; +import type { RegexSequence } from '../types'; export type CaptureOptions = { /** @@ -16,8 +9,7 @@ export type CaptureOptions = { name?: string; }; -export interface Reference extends RegexConstruct { - type: 'reference'; +export interface Reference extends EncodedRegex { name: string; } @@ -26,12 +18,18 @@ export interface Reference extends RegexConstruct { * - in the match results (`String.match`, `String.matchAll`, or `RegExp.exec`) * - in the regex itself, through {@link ref} */ -export function capture(sequence: RegexSequence, options?: CaptureOptions): Capture { +export function capture(sequence: RegexSequence, options?: CaptureOptions): EncodedRegex { + const name = options?.name; + if (name) { + return { + precedence: 'atom', + pattern: 
`(?<${name}>${encodeSequence(sequence).pattern})`, + }; + } + return { - type: 'capture', - children: ensureArray(sequence), - options, - encode: encodeCapture, + precedence: 'atom', + pattern: `(${encodeSequence(sequence).pattern})`, }; } @@ -45,31 +43,9 @@ export function capture(sequence: RegexSequence, options?: CaptureOptions): Capt * @param name - Name of the capturing group to reference. */ export function ref(name: string): Reference { - return { - type: 'reference', - name, - encode: encodeReference, - }; -} - -function encodeCapture(this: Capture): EncodeResult { - const name = this.options?.name; - if (name) { - return { - precedence: 'atom', - pattern: `(?<${name}>${encodeSequence(this.children).pattern})`, - }; - } - return { precedence: 'atom', - pattern: `(${encodeSequence(this.children).pattern})`, - }; -} - -function encodeReference(this: Reference): EncodeResult { - return { - precedence: 'atom', - pattern: `\\k<${this.name}>`, + pattern: `\\k<${name}>`, + name, }; } diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index ed90621..b7634a1 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -1,4 +1,4 @@ -import type { EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; import type { RegexConstruct } from '../types'; import type { CharacterEscape } from './char-escape'; @@ -60,7 +60,7 @@ export function anyOf(characters: string): CharacterClass { }; } -export function negated(element: CharacterClass | CharacterEscape): EncodeResult { +export function negated(element: CharacterClass | CharacterEscape): EncodedRegex { return encodeCharacterClass.call(element, true); } @@ -72,7 +72,7 @@ export const inverted = negated; export function encodeCharacterClass( this: CharacterClass | CharacterEscape, isNegated?: boolean, -): EncodeResult { +): EncodedRegex { if (!this.chars.length && !this.ranges?.length) { throw new Error('Character class should contain at least 
one character or character range'); } diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index dfbd35e..d3cb0f2 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -1,6 +1,6 @@ -import type { EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; -export interface CharacterEscape extends EncodeResult { +export interface CharacterEscape extends EncodedRegex { kind: 'escape'; // `CharacterClass` compatibility @@ -12,7 +12,7 @@ export interface CharacterEscape extends EncodeResult { * Matches any single character. * Specifically this one is NOT a character escape. */ -export const any: EncodeResult = { +export const any: EncodedRegex = { precedence: 'atom', pattern: '.', }; diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index d2bd3ac..592f2bc 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -1,5 +1,5 @@ import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; import { ensureArray } from '../utils/elements'; import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; @@ -20,7 +20,7 @@ export function choiceOf(...alternatives: RegexSequence[]): ChoiceOf { }; } -function encodeChoiceOf(this: ChoiceOf): EncodeResult { +function encodeChoiceOf(this: ChoiceOf): EncodedRegex { const encodedAlternatives = this.alternatives.map((c) => encodeSequence(c)); if (encodedAlternatives.length === 1) { return encodedAlternatives[0]!; diff --git a/src/constructs/lookahead.ts b/src/constructs/lookahead.ts index 5715dad..811e2b0 100644 --- a/src/constructs/lookahead.ts +++ b/src/constructs/lookahead.ts @@ -1,5 +1,5 @@ import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; import { ensureArray } from 
'../utils/elements'; import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; @@ -30,7 +30,7 @@ export function lookahead(sequence: RegexSequence): Lookahead { }; } -function encodeLookahead(this: Lookahead): EncodeResult { +function encodeLookahead(this: Lookahead): EncodedRegex { return { precedence: 'atom', pattern: `(?=${encodeSequence(this.children).pattern})`, diff --git a/src/constructs/lookbehind.ts b/src/constructs/lookbehind.ts index 0ed418e..ff9c9a5 100644 --- a/src/constructs/lookbehind.ts +++ b/src/constructs/lookbehind.ts @@ -1,5 +1,5 @@ import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; import { ensureArray } from '../utils/elements'; import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; @@ -30,7 +30,7 @@ export function lookbehind(sequence: RegexSequence): Lookbehind { }; } -function encodeLookbehind(this: Lookbehind): EncodeResult { +function encodeLookbehind(this: Lookbehind): EncodedRegex { return { precedence: 'atom', pattern: `(?<=${encodeSequence(this.children).pattern})`, diff --git a/src/constructs/negative-lookahead.ts b/src/constructs/negative-lookahead.ts index 18b9a18..0a41645 100644 --- a/src/constructs/negative-lookahead.ts +++ b/src/constructs/negative-lookahead.ts @@ -1,5 +1,5 @@ import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; import { ensureArray } from '../utils/elements'; import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; @@ -30,7 +30,7 @@ export function negativeLookahead(sequence: RegexSequence): NegativeLookahead { }; } -function encodeNegativeLookahead(this: NegativeLookahead): EncodeResult { +function encodeNegativeLookahead(this: NegativeLookahead): EncodedRegex { return { precedence: 'atom', pattern: 
`(?!${encodeSequence(this.children).pattern})`, diff --git a/src/constructs/negative-lookbehind.ts b/src/constructs/negative-lookbehind.ts index f2e5fcc..0572a83 100644 --- a/src/constructs/negative-lookbehind.ts +++ b/src/constructs/negative-lookbehind.ts @@ -1,5 +1,5 @@ import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; +import type { EncodedRegex } from '../encoder/types'; import { ensureArray } from '../utils/elements'; import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; @@ -30,7 +30,7 @@ export function negativeLookbehind(sequence: RegexSequence): NegativeLookbehind }; } -function encodeNegativeLookbehind(this: NegativeLookbehind): EncodeResult { +function encodeNegativeLookbehind(this: NegativeLookbehind): EncodedRegex { return { precedence: 'atom', pattern: `(? encodeNode(n)); return concatSequence(encodedNodes); } -export function encodeAtom(sequence: RegexSequence): EncodeResult { +export function encodeAtom(sequence: RegexSequence): EncodedRegex { return wrapAtom(encodeSequence(sequence)); } -function encodeNode(element: RegexElement): EncodeResult { +function encodeNode(element: RegexElement): EncodedRegex { if (typeof element === 'string') { return encodeText(element); } @@ -33,7 +33,7 @@ function encodeNode(element: RegexElement): EncodeResult { return element.encode(); } -function encodeText(text: string): EncodeResult { +function encodeText(text: string): EncodedRegex { if (text.length === 0) { throw new Error('`encodeText`: received text should not be empty'); } @@ -52,7 +52,7 @@ function encodeText(text: string): EncodeResult { }; } -function encodeRegExp(regexp: RegExp): EncodeResult { +function encodeRegExp(regexp: RegExp): EncodedRegex { const pattern = regexp.source; // Encode at safe precedence @@ -79,7 +79,7 @@ function isAtomicPattern(pattern: string): boolean { return false; } -function concatSequence(encoded: EncodeResult[]): EncodeResult { +function 
concatSequence(encoded: EncodedRegex[]): EncodedRegex { if (encoded.length === 1) { return encoded[0]!; } @@ -92,7 +92,7 @@ function concatSequence(encoded: EncodeResult[]): EncodeResult { }; } -function wrapAtom(encoded: EncodeResult): EncodeResult { +function wrapAtom(encoded: EncodedRegex): EncodedRegex { if (encoded.precedence === 'atom') { return encoded; } diff --git a/src/encoder/types.ts b/src/encoder/types.ts index 97a3807..adcd805 100644 --- a/src/encoder/types.ts +++ b/src/encoder/types.ts @@ -1,7 +1,7 @@ /** * Encoded regex pattern with information about its type (atom, sequence) */ -export interface EncodeResult { +export interface EncodedRegex { precedence: EncodePrecedence; pattern: string; } diff --git a/src/types.ts b/src/types.ts index a81f995..42b53c8 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,4 +1,4 @@ -import type { EncodeResult } from './encoder/types'; +import type { EncodedRegex } from './encoder/types'; export type ArrayOrSingle = T[] | T; @@ -12,14 +12,14 @@ export type RegexSequence = RegexElement[] | RegexElement; /** * Fundamental building block of a regular expression, defined as either a regex construct or a string. */ -export type RegexElement = RegexConstruct | EncodeResult | string | RegExp; +export type RegexElement = RegexConstruct | EncodedRegex | string | RegExp; /** * Common interface for all regex constructs like character classes, quantifiers, and anchors. 
*/ export interface RegexConstruct { type: string; - encode(): EncodeResult; + encode(): EncodedRegex; } /** From ced704bb55358de0003315f3181383563b477c8a Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 09:25:38 +0200 Subject: [PATCH 04/18] refactor: migrate repeat --- src/constructs/repeat.ts | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/constructs/repeat.ts b/src/constructs/repeat.ts index 6f21aaf..5c17e3d 100644 --- a/src/constructs/repeat.ts +++ b/src/constructs/repeat.ts @@ -1,45 +1,30 @@ import { encodeAtom } from '../encoder/encoder'; import type { EncodedRegex } from '../encoder/types'; import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; - -export interface Repeat extends RegexConstruct { - type: 'repeat'; - children: RegexElement[]; - options: RepeatOptions; -} +import type { RegexSequence } from '../types'; export type RepeatOptions = number | { min: number; max?: number; greedy?: boolean }; -export function repeat(sequence: RegexSequence, options: RepeatOptions): Repeat { +export function repeat(sequence: RegexSequence, options: RepeatOptions): EncodedRegex { const children = ensureArray(sequence); if (children.length === 0) { throw new Error('`repeat` should receive at least one element'); } - return { - type: 'repeat', - children, - options, - encode: encodeRepeat, - }; -} - -function encodeRepeat(this: Repeat): EncodedRegex { - const atomicNodes = encodeAtom(this.children); + const atomicNodes = encodeAtom(sequence); - if (typeof this.options === 'number') { + if (typeof options === 'number') { return { precedence: 'sequence', - pattern: `${atomicNodes.pattern}{${this.options}}`, + pattern: `${atomicNodes.pattern}{${options}}`, }; } return { precedence: 'sequence', - pattern: `${atomicNodes.pattern}{${this.options.min},${this.options?.max ?? ''}}${ - this.options.greedy === false ? '?' 
: '' + pattern: `${atomicNodes.pattern}{${options.min},${options?.max ?? ''}}${ + options.greedy === false ? '?' : '' }`, }; } From d320f26859b840b3e63bb687e914e4af2c070588 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 09:26:15 +0200 Subject: [PATCH 05/18] refactor: migrate regex --- src/constructs/regex.ts | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/constructs/regex.ts b/src/constructs/regex.ts index 5920926..af9231c 100644 --- a/src/constructs/regex.ts +++ b/src/constructs/regex.ts @@ -1,21 +1,7 @@ import { encodeSequence } from '../encoder/encoder'; import type { EncodedRegex } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import type { RegexSequence } from '../types'; -export interface Regex extends RegexConstruct { - type: 'sequence'; - children: RegexElement[]; -} - -export function regex(sequence: RegexSequence): Regex { - return { - type: 'sequence', - children: ensureArray(sequence), - encode: encodeRegex, - }; -} - -function encodeRegex(this: Regex): EncodedRegex { - return encodeSequence(this.children); +export function regex(sequence: RegexSequence): EncodedRegex { + return encodeSequence(sequence); } From af01bd4ea77e5a7836d9d9d4fd8af532b5077f54 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 09:28:11 +0200 Subject: [PATCH 06/18] refactor: migrate lookarounds --- src/constructs/lookahead.ts | 20 +++----------------- src/constructs/lookbehind.ts | 20 +++----------------- src/constructs/negative-lookahead.ts | 20 +++----------------- src/constructs/negative-lookbehind.ts | 20 +++----------------- 4 files changed, 12 insertions(+), 68 deletions(-) diff --git a/src/constructs/lookahead.ts b/src/constructs/lookahead.ts index 811e2b0..480b851 100644 --- a/src/constructs/lookahead.ts +++ b/src/constructs/lookahead.ts @@ -1,7 +1,6 @@ import { encodeSequence 
} from '../encoder/encoder'; import type { EncodedRegex } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import type { RegexSequence } from '../types'; /** * Positive lookahead assertion. @@ -17,22 +16,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(?=abc)/ * ``` */ -export interface Lookahead extends RegexConstruct { - type: 'lookahead'; - children: RegexElement[]; -} - -export function lookahead(sequence: RegexSequence): Lookahead { - return { - type: 'lookahead', - children: ensureArray(sequence), - encode: encodeLookahead, - }; -} - -function encodeLookahead(this: Lookahead): EncodedRegex { +export function lookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?=${encodeSequence(this.children).pattern})`, + pattern: `(?=${encodeSequence(sequence).pattern})`, }; } diff --git a/src/constructs/lookbehind.ts b/src/constructs/lookbehind.ts index ff9c9a5..96062f7 100644 --- a/src/constructs/lookbehind.ts +++ b/src/constructs/lookbehind.ts @@ -1,7 +1,6 @@ import { encodeSequence } from '../encoder/encoder'; import type { EncodedRegex } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import type { RegexSequence } from '../types'; /** * Positive lookbehind assertion. 
@@ -17,22 +16,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(?<=abc)/ * ``` */ -export interface Lookbehind extends RegexConstruct { - type: 'lookbehind'; - children: RegexElement[]; -} - -export function lookbehind(sequence: RegexSequence): Lookbehind { - return { - type: 'lookbehind', - children: ensureArray(sequence), - encode: encodeLookbehind, - }; -} - -function encodeLookbehind(this: Lookbehind): EncodedRegex { +export function lookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?<=${encodeSequence(this.children).pattern})`, + pattern: `(?<=${encodeSequence(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookahead.ts b/src/constructs/negative-lookahead.ts index 0a41645..40487d0 100644 --- a/src/constructs/negative-lookahead.ts +++ b/src/constructs/negative-lookahead.ts @@ -1,7 +1,6 @@ import { encodeSequence } from '../encoder/encoder'; import type { EncodedRegex } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import type { RegexSequence } from '../types'; /** * Negative lookahead assertion. 
@@ -17,22 +16,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(?=abc)/ * ``` */ -export interface NegativeLookahead extends RegexConstruct { - type: 'negativeLookahead'; - children: RegexElement[]; -} - -export function negativeLookahead(sequence: RegexSequence): NegativeLookahead { - return { - type: 'negativeLookahead', - children: ensureArray(sequence), - encode: encodeNegativeLookahead, - }; -} - -function encodeNegativeLookahead(this: NegativeLookahead): EncodedRegex { +export function negativeLookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?!${encodeSequence(this.children).pattern})`, + pattern: `(?!${encodeSequence(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookbehind.ts b/src/constructs/negative-lookbehind.ts index 0572a83..b590d8e 100644 --- a/src/constructs/negative-lookbehind.ts +++ b/src/constructs/negative-lookbehind.ts @@ -1,7 +1,6 @@ import { encodeSequence } from '../encoder/encoder'; import type { EncodedRegex } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import type { RegexSequence } from '../types'; /** * Negative lookbehind assertion. @@ -17,22 +16,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(? 
Date: Mon, 6 May 2024 09:29:07 +0200 Subject: [PATCH 07/18] refactor: refactor choiceOf --- src/constructs/choice-of.ts | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index 592f2bc..9a61096 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -1,27 +1,13 @@ import { encodeSequence } from '../encoder/encoder'; import type { EncodedRegex } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import type { RegexSequence } from '../types'; -export interface ChoiceOf extends RegexConstruct { - type: 'choiceOf'; - alternatives: RegexElement[][]; -} - -export function choiceOf(...alternatives: RegexSequence[]): ChoiceOf { +export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { if (alternatives.length === 0) { throw new Error('`choiceOf` should receive at least one alternative'); } - return { - type: 'choiceOf', - alternatives: alternatives.map((c) => ensureArray(c)), - encode: encodeChoiceOf, - }; -} - -function encodeChoiceOf(this: ChoiceOf): EncodedRegex { - const encodedAlternatives = this.alternatives.map((c) => encodeSequence(c)); + const encodedAlternatives = alternatives.map((c) => encodeSequence(c)); if (encodedAlternatives.length === 1) { return encodedAlternatives[0]!; } From 88f4558660615907188430a69714d48bffc206e0 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 09:30:54 +0200 Subject: [PATCH 08/18] chore: simplify char class --- src/constructs/char-class.ts | 4 ---- src/types.ts | 1 - 2 files changed, 5 deletions(-) diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index b7634a1..f8051c8 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -11,14 +11,12 @@ export interface CharacterRange { } export interface CharacterClass extends RegexConstruct { 
- type: 'characterClass'; chars: string[]; ranges?: CharacterRange[]; } export function charClass(...elements: Array): CharacterClass { return { - type: 'characterClass', chars: elements.map((c) => c.chars).flat(), ranges: elements.map((c) => c.ranges ?? []).flat(), encode: encodeCharacterClass, @@ -39,7 +37,6 @@ export function charRange(start: string, end: string): CharacterClass { } return { - type: 'characterClass', chars: [], ranges: [{ start, end }], encode: encodeCharacterClass, @@ -54,7 +51,6 @@ export function anyOf(characters: string): CharacterClass { } return { - type: 'characterClass', chars, encode: encodeCharacterClass, }; diff --git a/src/types.ts b/src/types.ts index 42b53c8..0812bb4 100644 --- a/src/types.ts +++ b/src/types.ts @@ -18,7 +18,6 @@ export type RegexElement = RegexConstruct | EncodedRegex | string | RegExp; * Common interface for all regex constructs like character classes, quantifiers, and anchors. */ export interface RegexConstruct { - type: string; encode(): EncodedRegex; } From 0e5093d4f1e9127b1276baa9d89f791884ff1ae5 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 09:44:44 +0200 Subject: [PATCH 09/18] refactor: simplify types structure and char class encoding --- README.md | 2 +- src/builders.ts | 2 +- src/constructs/__tests__/char-class.test.ts | 11 +---- .../__tests__/encoder.test.tsx | 6 ++- src/constructs/anchors.ts | 2 +- src/constructs/capture.ts | 4 +- src/constructs/char-class.ts | 49 ++----------------- src/constructs/char-escape.ts | 16 +----- src/constructs/choice-of.ts | 5 +- src/constructs/lookahead.ts | 5 +- src/constructs/lookbehind.ts | 5 +- src/constructs/negative-lookahead.ts | 5 +- src/constructs/negative-lookbehind.ts | 5 +- src/constructs/quantifiers.ts | 5 +- src/constructs/regex.ts | 5 +- src/constructs/repeat.ts | 5 +- src/{encoder => }/encoder.ts | 36 +++++++++++--- src/encoder/types.ts | 9 ---- src/types.ts | 29 ++++++++--- test-utils/utils.ts | 15 +----- 
website/docs/api/overview.md | 2 +- website/docs/api/types.md | 14 ++++-- 22 files changed, 95 insertions(+), 142 deletions(-) rename src/{encoder => constructs}/__tests__/encoder.test.tsx (95%) rename src/{encoder => }/encoder.ts (61%) delete mode 100644 src/encoder/types.ts diff --git a/README.md b/README.md index d3c0fa7..05acca3 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ TS Regex Builder allows you to build complex regular expressions using domain-sp Terminology: -- regex construct (`RegexConstruct`) - common name for all regex constructs like character classes, quantifiers, and anchors. +- regex construct - common name for all regex constructs like character classes, quantifiers, and anchors. - regex element (`RegexElement`) - a fundamental building block of a regular expression, defined as either a regex construct, a string, or `RegExp` literal (`/.../`). - regex sequence (`RegexSequence`) - a sequence of regex elements forming a regular expression. For developer convenience, it also accepts a single element instead of an array. 
diff --git a/src/builders.ts b/src/builders.ts index c698ca9..e2d1710 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,5 +1,5 @@ import type { RegexFlags, RegexSequence } from './types'; -import { encodeSequence } from './encoder/encoder'; +import { encodeSequence } from './encoder'; import { ensureArray } from './utils/elements'; /** diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index 678abcb..6cd746d 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -115,16 +115,7 @@ test('`negated` character class matching', () => { }); test('`encodeCharacterClass` throws on empty text', () => { - expect(() => - buildRegExp( - // @ts-expect-error - negated({ - type: 'characterClass', - chars: [], - ranges: [], - }), - ), - ).toThrowErrorMatchingInlineSnapshot( + expect(() => buildRegExp(negated({ chars: [], ranges: [] }))).toThrowErrorMatchingInlineSnapshot( `"Character class should contain at least one character or character range"`, ); }); diff --git a/src/encoder/__tests__/encoder.test.tsx b/src/constructs/__tests__/encoder.test.tsx similarity index 95% rename from src/encoder/__tests__/encoder.test.tsx rename to src/constructs/__tests__/encoder.test.tsx index afc9624..558f3e2 100644 --- a/src/encoder/__tests__/encoder.test.tsx +++ b/src/constructs/__tests__/encoder.test.tsx @@ -74,7 +74,11 @@ test('`buildRegExp` throws error on unknown element', () => { expect(() => // @ts-expect-error intentionally passing incorrect object buildRegExp({ type: 'unknown' }), - ).toThrowErrorMatchingInlineSnapshot(`"\`encodeNode\`: unknown element type unknown"`); + ).toThrowErrorMatchingInlineSnapshot(` + "\`encodeNode\`: unknown element: { + "type": "unknown" + }" + `); }); test('`buildPattern` throws on empty text', () => { diff --git a/src/constructs/anchors.ts b/src/constructs/anchors.ts index 20b0b94..6d61f42 100644 --- a/src/constructs/anchors.ts +++ 
b/src/constructs/anchors.ts @@ -1,4 +1,4 @@ -import type { EncodedRegex } from '../encoder/types'; +import type { EncodedRegex } from '../types'; export const startOfString: EncodedRegex = { precedence: 'atom', diff --git a/src/constructs/capture.ts b/src/constructs/capture.ts index 799d4fb..d0abdb8 100644 --- a/src/constructs/capture.ts +++ b/src/constructs/capture.ts @@ -1,5 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; +import { encodeSequence } from '../encoder'; +import type { EncodedRegex } from '../types'; import type { RegexSequence } from '../types'; export type CaptureOptions = { diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index f8051c8..e841bf3 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -1,25 +1,10 @@ -import type { EncodedRegex } from '../encoder/types'; -import type { RegexConstruct } from '../types'; -import type { CharacterEscape } from './char-escape'; - -/** - * Character range from start to end (inclusive). - */ -export interface CharacterRange { - start: string; - end: string; -} - -export interface CharacterClass extends RegexConstruct { - chars: string[]; - ranges?: CharacterRange[]; -} +import { encodeCharClass } from '../encoder'; +import type { CharacterClass, CharacterEscape, EncodedRegex } from '../types'; export function charClass(...elements: Array<CharacterClass | CharacterEscape>): CharacterClass { return { chars: elements.map((c) => c.chars).flat(), ranges: elements.map((c) => c.ranges ?? 
[]).flat(), - encode: encodeCharacterClass, }; } @@ -39,7 +24,6 @@ export function charRange(start: string, end: string): CharacterClass { return { chars: [], ranges: [{ start, end }], - encode: encodeCharacterClass, }; } @@ -52,12 +36,11 @@ export function anyOf(characters: string): CharacterClass { return { chars, - encode: encodeCharacterClass, }; } export function negated(element: CharacterClass | CharacterEscape): EncodedRegex { - return encodeCharacterClass.call(element, true); + return encodeCharClass(element, true); } /** @@ -65,32 +48,6 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex */ export const inverted = negated; -export function encodeCharacterClass( - this: CharacterClass | CharacterEscape, - isNegated?: boolean, -): EncodedRegex { - if (!this.chars.length && !this.ranges?.length) { - throw new Error('Character class should contain at least one character or character range'); - } - - // If passed characters includes hyphen (`-`) it need to be moved to - // first (or last) place in order to treat it as hyphen character and not a range. - // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types - const hyphen = this.chars.includes('-') ? '-' : ''; - const caret = this.chars.includes('^') ? '^' : ''; - const otherChars = this.chars.filter((c) => c !== '-' && c !== '^').join(''); - const ranges = this.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; - const negation = isNegated ? 
'^' : ''; - - let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; - if (pattern === '[^-]') pattern = '[\\^-]'; - - return { - precedence: 'atom', - pattern, - }; -} - function escapeForCharacterClass(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index d3cb0f2..77aa2cb 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -1,12 +1,4 @@ -import type { EncodedRegex } from '../encoder/types'; - -export interface CharacterEscape extends EncodedRegex { - kind: 'escape'; - - // `CharacterClass` compatibility - chars: string[]; - ranges?: never; -} +import type { CharacterEscape, EncodedRegex } from '../types'; /** * Matches any single character. @@ -21,42 +13,36 @@ export const digit: CharacterEscape = { precedence: 'atom', pattern: '\\d', chars: ['\\d'], - kind: 'escape', }; export const nonDigit: CharacterEscape = { precedence: 'atom', pattern: '\\D', chars: ['\\D'], - kind: 'escape', }; export const word: CharacterEscape = { precedence: 'atom', pattern: '\\w', chars: ['\\w'], - kind: 'escape', }; export const nonWord: CharacterEscape = { precedence: 'atom', pattern: '\\W', chars: ['\\W'], - kind: 'escape', }; export const whitespace: CharacterEscape = { precedence: 'atom', pattern: '\\s', chars: ['\\s'], - kind: 'escape', }; export const nonWhitespace: CharacterEscape = { precedence: 'atom', pattern: '\\S', chars: ['\\S'], - kind: 'escape', }; /** diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index 9a61096..0b03698 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -1,6 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; -import type { RegexSequence } from '../types'; +import { encodeSequence } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; 
export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { if (alternatives.length === 0) { diff --git a/src/constructs/lookahead.ts b/src/constructs/lookahead.ts index 480b851..f944d12 100644 --- a/src/constructs/lookahead.ts +++ b/src/constructs/lookahead.ts @@ -1,6 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; -import type { RegexSequence } from '../types'; +import { encodeSequence } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Positive lookahead assertion. diff --git a/src/constructs/lookbehind.ts b/src/constructs/lookbehind.ts index 96062f7..b7b022c 100644 --- a/src/constructs/lookbehind.ts +++ b/src/constructs/lookbehind.ts @@ -1,6 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; -import type { RegexSequence } from '../types'; +import { encodeSequence } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Positive lookbehind assertion. diff --git a/src/constructs/negative-lookahead.ts b/src/constructs/negative-lookahead.ts index 40487d0..e321089 100644 --- a/src/constructs/negative-lookahead.ts +++ b/src/constructs/negative-lookahead.ts @@ -1,6 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; -import type { RegexSequence } from '../types'; +import { encodeSequence } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Negative lookahead assertion. 
diff --git a/src/constructs/negative-lookbehind.ts b/src/constructs/negative-lookbehind.ts index b590d8e..ce647f3 100644 --- a/src/constructs/negative-lookbehind.ts +++ b/src/constructs/negative-lookbehind.ts @@ -1,6 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; -import type { RegexSequence } from '../types'; +import { encodeSequence } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Negative lookbehind assertion. diff --git a/src/constructs/quantifiers.ts b/src/constructs/quantifiers.ts index 0d95bdb..685c41e 100644 --- a/src/constructs/quantifiers.ts +++ b/src/constructs/quantifiers.ts @@ -1,6 +1,5 @@ -import { encodeAtom } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; -import type { RegexSequence } from '../types'; +import { encodeAtom } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; export interface QuantifierOptions { greedy?: boolean; diff --git a/src/constructs/regex.ts b/src/constructs/regex.ts index af9231c..4f78893 100644 --- a/src/constructs/regex.ts +++ b/src/constructs/regex.ts @@ -1,6 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; -import type { RegexSequence } from '../types'; +import { encodeSequence } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; export function regex(sequence: RegexSequence): EncodedRegex { return encodeSequence(sequence); diff --git a/src/constructs/repeat.ts b/src/constructs/repeat.ts index 5c17e3d..aa8f23b 100644 --- a/src/constructs/repeat.ts +++ b/src/constructs/repeat.ts @@ -1,7 +1,6 @@ -import { encodeAtom } from '../encoder/encoder'; -import type { EncodedRegex } from '../encoder/types'; +import { encodeAtom } from '../encoder'; import { ensureArray } from '../utils/elements'; -import type { RegexSequence } from '../types'; +import type { 
EncodedRegex, RegexSequence } from '../types'; export type RepeatOptions = number | { min: number; max?: number; greedy?: boolean }; diff --git a/src/encoder/encoder.ts b/src/encoder.ts similarity index 61% rename from src/encoder/encoder.ts rename to src/encoder.ts index 4f8dd43..ba08b46 100644 --- a/src/encoder/encoder.ts +++ b/src/encoder.ts @@ -1,7 +1,6 @@ -import type { RegexElement, RegexSequence } from '../types'; -import { ensureArray } from '../utils/elements'; -import { escapeText } from '../utils/text'; -import type { EncodedRegex } from './types'; +import type { CharacterClass, EncodedRegex, RegexElement, RegexSequence } from './types'; +import { ensureArray } from './utils/elements'; +import { escapeText } from './utils/text'; export function encodeSequence(sequence: RegexSequence): EncodedRegex { const elements = ensureArray(sequence); @@ -26,11 +25,11 @@ function encodeNode(element: RegexElement): EncodedRegex { return element; } - if (typeof element === 'object' && typeof element.encode !== 'function') { - throw new Error(`\`encodeNode\`: unknown element type ${element.type}`); + if (typeof element === 'object' && 'chars' in element) { + return encodeCharClass(element); } - return element.encode(); + throw new Error(`\`encodeNode\`: unknown element: ${JSON.stringify(element, null, 2)}`); } function encodeText(text: string): EncodedRegex { @@ -62,6 +61,29 @@ function encodeRegExp(regexp: RegExp): EncodedRegex { }; } +export function encodeCharClass(element: CharacterClass, isNegated?: boolean): EncodedRegex { + if (!element.chars.length && !element.ranges?.length) { + throw new Error('Character class should contain at least one character or character range'); + } + + // If passed characters includes hyphen (`-`) it need to be moved to + // first (or last) place in order to treat it as hyphen character and not a range. 
+ // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types + const hyphen = element.chars.includes('-') ? '-' : ''; + const caret = element.chars.includes('^') ? '^' : ''; + const otherChars = element.chars.filter((c) => c !== '-' && c !== '^').join(''); + const ranges = element.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; + const negation = isNegated ? '^' : ''; + + let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; + if (pattern === '[^-]') pattern = '[\\^-]'; + + return { + precedence: 'atom', + pattern, + }; +} + // This is intended to catch only some popular atomic patterns like char classes. function isAtomicPattern(pattern: string): boolean { if (pattern.length === 1) { diff --git a/src/encoder/types.ts b/src/encoder/types.ts deleted file mode 100644 index adcd805..0000000 --- a/src/encoder/types.ts +++ /dev/null @@ -1,9 +0,0 @@ -/** - * Encoded regex pattern with information about its type (atom, sequence) - */ -export interface EncodedRegex { - precedence: EncodePrecedence; - pattern: string; -} - -export type EncodePrecedence = 'atom' | 'sequence' | 'disjunction'; diff --git a/src/types.ts b/src/types.ts index 0812bb4..c01ae67 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,5 +1,3 @@ -import type { EncodedRegex } from './encoder/types'; - export type ArrayOrSingle<T> = T[] | T; /** @@ -12,13 +10,32 @@ export type RegexSequence = RegexElement[] | RegexElement; /** * Fundamental building block of a regular expression, defined as either a regex construct or a string. */ -export type RegexElement = RegexConstruct | EncodedRegex | string | RegExp; +export type RegexElement = EncodedRegex | CharacterClass | string | RegExp; /** - * Common interface for all regex constructs like character classes, quantifiers, and anchors. 
+ * Encoded regex pattern with information about its type (atom, sequence) */ -export interface RegexConstruct { - encode(): EncodedRegex; +export interface EncodedRegex { + precedence: EncodePrecedence; + pattern: string; +} + +export type EncodePrecedence = 'atom' | 'sequence' | 'disjunction'; + +export interface CharacterEscape extends EncodedRegex { + // `CharacterClass` compatibility + chars: string[]; + ranges?: never; +} + +export interface CharacterClass { + chars: string[]; + ranges?: CharacterRange[]; +} + +export interface CharacterRange { + start: string; + end: string; } /** diff --git a/test-utils/utils.ts b/test-utils/utils.ts index 323da9f..b8f38c8 100644 --- a/test-utils/utils.ts +++ b/test-utils/utils.ts @@ -1,18 +1,5 @@ import { buildRegExp } from '../src/builders'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../src/types'; - -export function isRegexElement(node: unknown): node is RegexElement { - return typeof node === 'string' || isRegexConstruct(node); -} - -export function isRegexConstruct(element: unknown): element is RegexConstruct { - return ( - typeof element === 'object' && - element !== null && - 'encode' in element && - typeof element.encode === 'function' - ); -} +import type { RegexSequence } from '../src/types'; export function wrapRegExp(regex: RegExp | RegexSequence) { if (regex instanceof RegExp) { diff --git a/website/docs/api/overview.md b/website/docs/api/overview.md index 6f5f71f..4ad55d0 100644 --- a/website/docs/api/overview.md +++ b/website/docs/api/overview.md @@ -8,7 +8,7 @@ TS Regex Builder allows you to build complex regular expressions using domain-sp ### Terminology -- regex construct (`RegexConstruct`) - common name for all regex constructs like character classes, quantifiers, and anchors. +- regex construct - common name for all regex constructs like character classes, quantifiers, and anchors. 
- regex element (`RegexElement`) - a fundamental building block of a regular expression, defined as either a regex construct, a string, or `RegExp` literal (`/.../`). - regex sequence (`RegexSequence`) - a sequence of regex elements forming a regular expression. For developer convenience, it also accepts a single element instead of an array. diff --git a/website/docs/api/types.md b/website/docs/api/types.md index 69d565f..338d8f1 100644 --- a/website/docs/api/types.md +++ b/website/docs/api/types.md @@ -14,13 +14,19 @@ The sequence of regex elements forming a regular expression. For developer conve ### `RegexElement` ```ts -type RegexElement = RegexConstruct | RegExp | string; +type RegexElement = EncodedRegex | CharacterClass | RegExp | string; ``` Regex elements are fundamental building blocks of a regular expression. These can be either further regex constructs, regular strings to be matched literally or `RegExp` literals (`/.../`) for including simple regexes as part of a larger structure. -### `RegexConstruct` +### `EncodedRegex` -The common type for all regex constructs like character classes, quantifiers, and anchors. You should not need to use this type directly, it is returned by all regex construct functions. +TODO -Note: the shape of the `RegexConstruct` is considered private and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with +Note: the shape of the `EncodedRegex` is considered private and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with + +### `CharacterClass` + +TODO + +Note: the shape of the `CharacterClass` is considered private and may change in a breaking way without a major release. 
We will focus on maintaining the compatibility of regexes built with From 23925e82bdb58ae13a35ebfcf5ed1678706fedb7 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 09:55:40 +0200 Subject: [PATCH 10/18] chore: self code review --- src/encoder.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encoder.ts b/src/encoder.ts index ba08b46..0451061 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -5,7 +5,7 @@ import { escapeText } from './utils/text'; export function encodeSequence(sequence: RegexSequence): EncodedRegex { const elements = ensureArray(sequence); const encodedNodes = elements.map((n) => encodeNode(n)); - return concatSequence(encodedNodes); + return concatEncodedRegexes(encodedNodes); } export function encodeAtom(sequence: RegexSequence): EncodedRegex { @@ -101,7 +101,7 @@ function isAtomicPattern(pattern: string): boolean { return false; } -function concatSequence(encoded: EncodedRegex[]): EncodedRegex { +function concatEncodedRegexes(encoded: EncodedRegex[]): EncodedRegex { if (encoded.length === 1) { return encoded[0]!; } From f193102908b00ab56f4e79dea7e6aed19b874106 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 10:18:29 +0200 Subject: [PATCH 11/18] refactor: improve naming --- src/builders.ts | 7 +++---- src/constructs/capture.ts | 6 +++--- src/constructs/choice-of.ts | 4 ++-- src/constructs/lookahead.ts | 4 ++-- src/constructs/lookbehind.ts | 4 ++-- src/constructs/negative-lookahead.ts | 4 ++-- src/constructs/negative-lookbehind.ts | 4 ++-- src/constructs/quantifiers.ts | 8 ++++---- src/constructs/regex.ts | 4 ++-- src/constructs/repeat.ts | 8 +++----- src/encoder.ts | 18 +++++++++++------- 11 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/builders.ts b/src/builders.ts index e2d1710..10f5fb6 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,6 +1,5 @@ import type { RegexFlags, RegexSequence } from './types'; -import { encodeSequence } from 
'./encoder'; -import { ensureArray } from './utils/elements'; +import { encodePattern } from './encoder'; /** * Generate RegExp object from elements with optional flags. @@ -10,7 +9,7 @@ import { ensureArray } from './utils/elements'; * @returns RegExp object */ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { - const pattern = encodeSequence(ensureArray(sequence)).pattern; + const pattern = encodePattern(sequence); const flagsString = encodeFlags(flags ?? {}); return new RegExp(pattern, flagsString); } @@ -21,7 +20,7 @@ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp * @returns regex pattern string */ export function buildPattern(sequence: RegexSequence): string { - return encodeSequence(ensureArray(sequence)).pattern; + return encodePattern(sequence); } function encodeFlags(flags: RegexFlags): string { diff --git a/src/constructs/capture.ts b/src/constructs/capture.ts index d0abdb8..f404a5e 100644 --- a/src/constructs/capture.ts +++ b/src/constructs/capture.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encodePattern } from '../encoder'; import type { EncodedRegex } from '../types'; import type { RegexSequence } from '../types'; @@ -23,13 +23,13 @@ export function capture(sequence: RegexSequence, options?: CaptureOptions): Enco if (name) { return { precedence: 'atom', - pattern: `(?<${name}>${encodeSequence(sequence).pattern})`, + pattern: `(?<${name}>${encodePattern(sequence)})`, }; } return { precedence: 'atom', - pattern: `(${encodeSequence(sequence).pattern})`, + pattern: `(${encodePattern(sequence)})`, }; } diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index 0b03698..40be23a 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encode } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; export function 
choiceOf(...alternatives: RegexSequence[]): EncodedRegex { @@ -6,7 +6,7 @@ export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { throw new Error('`choiceOf` should receive at least one alternative'); } - const encodedAlternatives = alternatives.map((c) => encodeSequence(c)); + const encodedAlternatives = alternatives.map((c) => encode(c)); if (encodedAlternatives.length === 1) { return encodedAlternatives[0]!; } diff --git a/src/constructs/lookahead.ts b/src/constructs/lookahead.ts index f944d12..a4f7521 100644 --- a/src/constructs/lookahead.ts +++ b/src/constructs/lookahead.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encodePattern } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function lookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?=${encodeSequence(sequence).pattern})`, + pattern: `(?=${encodePattern(sequence)})`, }; } diff --git a/src/constructs/lookbehind.ts b/src/constructs/lookbehind.ts index b7b022c..1347fe5 100644 --- a/src/constructs/lookbehind.ts +++ b/src/constructs/lookbehind.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encodePattern } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function lookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?<=${encodeSequence(sequence).pattern})`, + pattern: `(?<=${encodePattern(sequence)})`, }; } diff --git a/src/constructs/negative-lookahead.ts b/src/constructs/negative-lookahead.ts index e321089..06486a1 100644 --- a/src/constructs/negative-lookahead.ts +++ b/src/constructs/negative-lookahead.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encodePattern } from '../encoder'; 
import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function negativeLookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?!${encodeSequence(sequence).pattern})`, + pattern: `(?!${encodePattern(sequence)})`, }; } diff --git a/src/constructs/negative-lookbehind.ts b/src/constructs/negative-lookbehind.ts index ce647f3..4227b94 100644 --- a/src/constructs/negative-lookbehind.ts +++ b/src/constructs/negative-lookbehind.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encodePattern } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function negativeLookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(? encodeNode(n)); - return concatEncodedRegexes(encodedNodes); + const encodedNodes = elements.map((n) => encodeElement(n)); + return concat(encodedNodes); } -export function encodeAtom(sequence: RegexSequence): EncodedRegex { - return wrapAtom(encodeSequence(sequence)); +export function encodePattern(sequence: RegexSequence): string { + return encode(sequence).pattern; } -function encodeNode(element: RegexElement): EncodedRegex { +export function encodeAtomicPattern(sequence: RegexSequence): string { + return wrapAtom(encode(sequence)).pattern; +} + +function encodeElement(element: RegexElement): EncodedRegex { if (typeof element === 'string') { return encodeText(element); } @@ -101,7 +105,7 @@ function isAtomicPattern(pattern: string): boolean { return false; } -function concatEncodedRegexes(encoded: EncodedRegex[]): EncodedRegex { +function concat(encoded: EncodedRegex[]): EncodedRegex { if (encoded.length === 1) { return encoded[0]!; } From 727520118c9d8e3c146904980aa1be93ed7c1002 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 
2024 10:22:42 +0200 Subject: [PATCH 12/18] refactor: missing edge cases --- src/constructs/__tests__/char-class.test.ts | 6 ++++++ src/constructs/__tests__/encoder.test.tsx | 2 +- src/constructs/char-class.ts | 4 ++++ src/encoder.ts | 2 +- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index 6cd746d..bccd0e2 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -37,6 +37,12 @@ test('`charClass` joins character escapes', () => { expect(charClass(word, nonDigit)).toEqualRegex(/[\w\D]/); }); +test('`charClass` throws on empty text', () => { + expect(() => charClass()).toThrowErrorMatchingInlineSnapshot( + `"\`charClass\` should receive at least one element"`, + ); +}); + test('`charRange` pattern', () => { expect(charRange('a', 'z')).toEqualRegex(/[a-z]/); expect(['x', charRange('0', '9')]).toEqualRegex(/x[0-9]/); diff --git a/src/constructs/__tests__/encoder.test.tsx b/src/constructs/__tests__/encoder.test.tsx index 558f3e2..4713160 100644 --- a/src/constructs/__tests__/encoder.test.tsx +++ b/src/constructs/__tests__/encoder.test.tsx @@ -75,7 +75,7 @@ test('`buildRegExp` throws error on unknown element', () => { // @ts-expect-error intentionally passing incorrect object buildRegExp({ type: 'unknown' }), ).toThrowErrorMatchingInlineSnapshot(` - "\`encodeNode\`: unknown element: { + "\`encodeElement\`: unknown element: { "type": "unknown" }" `); diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index e841bf3..b2bc758 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -2,6 +2,10 @@ import { encodeCharClass } from '../encoder'; import type { CharacterClass, CharacterEscape, EncodedRegex } from '../types'; export function charClass(...elements: Array<CharacterClass | CharacterEscape>): CharacterClass { + if (!elements.length) { + throw new Error('`charClass` should receive at least one
element'); + } + return { chars: elements.map((c) => c.chars).flat(), ranges: elements.map((c) => c.ranges ?? []).flat(), diff --git a/src/encoder.ts b/src/encoder.ts index ecc7ced..a96f1c9 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -33,7 +33,7 @@ function encodeElement(element: RegexElement): EncodedRegex { return encodeCharClass(element); } - throw new Error(`\`encodeNode\`: unknown element: ${JSON.stringify(element, null, 2)}`); + throw new Error(`\`encodeElement\`: unknown element: ${JSON.stringify(element, null, 2)}`); } function encodeText(text: string): EncodedRegex { From 83660c3950d5dd7d93e3c7a47b57c6c7232dd2ce Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 10:59:01 +0200 Subject: [PATCH 13/18] chore: types --- src/types.ts | 9 +++++++-- website/docs/api/assertions.md | 16 ++++++++-------- website/docs/api/captures.md | 4 ++-- website/docs/api/character-classes.md | 16 ++++++++-------- website/docs/api/constructs.md | 4 ++-- website/docs/api/quantifiers.md | 8 ++++---- website/docs/api/types.md | 14 ++++---------- 7 files changed, 35 insertions(+), 36 deletions(-) diff --git a/src/types.ts b/src/types.ts index c01ae67..2b102d5 100644 --- a/src/types.ts +++ b/src/types.ts @@ -8,9 +8,14 @@ export type ArrayOrSingle = T[] | T; export type RegexSequence = RegexElement[] | RegexElement; /** - * Fundamental building block of a regular expression, defined as either a regex construct or a string. + * Fundamental building block of a regular expression, defined as either a regex construct, `RegExp` object or a string. */ -export type RegexElement = EncodedRegex | CharacterClass | string | RegExp; +export type RegexElement = RegexConstruct | RegExp | string; + +/** + * Fundamental building block of a regular expression, defined as either an encoded regex or a character class. 
+ */ +export type RegexConstruct = EncodedRegex | CharacterClass; /** * Encoded regex pattern with information about its type (atom, sequence) diff --git a/website/docs/api/assertions.md b/website/docs/api/assertions.md index aacc76d..a190eab 100644 --- a/website/docs/api/assertions.md +++ b/website/docs/api/assertions.md @@ -10,8 +10,8 @@ Anchors are special characters or sequences that specify positions in the input ### Start and end of string ```ts -const startOfString: Anchor; -const endOfString: Anchor; +const startOfString: RegexConstruct; +const endOfString: RegexConstruct; ``` - `startOfString` anchor matches the start of a string (or line, if multiline mode is enabled). Regex syntax: `^`. @@ -22,8 +22,8 @@ const endOfString: Anchor; _This API was added in version 1.3.0._ ```ts -const wordBoundary: Anchor; -const nonWordBoundary: Anchor; +const wordBoundary: RegexConstruct; +const nonWordBoundary: RegexConstruct; ``` - `wordBoundary` matches the positions where a word character is not followed or preceded by another word character, effectively indicating the start or end of a word. Regex syntax: `\b`. @@ -40,7 +40,7 @@ Lookarounds in regex are used for asserting that some pattern is or isn't follow _This API was added in version 1.3.0._ ```ts -function lookahead(sequence: RegexSequence): Lookahead; +function lookahead(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?=...)`. @@ -52,7 +52,7 @@ Allows for conditional matching by checking for subsequent patterns in regexes w _This API was added in version 1.3.0._ ```ts -function negativeLookahead(sequence: RegexSequence): NegativeLookahead; +function negativeLookahead(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?!...)`. 
@@ -64,7 +64,7 @@ Allows for matches to be rejected if a specified subsequent pattern is present, _This API was added in version 1.3.0._ ```ts -function lookbehind(sequence: RegexSequence): Lookahead; +function lookbehind(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?<=...)`. @@ -76,7 +76,7 @@ Allows for conditional matching by checking for preceeding patterns in regexes w _This API was added in version 1.3.0._ ```ts -function negativeLookahead(sequence: RegexSequence): NegativeLookahead; +function negativeLookahead(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?`. diff --git a/website/docs/api/character-classes.md b/website/docs/api/character-classes.md index 732c346..95b89c6 100644 --- a/website/docs/api/character-classes.md +++ b/website/docs/api/character-classes.md @@ -8,13 +8,13 @@ Character classes are a set of characters that match any one of the characters i ### Common character classes ```ts -const any: CharacterClass; -const word: CharacterClass; -const nonWord: CharacterClass; -const digit: CharacterClass; -const nonDigit: CharacterClass; -const whitespace: CharacterClass; -const nonWhitespace: CharacterClass; +const any: RegexConstruct; +const word: RegexConstruct; +const nonWord: RegexConstruct; +const digit: RegexConstruct; +const nonDigit: RegexConstruct; +const whitespace: RegexConstruct; +const nonWhitespace: RegexConstruct; ``` - `any` matches any character except newline characters. Regex syntax: `*`. @@ -71,7 +71,7 @@ Examples: ### `negated()` ```ts -function negated(element: CharacterClass): CharacterClass; +function negated(element: CharacterClass): RegexConstruct; ``` Regex syntax: `[^...]`. diff --git a/website/docs/api/constructs.md b/website/docs/api/constructs.md index 5182ed8..32a0824 100644 --- a/website/docs/api/constructs.md +++ b/website/docs/api/constructs.md @@ -10,7 +10,7 @@ These functions and objects represent available regex constructs. 
```ts function choiceOf( ...alternatives: RegexSequence[], -): ChoiceOf { +): RegexConstruct { ``` Regex syntax: `a|b|c`. @@ -22,7 +22,7 @@ Example: `choiceOf("color", "colour")` matches either `color` or `colour` patter ### `regex()` ```ts -function regex(sequence: RegexSequence): Regex; +function regex(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: the pattern remains unchanged when wrapped by this construct. diff --git a/website/docs/api/quantifiers.md b/website/docs/api/quantifiers.md index 101902c..53065d9 100644 --- a/website/docs/api/quantifiers.md +++ b/website/docs/api/quantifiers.md @@ -13,7 +13,7 @@ function zeroOrMore( options?: { greedy?: boolean; // default=true }, -): ZeroOrMore; +): RegexConstruct; ``` Regex syntax: @@ -31,7 +31,7 @@ function oneOrMore( options?: { greedy?: boolean; // default=true }, -): OneOrMore; +): RegexConstruct; ``` Regex syntax: @@ -49,7 +49,7 @@ function optional( options?: { greedy?: boolean; // default=true }, -): Optionally; +): RegexConstruct; ``` Regex syntax: @@ -71,7 +71,7 @@ function repeat( max?: number; greedy?: boolean; // default=true }, -): Repeat; +): RegexConstruct; ``` Regex syntax: diff --git a/website/docs/api/types.md b/website/docs/api/types.md index 338d8f1..fd3266c 100644 --- a/website/docs/api/types.md +++ b/website/docs/api/types.md @@ -14,19 +14,13 @@ The sequence of regex elements forming a regular expression. For developer conve ### `RegexElement` ```ts -type RegexElement = EncodedRegex | CharacterClass | RegExp | string; +type RegexElement = RegexConstruct | string | RegExp; ``` Regex elements are fundamental building blocks of a regular expression. These can be either further regex constructs, regular strings to be matched literally or `RegExp` literals (`/.../`) for including simple regexes as part of a larger structure. -### `EncodedRegex` +### `RegexConstruct` -TODO +The common type for all regex constructs like character classes, quantifiers, and captures. 
You should not need to use this type directly, it is returned by all regex construct functions. -Note: the shape of the `EncodedRegex` is considered private and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with - -### `CharacterClass` - -TODO - -Note: the shape of the `CharacterClass` is considered private and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with +Note: the shape of the `RegexConstruct` is considered private and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with it. From 3a3474c6426aad897bf5b9b41b89ee21761df511 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 11:38:11 +0200 Subject: [PATCH 14/18] refactor: simplify --- src/constructs/repeat.ts | 5 ++- src/encoder.ts | 45 +++++++++++---------------- website/docs/api/character-classes.md | 16 +++++----- website/docs/api/overview.md | 2 +- 4 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/constructs/repeat.ts b/src/constructs/repeat.ts index 33dd8bb..48974eb 100644 --- a/src/constructs/repeat.ts +++ b/src/constructs/repeat.ts @@ -6,7 +6,6 @@ export type RepeatOptions = number | { min: number; max?: number; greedy?: boole export function repeat(sequence: RegexSequence, options: RepeatOptions): EncodedRegex { const children = ensureArray(sequence); - if (children.length === 0) { throw new Error('`repeat` should receive at least one element'); } @@ -14,13 +13,13 @@ export function repeat(sequence: RegexSequence, options: RepeatOptions): Encoded if (typeof options === 'number') { return { precedence: 'sequence', - pattern: `${encodeAtomicPattern(sequence)}{${options}}`, + pattern: `${encodeAtomicPattern(children)}{${options}}`, }; } return { precedence: 'sequence', - pattern: `${encodeAtomicPattern(sequence)}{${options.min},${options?.max ?? 
''}}${ + pattern: `${encodeAtomicPattern(children)}{${options.min},${options?.max ?? ''}}${ options.greedy === false ? '?' : '' }`, }; diff --git a/src/encoder.ts b/src/encoder.ts index a96f1c9..17ea8dc 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -4,8 +4,18 @@ import { escapeText } from './utils/text'; export function encode(sequence: RegexSequence): EncodedRegex { const elements = ensureArray(sequence); - const encodedNodes = elements.map((n) => encodeElement(n)); - return concat(encodedNodes); + const encoded = elements.map((n) => encodeElement(n)); + + if (encoded.length === 1) { + return encoded[0]!; + } + + return { + precedence: 'sequence', + pattern: encoded + .map((n) => (n.precedence === 'disjunction' ? encodeAtomicPattern(n) : n.pattern)) + .join(''), + }; } export function encodePattern(sequence: RegexSequence): string { @@ -13,7 +23,12 @@ export function encodePattern(sequence: RegexSequence): string { } export function encodeAtomicPattern(sequence: RegexSequence): string { - return wrapAtom(encode(sequence)).pattern; + const encoded = encode(sequence); + if (encoded.precedence === 'atom') { + return encoded.pattern; + } + + return `(?:${encoded.pattern})`; } function encodeElement(element: RegexElement): EncodedRegex { @@ -104,27 +119,3 @@ function isAtomicPattern(pattern: string): boolean { return false; } - -function concat(encoded: EncodedRegex[]): EncodedRegex { - if (encoded.length === 1) { - return encoded[0]!; - } - - return { - precedence: 'sequence', - pattern: encoded - .map((n) => (n.precedence === 'disjunction' ? 
wrapAtom(n) : n).pattern) - .join(''), - }; -} - -function wrapAtom(encoded: EncodedRegex): EncodedRegex { - if (encoded.precedence === 'atom') { - return encoded; - } - - return { - precedence: 'atom', - pattern: `(?:${encoded.pattern})`, - }; -} diff --git a/website/docs/api/character-classes.md b/website/docs/api/character-classes.md index 95b89c6..71df0b5 100644 --- a/website/docs/api/character-classes.md +++ b/website/docs/api/character-classes.md @@ -5,19 +5,19 @@ title: Character Classes Character classes are a set of characters that match any one of the characters in the set. -### Common character classes +### Common character class escapes ```ts const any: RegexConstruct; -const word: RegexConstruct; -const nonWord: RegexConstruct; -const digit: RegexConstruct; -const nonDigit: RegexConstruct; -const whitespace: RegexConstruct; -const nonWhitespace: RegexConstruct; +const word: CharacterEscape; +const nonWord: CharacterEscape; +const digit: CharacterEscape; +const nonDigit: CharacterEscape; +const whitespace: CharacterEscape; +const nonWhitespace: CharacterEscape; ``` -- `any` matches any character except newline characters. Regex syntax: `*`. +- `any` matches any character except newline characters. Regex syntax: `.`. - `word` matches any word character (letters, digits & underscore). Regex syntax: `\w`. - `nonWord` matches any character **except** word characters (letters, digits & underscore). Regex syntax: `\W`. - `digit` matches any digit. Regex syntax: `\d`. diff --git a/website/docs/api/overview.md b/website/docs/api/overview.md index 4ad55d0..6f5f71f 100644 --- a/website/docs/api/overview.md +++ b/website/docs/api/overview.md @@ -8,7 +8,7 @@ TS Regex Builder allows you to build complex regular expressions using domain-sp ### Terminology -- regex construct - common name for all regex constructs like character classes, quantifiers, and anchors. 
+- regex construct (`RegexConstruct`) - common name for all regex constructs like character classes, quantifiers, and anchors. - regex element (`RegexElement`) - a fundamental building block of a regular expression, defined as either a regex construct, a string, or `RegExp` literal (`/.../`). - regex sequence (`RegexSequence`) - a sequence of regex elements forming a regular expression. For developer convenience, it also accepts a single element instead of an array. From 400e9ea184661d8955b604c3d590f0b3ffd48b28 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 12:51:43 +0200 Subject: [PATCH 15/18] refactor: cleanup --- src/builders.ts | 6 ++--- src/constructs/capture.ts | 6 ++--- src/constructs/choice-of.ts | 4 ++-- src/constructs/lookahead.ts | 4 ++-- src/constructs/lookbehind.ts | 4 ++-- src/constructs/negative-lookahead.ts | 4 ++-- src/constructs/negative-lookbehind.ts | 4 ++-- src/constructs/quantifiers.ts | 8 +++---- src/constructs/regex.ts | 4 ++-- src/constructs/repeat.ts | 11 ++++----- src/encoder.ts | 33 ++++++++++++--------------- src/utils/elements.ts | 5 ---- src/utils/text.ts | 4 ---- 13 files changed, 42 insertions(+), 55 deletions(-) delete mode 100644 src/utils/elements.ts delete mode 100644 src/utils/text.ts diff --git a/src/builders.ts b/src/builders.ts index 10f5fb6..15435cf 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,5 +1,5 @@ import type { RegexFlags, RegexSequence } from './types'; -import { encodePattern } from './encoder'; +import { encodeSequence } from './encoder'; /** * Generate RegExp object from elements with optional flags. @@ -9,7 +9,7 @@ import { encodePattern } from './encoder'; * @returns RegExp object */ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { - const pattern = encodePattern(sequence); + const pattern = encodeSequence(sequence).pattern; const flagsString = encodeFlags(flags ?? 
{}); return new RegExp(pattern, flagsString); } @@ -20,7 +20,7 @@ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp * @returns regex pattern string */ export function buildPattern(sequence: RegexSequence): string { - return encodePattern(sequence); + return encodeSequence(sequence).pattern; } function encodeFlags(flags: RegexFlags): string { diff --git a/src/constructs/capture.ts b/src/constructs/capture.ts index f404a5e..d0abdb8 100644 --- a/src/constructs/capture.ts +++ b/src/constructs/capture.ts @@ -1,4 +1,4 @@ -import { encodePattern } from '../encoder'; +import { encodeSequence } from '../encoder'; import type { EncodedRegex } from '../types'; import type { RegexSequence } from '../types'; @@ -23,13 +23,13 @@ export function capture(sequence: RegexSequence, options?: CaptureOptions): Enco if (name) { return { precedence: 'atom', - pattern: `(?<${name}>${encodePattern(sequence)})`, + pattern: `(?<${name}>${encodeSequence(sequence).pattern})`, }; } return { precedence: 'atom', - pattern: `(${encodePattern(sequence)})`, + pattern: `(${encodeSequence(sequence).pattern})`, }; } diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index 40be23a..0b03698 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -1,4 +1,4 @@ -import { encode } from '../encoder'; +import { encodeSequence } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { @@ -6,7 +6,7 @@ export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { throw new Error('`choiceOf` should receive at least one alternative'); } - const encodedAlternatives = alternatives.map((c) => encode(c)); + const encodedAlternatives = alternatives.map((c) => encodeSequence(c)); if (encodedAlternatives.length === 1) { return encodedAlternatives[0]!; } diff --git a/src/constructs/lookahead.ts b/src/constructs/lookahead.ts index 
a4f7521..f944d12 100644 --- a/src/constructs/lookahead.ts +++ b/src/constructs/lookahead.ts @@ -1,4 +1,4 @@ -import { encodePattern } from '../encoder'; +import { encodeSequence } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function lookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?=${encodePattern(sequence)})`, + pattern: `(?=${encodeSequence(sequence).pattern})`, }; } diff --git a/src/constructs/lookbehind.ts b/src/constructs/lookbehind.ts index 1347fe5..b7b022c 100644 --- a/src/constructs/lookbehind.ts +++ b/src/constructs/lookbehind.ts @@ -1,4 +1,4 @@ -import { encodePattern } from '../encoder'; +import { encodeSequence } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function lookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?<=${encodePattern(sequence)})`, + pattern: `(?<=${encodeSequence(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookahead.ts b/src/constructs/negative-lookahead.ts index 06486a1..e321089 100644 --- a/src/constructs/negative-lookahead.ts +++ b/src/constructs/negative-lookahead.ts @@ -1,4 +1,4 @@ -import { encodePattern } from '../encoder'; +import { encodeSequence } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function negativeLookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?!${encodePattern(sequence)})`, + pattern: `(?!${encodeSequence(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookbehind.ts b/src/constructs/negative-lookbehind.ts index 4227b94..ce647f3 100644 --- a/src/constructs/negative-lookbehind.ts 
+++ b/src/constructs/negative-lookbehind.ts @@ -1,4 +1,4 @@ -import { encodePattern } from '../encoder'; +import { encodeSequence } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function negativeLookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(? encodeElement(n)); if (encoded.length === 1) { @@ -13,24 +16,11 @@ export function encode(sequence: RegexSequence): EncodedRegex { return { precedence: 'sequence', pattern: encoded - .map((n) => (n.precedence === 'disjunction' ? encodeAtomicPattern(n) : n.pattern)) + .map((n) => (n.precedence === 'disjunction' ? encodeAtomic(n) : n.pattern)) .join(''), }; } -export function encodePattern(sequence: RegexSequence): string { - return encode(sequence).pattern; -} - -export function encodeAtomicPattern(sequence: RegexSequence): string { - const encoded = encode(sequence); - if (encoded.precedence === 'atom') { - return encoded.pattern; - } - - return `(?:${encoded.pattern})`; -} - function encodeElement(element: RegexElement): EncodedRegex { if (typeof element === 'string') { return encodeText(element); @@ -40,10 +30,12 @@ function encodeElement(element: RegexElement): EncodedRegex { return encodeRegExp(element); } + // EncodedRegex if (typeof element === 'object' && 'pattern' in element) { return element; } + // CharacterClass if (typeof element === 'object' && 'chars' in element) { return encodeCharClass(element); } @@ -119,3 +111,8 @@ function isAtomicPattern(pattern: string): boolean { return false; } + +// Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping +function escapeText(text: string) { + return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string +} diff --git a/src/utils/elements.ts b/src/utils/elements.ts deleted file mode 100644 index c9eb283..0000000 --- 
a/src/utils/elements.ts +++ /dev/null @@ -1,5 +0,0 @@ -import type { RegexElement, RegexSequence } from '../types'; - -export function ensureArray(sequence: RegexSequence): RegexElement[] { - return Array.isArray(sequence) ? sequence : [sequence]; -} diff --git a/src/utils/text.ts b/src/utils/text.ts deleted file mode 100644 index 9187463..0000000 --- a/src/utils/text.ts +++ /dev/null @@ -1,4 +0,0 @@ -// Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping -export function escapeText(text: string) { - return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string -} From 8a1e51087bfe525c05c5b9c371344aca9e7e012f Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 12:53:54 +0200 Subject: [PATCH 16/18] chore: tweaks --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 05acca3..d3c0fa7 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ TS Regex Builder allows you to build complex regular expressions using domain-sp Terminology: -- regex construct - common name for all regex constructs like character classes, quantifiers, and anchors. +- regex construct (`RegexConstruct`) - common name for all regex constructs like character classes, quantifiers, and anchors. - regex element (`RegexElement`) - a fundamental building block of a regular expression, defined as either a regex construct, a string, or `RegExp` literal (`/.../`). - regex sequence (`RegexSequence`) - a sequence of regex elements forming a regular expression. For developer convenience, it also accepts a single element instead of an array. 
From 5c0d92350d69a2dd04fc69d15d477c88928638e3 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 12:56:39 +0200 Subject: [PATCH 17/18] refactor: self code review --- src/builders.ts | 6 +++--- src/constructs/capture.ts | 9 ++++----- src/constructs/choice-of.ts | 4 ++-- src/constructs/lookahead.ts | 4 ++-- src/constructs/lookbehind.ts | 4 ++-- src/constructs/negative-lookahead.ts | 4 ++-- src/constructs/negative-lookbehind.ts | 4 ++-- src/constructs/regex.ts | 4 ++-- src/encoder.ts | 12 ++++++------ 9 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/builders.ts b/src/builders.ts index 15435cf..5568761 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,5 +1,5 @@ import type { RegexFlags, RegexSequence } from './types'; -import { encodeSequence } from './encoder'; +import { encode } from './encoder'; /** * Generate RegExp object from elements with optional flags. @@ -9,7 +9,7 @@ import { encodeSequence } from './encoder'; * @returns RegExp object */ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { - const pattern = encodeSequence(sequence).pattern; + const pattern = encode(sequence).pattern; const flagsString = encodeFlags(flags ?? 
{}); return new RegExp(pattern, flagsString); } @@ -20,7 +20,7 @@ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp * @returns regex pattern string */ export function buildPattern(sequence: RegexSequence): string { - return encodeSequence(sequence).pattern; + return encode(sequence).pattern; } function encodeFlags(flags: RegexFlags): string { diff --git a/src/constructs/capture.ts b/src/constructs/capture.ts index d0abdb8..3814866 100644 --- a/src/constructs/capture.ts +++ b/src/constructs/capture.ts @@ -1,6 +1,5 @@ -import { encodeSequence } from '../encoder'; -import type { EncodedRegex } from '../types'; -import type { RegexSequence } from '../types'; +import { encode } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; export type CaptureOptions = { /** @@ -23,13 +22,13 @@ export function capture(sequence: RegexSequence, options?: CaptureOptions): Enco if (name) { return { precedence: 'atom', - pattern: `(?<${name}>${encodeSequence(sequence).pattern})`, + pattern: `(?<${name}>${encode(sequence).pattern})`, }; } return { precedence: 'atom', - pattern: `(${encodeSequence(sequence).pattern})`, + pattern: `(${encode(sequence).pattern})`, }; } diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index 0b03698..40be23a 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encode } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { @@ -6,7 +6,7 @@ export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { throw new Error('`choiceOf` should receive at least one alternative'); } - const encodedAlternatives = alternatives.map((c) => encodeSequence(c)); + const encodedAlternatives = alternatives.map((c) => encode(c)); if (encodedAlternatives.length === 1) { return 
encodedAlternatives[0]!; } diff --git a/src/constructs/lookahead.ts b/src/constructs/lookahead.ts index f944d12..6180033 100644 --- a/src/constructs/lookahead.ts +++ b/src/constructs/lookahead.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encode } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function lookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?=${encodeSequence(sequence).pattern})`, + pattern: `(?=${encode(sequence).pattern})`, }; } diff --git a/src/constructs/lookbehind.ts b/src/constructs/lookbehind.ts index b7b022c..9187bed 100644 --- a/src/constructs/lookbehind.ts +++ b/src/constructs/lookbehind.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encode } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function lookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?<=${encodeSequence(sequence).pattern})`, + pattern: `(?<=${encode(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookahead.ts b/src/constructs/negative-lookahead.ts index e321089..5694ca6 100644 --- a/src/constructs/negative-lookahead.ts +++ b/src/constructs/negative-lookahead.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encode } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function negativeLookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?!${encodeSequence(sequence).pattern})`, + pattern: `(?!${encode(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookbehind.ts 
b/src/constructs/negative-lookbehind.ts index ce647f3..b0264f3 100644 --- a/src/constructs/negative-lookbehind.ts +++ b/src/constructs/negative-lookbehind.ts @@ -1,4 +1,4 @@ -import { encodeSequence } from '../encoder'; +import { encode } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; /** @@ -18,6 +18,6 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function negativeLookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(? encodeElement(n)); @@ -21,6 +16,11 @@ export function encodeSequence(sequence: RegexSequence): EncodedRegex { }; } +export function encodeAtomic(sequence: RegexSequence): string { + const encoded = encode(sequence); + return encoded.precedence === 'atom' ? encoded.pattern : `(?:${encoded.pattern})`; +} + function encodeElement(element: RegexElement): EncodedRegex { if (typeof element === 'string') { return encodeText(element); From f8f76f38078d0b56847fedab17698135ff16f25c Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 13:02:58 +0200 Subject: [PATCH 18/18] refactor: final tweaks --- src/encoder.ts | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/src/encoder.ts b/src/encoder.ts index 1a0a370..9a7d967 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -48,16 +48,9 @@ function encodeText(text: string): EncodedRegex { throw new Error('`encodeText`: received text should not be empty'); } - // Optimize for single character case - if (text.length === 1) { - return { - precedence: 'atom', - pattern: escapeText(text), - }; - } - return { - precedence: 'sequence', + // Optimize for single character case + precedence: text.length === 1 ? 
'atom' : 'sequence', pattern: escapeText(text), }; } @@ -65,13 +58,30 @@ function encodeText(text: string): EncodedRegex { function encodeRegExp(regexp: RegExp): EncodedRegex { const pattern = regexp.source; - // Encode at safe precedence return { + // Encode at safe precedence precedence: isAtomicPattern(pattern) ? 'atom' : 'disjunction', pattern, }; } +// This is intended to catch only some popular atomic patterns like char classes. +function isAtomicPattern(pattern: string): boolean { + if (pattern.length === 1) { + return true; + } + + if (pattern.startsWith('[') && pattern.endsWith(']') && pattern.match(/[[\]]/g)?.length === 2) { + return true; + } + + if (pattern.startsWith('(') && pattern.endsWith(')') && pattern.match(/[()]/g)?.length === 2) { + return true; + } + + return false; +} + export function encodeCharClass(element: CharacterClass, isNegated?: boolean): EncodedRegex { if (!element.chars.length && !element.ranges?.length) { throw new Error('Character class should contain at least one character or character range'); @@ -95,23 +105,6 @@ export function encodeCharClass(element: CharacterClass, isNegated?: boolean): E }; } -// This is intended to catch only some popular atomic patterns like char classes. -function isAtomicPattern(pattern: string): boolean { - if (pattern.length === 1) { - return true; - } - - if (pattern.startsWith('[') && pattern.endsWith(']') && pattern.match(/[[\]]/g)?.length === 2) { - return true; - } - - if (pattern.startsWith('(') && pattern.endsWith(')') && pattern.match(/[()]/g)?.length === 2) { - return true; - } - - return false; -} - // Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping function escapeText(text: string) { return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string