diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 7bb2a6d..ee09984 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -1,4 +1,4 @@ -import { buildRegExp, char, unicodeProperty } from '..'; +import { buildRegExp, unicodeChar, unicodeProperty } from '..'; test('`regexBuilder` flags', () => { expect(buildRegExp('a').flags).toBe(''); @@ -34,22 +34,26 @@ test('`regexBuilder` flags', () => { }); test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => { - expect(() => buildRegExp(char(0x1234))).not.toThrow(); - expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow(); + expect(() => buildRegExp(unicodeChar(0x1234))).not.toThrow(); + expect(() => buildRegExp(unicodeChar(0x12345), { unicode: true })).not.toThrow(); expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); - expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot( + expect(() => buildRegExp(unicodeChar(0x123456))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received 1193046"`, ); - expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot( - `"The pattern "\\u{12345}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + expect(() => buildRegExp(unicodeChar(0x12345))).toThrowErrorMatchingInlineSnapshot( + `"Pattern "\\u{12345}" requires "unicode" flag to be set."`, ); expect(() => buildRegExp(unicodeProperty('Emoji_Presentation')), ).toThrowErrorMatchingInlineSnapshot( - `"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + `"Pattern "\\p{Emoji_Presentation}" requires "unicode" flag to be set."`, ); expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( - `"The pattern "\\P{Letter}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + `"Pattern "\\P{Letter}" requires "unicode" flag to be set."`, ); }); + +test('`regexBuilder` does not throws on tricky unicode mode-like patterns', () => { + expect(() => buildRegExp(/\\u{1234}/)).not.toThrow(); +}); diff --git a/src/builders.ts b/src/builders.ts index 482392f..f65c7fb 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -10,17 +10,9 @@ import { encode } from './encoder'; */ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { const pattern = encode(sequence).pattern; - const flagsString = encodeFlags(flags ?? {}); - - if (!flags?.unicode) { - const unicodeModePattern = getUnicodeModePattern(pattern); - if (unicodeModePattern) { - throw new Error( - `The pattern "${unicodeModePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`, - ); - } - } + ensureUnicodeFlagIfNeeded(pattern, flags); + const flagsString = encodeFlags(flags ?? {}); return new RegExp(pattern, flagsString); } @@ -47,9 +39,16 @@ function encodeFlags(flags: RegexFlags): string { return result; } -const unicodeModePatterns = /(?:\\u|\\p|\\P)\{.+?\}/; +// Matches unicode mode patterns: \u{...}, \p{...}, \P{...}, but avoids valid \\u{...}, etc +const unicodeModePatterns = /(? { }); test('`charClass` throws on empty text', () => { - expect(() => charClass()).toThrowErrorMatchingInlineSnapshot( - `"\`charClass\` should receive at least one element"`, - ); + expect(() => charClass()).toThrowErrorMatchingInlineSnapshot(`"Expected at least one element"`); }); test('`charRange` pattern', () => { @@ -49,15 +46,23 @@ test('`charRange` pattern', () => { expect([charRange('A', 'F'), 'x']).toEqualRegex(/[A-F]x/); }); +test('`charRange` works both ways', () => { + expect(charRange('a', 'z')).toEqualRegex(/[a-z]/); + expect(charRange('z', 'a')).toEqualRegex(/[a-z]/); +}); + test('`charRange` throws on incorrect arguments', () => { - expect(() => charRange('z', 'a')).toThrowErrorMatchingInlineSnapshot( - `"\`start\` should be before or equal to \`end\`"`, - ); expect(() => charRange('aa', 'z')).toThrowErrorMatchingInlineSnapshot( - `"\`charRange\` should receive only single character \`start\` string"`, + `"Expected single characters, but received "aa" & "z""`, ); expect(() => charRange('a', 'zz')).toThrowErrorMatchingInlineSnapshot( - `"\`charRange\` should receive only single character \`end\` string"`, + `"Expected single characters, but received "a" & "zz""`, + ); + expect(() => charRange('', 'z')).toThrowErrorMatchingInlineSnapshot( + `"Expected single characters, but received "" & "z""`, + ); + expect(() => charRange('a', '')).toThrowErrorMatchingInlineSnapshot( + `"Expected single characters, but received "a" & """`, ); }); @@ -105,9 +110,7 @@ test('`anyOf` pattern edge cases', () => { }); test('`anyOf` throws on empty text', () => { - expect(() => anyOf('')).toThrowErrorMatchingInlineSnapshot( - `"\`anyOf\` should received at least one character"`, - ); + expect(() => anyOf('')).toThrowErrorMatchingInlineSnapshot(`"Expected at least one character"`); }); test('`negated` character class pattern', () => { @@ -119,9 +122,3 @@ test('`negated` character class matching', () => { expect(negated(anyOf('a'))).not.toMatchString('aa'); expect(negated(anyOf('a'))).toMatchGroups('aba', ['b']); }); - -test('`encodeCharacterClass` throws on empty text', () => { - expect(() => buildRegExp(negated({ chars: [], ranges: [] }))).toThrowErrorMatchingInlineSnapshot( - `"Character class should contain at least one character or character range"`, - ); -}); diff --git a/src/constructs/__tests__/choice-of.test.ts b/src/constructs/__tests__/choice-of.test.ts index cc2a3a6..3d3a2e5 100644 --- a/src/constructs/__tests__/choice-of.test.ts +++ b/src/constructs/__tests__/choice-of.test.ts @@ -34,6 +34,6 @@ test('`choiceOf` pattern using nested regex', () => { test('`choiceOf` throws on empty options', () => { expect(() => choiceOf()).toThrowErrorMatchingInlineSnapshot( - `"\`choiceOf\` should receive at least one alternative"`, + `"Expected at least one alternative"`, ); }); diff --git a/src/constructs/__tests__/encoder.test.tsx b/src/constructs/__tests__/encoder.test.tsx index 4713160..9ba7c06 100644 --- a/src/constructs/__tests__/encoder.test.tsx +++ b/src/constructs/__tests__/encoder.test.tsx @@ -75,7 +75,7 @@ test('`buildRegExp` throws error on unknown element', () => { // @ts-expect-error intentionally passing incorrect object buildRegExp({ type: 'unknown' }), ).toThrowErrorMatchingInlineSnapshot(` - "\`encodeElement\`: unknown element: { + "Unsupported element. Received: { "type": "unknown" }" `); @@ -83,6 +83,6 @@ test('`buildRegExp` throws error on unknown element', () => { test('`buildPattern` throws on empty text', () => { expect(() => buildPattern('')).toThrowErrorMatchingInlineSnapshot( - `"\`encodeText\`: received text should not be empty"`, + `"Expected at least one character"`, ); }); diff --git a/src/constructs/__tests__/repeat.test.tsx b/src/constructs/__tests__/repeat.test.tsx index af61fa4..c7158aa 100644 --- a/src/constructs/__tests__/repeat.test.tsx +++ b/src/constructs/__tests__/repeat.test.tsx @@ -16,9 +16,7 @@ test('`repeat` pattern optimizes grouping for atoms', () => { }); test('`repeat` throws on no children', () => { - expect(() => repeat([], 1)).toThrowErrorMatchingInlineSnapshot( - `"\`repeat\` should receive at least one element"`, - ); + expect(() => repeat([], 1)).toThrowErrorMatchingInlineSnapshot(`"Expected at least one element"`); }); test('greedy `repeat` quantifier pattern', () => { diff --git a/src/constructs/__tests__/char-escape-unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx similarity index 54% rename from src/constructs/__tests__/char-escape-unicode.test.tsx rename to src/constructs/__tests__/unicode.test.tsx index e7c940e..ca9994c 100644 --- a/src/constructs/__tests__/char-escape-unicode.test.tsx +++ b/src/constructs/__tests__/unicode.test.tsx @@ -5,6 +5,7 @@ import { endOfString, type RegexSequence, startOfString, + unicodeChar, unicodeProperty, } from '../..'; @@ -12,81 +13,91 @@ function u(sequence: RegexSequence) { return buildRegExp(sequence, { unicode: true }); } -test('`char` pattern', () => { +test('`unicodeChar` pattern', () => { // eslint-disable-next-line no-control-regex - expect(char(0)).toEqualRegex(/\u0000/); + expect(unicodeChar(0)).toEqualRegex(/\u0000/); // eslint-disable-next-line no-control-regex - expect(char(0x1)).toEqualRegex(/\u0001/); + expect(unicodeChar(0x1)).toEqualRegex(/\u0001/); // eslint-disable-next-line no-control-regex - expect(char(0x12)).toEqualRegex(/\u0012/); - expect(char(0x123)).toEqualRegex(/\u0123/); - expect(char(0x1234)).toEqualRegex(/\u1234/); + expect(unicodeChar(0x12)).toEqualRegex(/\u0012/); + expect(unicodeChar(0x123)).toEqualRegex(/\u0123/); + expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); // eslint-disable-next-line no-control-regex - expect(u(char(0))).toEqualRegex(new RegExp('\\u0000', 'u')); + expect(u(unicodeChar(0))).toEqualRegex(new RegExp('\\u0000', 'u')); // eslint-disable-next-line no-control-regex - expect(u(char(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); - expect(u(char(0x12))).toEqualRegex( + expect(u(unicodeChar(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); + expect(u(unicodeChar(0x12))).toEqualRegex( // eslint-disable-next-line no-control-regex new RegExp('\\u0012', 'u'), ); - expect(char(0x0123)).toEqualRegex(/\u0123/); - expect(char(0x1234)).toEqualRegex(/\u1234/); + expect(unicodeChar(0x0123)).toEqualRegex(/\u0123/); + expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); - expect(u(char(0x0123))).toEqualRegex(/\u0123/u); - expect(u(char(0x1234))).toEqualRegex(/\u1234/u); - expect(u(char(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); - expect(u(char(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); + expect(u(unicodeChar(0x0123))).toEqualRegex(/\u0123/u); + expect(u(unicodeChar(0x1234))).toEqualRegex(/\u1234/u); + expect(u(unicodeChar(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); + expect(u(unicodeChar(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); }); -test('`char` matching', () => { - expect(char(0)).toMatchString('\u{0}'); - expect(char(0x1)).toMatchString('\u{1}'); - expect(char(0x12)).toMatchString('\u{12}}'); - expect(char(0x123)).toMatchString('\u{123}'); - expect(char(0x1234)).toMatchString('\u{1234}}'); - - expect(char('a'.codePointAt(0)!)).toMatchString('a'); - expect(char('ą'.codePointAt(0)!)).toMatchString('ą'); - expect(char('©'.codePointAt(0)!)).toMatchString('©'); - - expect(u(char(0))).toMatchString('\u{0}'); - expect(u(char(0))).not.toMatchString('a'); - expect(u(char(0x1))).toMatchString('\u{1}'); - expect(u(char(0x12))).toMatchString('\u{12}'); - expect(u(char(0x123))).toMatchString('\u{123}'); - expect(u(char(0x1234))).toMatchString('\u{1234}'); - expect(u(char(0x12345))).toMatchString('\u{12345}'); - expect(u(char(0x103456))).toMatchString('\u{103456}'); - - expect(u(char('a'.codePointAt(0)!))).toMatchString('a'); - expect(u(char('ą'.codePointAt(0)!))).toMatchString('ą'); - expect(u(char('©'.codePointAt(0)!))).toMatchString('©'); - expect(u(char('😎'.codePointAt(0)!))).toMatchString('😎'); - expect(u(char('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); +test('`unicodeChar` matching', () => { + expect(unicodeChar(0)).toMatchString('\u{0}'); + expect(unicodeChar(0x1)).toMatchString('\u{1}'); + expect(unicodeChar(0x12)).toMatchString('\u{12}}'); + expect(unicodeChar(0x123)).toMatchString('\u{123}'); + expect(unicodeChar(0x1234)).toMatchString('\u{1234}}'); + + expect(unicodeChar('a'.codePointAt(0)!)).toMatchString('a'); + expect(unicodeChar('ą'.codePointAt(0)!)).toMatchString('ą'); + expect(unicodeChar('©'.codePointAt(0)!)).toMatchString('©'); + + expect(u(unicodeChar(0))).toMatchString('\u{0}'); + expect(u(unicodeChar(0))).not.toMatchString('a'); + expect(u(unicodeChar(0x1))).toMatchString('\u{1}'); + expect(u(unicodeChar(0x12))).toMatchString('\u{12}'); + expect(u(unicodeChar(0x123))).toMatchString('\u{123}'); + expect(u(unicodeChar(0x1234))).toMatchString('\u{1234}'); + expect(u(unicodeChar(0x12345))).toMatchString('\u{12345}'); + expect(u(unicodeChar(0x103456))).toMatchString('\u{103456}'); + + expect(u(unicodeChar('a'.codePointAt(0)!))).toMatchString('a'); + expect(u(unicodeChar('ą'.codePointAt(0)!))).toMatchString('ą'); + expect(u(unicodeChar('©'.codePointAt(0)!))).toMatchString('©'); + expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('😎'); + expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); }); -test('`char` nesting matching', () => { - expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('a'); - expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('ą'); - expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).not.toMatchString('b'); +test('`unicodeChar` nesting matching', () => { + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).toMatchString('a'); + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).toMatchString('ą'); + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).not.toMatchString('b'); }); -test('`char` edge cases handling', () => { - expect(() => u(char(NaN))).toThrowErrorMatchingInlineSnapshot( +test('`unicodeChar` edge cases handling', () => { + expect(() => u(unicodeChar(NaN))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received NaN"`, ); - expect(() => u(char(1.5))).toThrowErrorMatchingInlineSnapshot( + expect(() => u(unicodeChar(1.5))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received 1.5"`, ); - expect(() => u(char(-1))).toThrowErrorMatchingInlineSnapshot( + expect(() => u(unicodeChar(-1))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received -1"`, ); - expect(() => u(char(0x110000))).toThrowErrorMatchingInlineSnapshot( + expect(() => u(unicodeChar(0x110000))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received 1114112"`, ); - expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u); + expect(u(unicodeChar(0x10ffff))).toEqualRegex(/\u{10ffff}/u); +}); + +test('"char" alias', () => { + expect(char('a'.codePointAt(0)!)).toEqualRegex(/\u0061/); }); test('`unicodeProperty` pattern', () => { diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index c480d9f..5724503 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -1,50 +1,45 @@ -import { encodeCharClass } from '../encoder'; import type { CharacterClass, CharacterEscape, EncodedRegex } from '../types'; +import { ensureText } from '../utils'; export function charClass(...elements: Array): CharacterClass { if (!elements.length) { - throw new Error('`charClass` should receive at least one element'); + throw new Error('Expected at least one element'); } return { chars: elements.map((c) => c.chars).flat(), ranges: elements.map((c) => c.ranges ?? []).flat(), + encode: encodeCharClass, }; } export function charRange(start: string, end: string): CharacterClass { - if (start.length !== 1) { - throw new Error('`charRange` should receive only single character `start` string'); - } - - if (end.length !== 1) { - throw new Error('`charRange` should receive only single character `end` string'); + if (start.length !== 1 || end.length !== 1) { + throw new Error(`Expected single characters, but received "${start}" & "${end}"`); } if (start > end) { - throw new Error('`start` should be before or equal to `end`'); + [start, end] = [end, start]; } return { chars: [], ranges: [{ start, end }], + encode: encodeCharClass, }; } -export function anyOf(characters: string): CharacterClass { - const chars = characters.split('').map((c) => escapeCharClass(c)); - - if (chars.length === 0) { - throw new Error('`anyOf` should received at least one character'); - } +export function anyOf(chars: string): CharacterClass { + ensureText(chars); return { - chars, + chars: chars.split('').map(escapeChar), + encode: encodeCharClass, }; } export function negated(element: CharacterClass | CharacterEscape): EncodedRegex { - return encodeCharClass(element, true); + return encodeCharClass.call(element, true); } /** @@ -52,6 +47,29 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex */ export const inverted = negated; -function escapeCharClass(text: string): string { +/** Escape chars for usage inside char class */ +function escapeChar(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } + +function encodeCharClass( + this: CharacterClass | CharacterEscape, + isNegated?: boolean, +): EncodedRegex { + // If passed characters includes hyphen (`-`) it need to be moved to + // first (or last) place in order to treat it as hyphen character and not a range. + // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types + const hyphen = this.chars.includes('-') ? '-' : ''; + const caret = this.chars.includes('^') ? '^' : ''; + const otherChars = this.chars.filter((c) => c !== '-' && c !== '^').join(''); + const ranges = this.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; + const negation = isNegated ? '^' : ''; + + let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; + if (pattern === '[^-]') pattern = '[\\^-]'; + + return { + precedence: 'atom', + pattern, + }; +} diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index fcf6be5..77aa2cb 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -59,54 +59,3 @@ export const notWord = nonWord; * @deprecated Renamed to `nonWhitespace`. */ export const notWhitespace = nonWhitespace; - -/** - * Unicode character code point escape. - * - * Regex pattern: - * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. - * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. - * - * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). - * - * @param codePoint The code point of the character to escape. - * @returns A character class representing the unicode escape. - */ -export function char(codePoint: number): CharacterEscape { - if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { - throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); - } - - let escape = - codePoint < 0x10000 - ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) - : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} - -/** - * Unicode property escape matching a set of characters specified by a Unicode property. - * - * Regex pattern: `\p{Property}` or `\p{Property=Value}` - * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape - * - * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). - * - * @param property Unicode property name. - * @param value Unicode property value (optional). - * @returns A character class representing the unicode property escape. - */ -export function unicodeProperty(property: string, value?: string): CharacterEscape { - const escape = `\\p{${property}${value ? `=${value}` : ''}}`; - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index 40be23a..a899c94 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -3,7 +3,7 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { if (alternatives.length === 0) { - throw new Error('`choiceOf` should receive at least one alternative'); + throw new Error('Expected at least one alternative'); } const encodedAlternatives = alternatives.map((c) => encode(c)); diff --git a/src/constructs/quantifiers.ts b/src/constructs/quantifiers.ts index 0fcab70..70e0869 100644 --- a/src/constructs/quantifiers.ts +++ b/src/constructs/quantifiers.ts @@ -1,27 +1,31 @@ import { encodeAtomic } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; +import { ensureElements } from '../utils'; export interface QuantifierOptions { greedy?: boolean; } export function zeroOrMore(sequence: RegexSequence, options?: QuantifierOptions): EncodedRegex { + const elements = ensureElements(sequence); return { precedence: 'sequence', - pattern: `${encodeAtomic(sequence)}*${options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtomic(elements)}*${options?.greedy === false ? '?' : ''}`, }; } export function oneOrMore(sequence: RegexSequence, options?: QuantifierOptions): EncodedRegex { + const elements = ensureElements(sequence); return { precedence: 'sequence', - pattern: `${encodeAtomic(sequence)}+${options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtomic(elements)}+${options?.greedy === false ? '?' : ''}`, }; } export function optional(sequence: RegexSequence, options?: QuantifierOptions): EncodedRegex { + const elements = ensureElements(sequence); return { precedence: 'sequence', - pattern: `${encodeAtomic(sequence)}?${options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtomic(elements)}?${options?.greedy === false ? '?' : ''}`, }; } diff --git a/src/constructs/repeat.ts b/src/constructs/repeat.ts index 2fc30a9..ddb42e3 100644 --- a/src/constructs/repeat.ts +++ b/src/constructs/repeat.ts @@ -1,13 +1,11 @@ import { encodeAtomic } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; +import { ensureElements } from '../utils'; export type RepeatOptions = number | { min: number; max?: number; greedy?: boolean }; export function repeat(sequence: RegexSequence, options: RepeatOptions): EncodedRegex { - const elements = Array.isArray(sequence) ? sequence : [sequence]; - if (elements.length === 0) { - throw new Error('`repeat` should receive at least one element'); - } + const elements = ensureElements(sequence); if (typeof options === 'number') { return { diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts new file mode 100644 index 0000000..c3874ea --- /dev/null +++ b/src/constructs/unicode.ts @@ -0,0 +1,57 @@ +import type { CharacterEscape } from '../types'; + +/** + * Unicode character code point escape. + * + * Regex pattern: + * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. + * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. + * + * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param codePoint The code point of the character to escape. + * @returns A character class representing the unicode escape. + */ +export function unicodeChar(codePoint: number): CharacterEscape { + if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { + throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); + } + + let escape = + codePoint < 0x10000 + ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) + : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} + +/** + * Alias for `unicodeChar`. + */ +export const char = unicodeChar; + +/** + * Unicode property escape matching a set of characters specified by a Unicode property. + * + * Regex pattern: `\p{Property}` or `\p{Property=Value}` + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape + * + * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param property Unicode property name. + * @param value Unicode property value (optional). + * @returns A character class representing the unicode property escape. + */ +export function unicodeProperty(property: string, value?: string): CharacterEscape { + const escape = `\\p{${property}${value ? `=${value}` : ''}}`; + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} diff --git a/src/encoder.ts b/src/encoder.ts index 9a7d967..c2aaeda 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -1,7 +1,8 @@ -import type { CharacterClass, EncodedRegex, RegexElement, RegexSequence } from './types'; +import type { EncodedRegex, RegexElement, RegexSequence } from './types'; +import { ensureElements, ensureText } from './utils'; export function encode(sequence: RegexSequence): EncodedRegex { - const elements = Array.isArray(sequence) ? sequence : [sequence]; + const elements = ensureElements(sequence); const encoded = elements.map((n) => encodeElement(n)); if (encoded.length === 1) { @@ -26,27 +27,27 @@ function encodeElement(element: RegexElement): EncodedRegex { return encodeText(element); } - if (typeof element === 'object' && element instanceof RegExp) { + if (element instanceof RegExp) { return encodeRegExp(element); } - // EncodedRegex - if (typeof element === 'object' && 'pattern' in element) { - return element; - } + if (typeof element === 'object') { + // EncodedRegex + if ('pattern' in element) { + return element; + } - // CharacterClass - if (typeof element === 'object' && 'chars' in element) { - return encodeCharClass(element); + // LazyEncodableRegex + if ('encode' in element) { + return element.encode(); + } } - throw new Error(`\`encodeElement\`: unknown element: ${JSON.stringify(element, null, 2)}`); + throw new Error(`Unsupported element. Received: ${JSON.stringify(element, null, 2)}`); } function encodeText(text: string): EncodedRegex { - if (text.length === 0) { - throw new Error('`encodeText`: received text should not be empty'); - } + ensureText(text); return { // Optimize for single character case @@ -65,44 +66,10 @@ function encodeRegExp(regexp: RegExp): EncodedRegex { }; } -// This is intended to catch only some popular atomic patterns like char classes. +// This is intended to catch only some popular atomic patterns like char classes and groups. function isAtomicPattern(pattern: string): boolean { - if (pattern.length === 1) { - return true; - } - - if (pattern.startsWith('[') && pattern.endsWith(']') && pattern.match(/[[\]]/g)?.length === 2) { - return true; - } - - if (pattern.startsWith('(') && pattern.endsWith(')') && pattern.match(/[()]/g)?.length === 2) { - return true; - } - - return false; -} - -export function encodeCharClass(element: CharacterClass, isNegated?: boolean): EncodedRegex { - if (!element.chars.length && !element.ranges?.length) { - throw new Error('Character class should contain at least one character or character range'); - } - - // If passed characters includes hyphen (`-`) it need to be moved to - // first (or last) place in order to treat it as hyphen character and not a range. - // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types - const hyphen = element.chars.includes('-') ? '-' : ''; - const caret = element.chars.includes('^') ? '^' : ''; - const otherChars = element.chars.filter((c) => c !== '-' && c !== '^').join(''); - const ranges = element.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; - const negation = isNegated ? '^' : ''; - - let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; - if (pattern === '[^-]') pattern = '[\\^-]'; - - return { - precedence: 'atom', - pattern, - }; + // Simple char, char class [...] or group (...) + return pattern.length === 1 || /^\[[^[\]]*\]$/.test(pattern) || /^\([^()]*\)$/.test(pattern); } // Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping diff --git a/src/index.ts b/src/index.ts index 30d6677..048ae92 100644 --- a/src/index.ts +++ b/src/index.ts @@ -28,8 +28,6 @@ export { notDigit, notWhitespace, notWord, - char, - unicodeProperty, } from './constructs/char-escape'; export { choiceOf } from './constructs/choice-of'; export { lookahead } from './constructs/lookahead'; @@ -39,3 +37,4 @@ export { negativeLookbehind } from './constructs/negative-lookbehind'; export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; +export { char, unicodeChar, unicodeProperty } from './constructs/unicode'; diff --git a/src/patterns/hex-color.ts b/src/patterns/hex-color.ts index 67f8efa..6253529 100644 --- a/src/patterns/hex-color.ts +++ b/src/patterns/hex-color.ts @@ -1,11 +1,9 @@ import { buildRegExp } from '../builders'; import { endOfString, startOfString, wordBoundary } from '../constructs/anchors'; -import { charClass, charRange } from '../constructs/char-class'; -import { digit } from '../constructs/char-escape'; import { choiceOf } from '../constructs/choice-of'; import { repeat } from '../constructs/repeat'; -const hexDigit = charClass(digit, charRange('a', 'f')); +const hexDigit = /[0-9a-f]/; /** Find hex color strings in a text. */ export const hexColorFinder = buildRegExp( diff --git a/src/types.ts b/src/types.ts index 81e23a3..f6fd401 100644 --- a/src/types.ts +++ b/src/types.ts @@ -15,7 +15,7 @@ export type RegexElement = RegexConstruct | RegExp | string; /** * Fundamental building block of a regular expression, defined as either an encoded regex or a character class. */ -export type RegexConstruct = EncodedRegex | CharacterClass; +export type RegexConstruct = EncodedRegex | LazyEncodableRegex; /** * Encoded regex pattern with information about its type (atom, sequence) @@ -33,7 +33,11 @@ export interface CharacterEscape extends EncodedRegex { ranges?: never; } -export interface CharacterClass { +export interface LazyEncodableRegex { + encode: () => EncodedRegex; +} + +export interface CharacterClass extends LazyEncodableRegex { chars: string[]; ranges?: CharacterRange[]; } diff --git a/src/utils.ts b/src/utils.ts new file mode 100644 index 0000000..5bc9232 --- /dev/null +++ b/src/utils.ts @@ -0,0 +1,16 @@ +import type { RegexElement, RegexSequence } from './types'; + +export function ensureElements(sequence: RegexSequence): RegexElement[] { + const elements = Array.isArray(sequence) ? sequence : [sequence]; + if (elements.length === 0) { + throw new Error('Expected at least one element'); + } + + return elements; +} + +export function ensureText(text: string): void { + if (text.length === 0) { + throw new Error('Expected at least one character'); + } +} diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md index fc1648b..abccb33 100644 --- a/website/docs/api/unicode.md +++ b/website/docs/api/unicode.md @@ -10,13 +10,15 @@ JavaScript `RegExp` object offers [Unicode-aware](https://developer.mozilla.org/ ### Character escapes ```ts -function char(codePoint: number): CharacterEscape; +function unicodeChar(codePoint: number): CharacterEscape; ``` +Alias: `char`. + Regex syntax: - `\uXXXX`: 4-digit hex escape for code points below 0x10000. -- `\u{X}`: Unicode code point escape for code points above 0xFFFF. +- `\u{XXXXXX}`: Unicode code point escape for code points above 0xFFFF. Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).