From 711df7382d10d5db9113c28cb52564a0b168e875 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Wed, 27 Mar 2024 14:21:42 +0100 Subject: [PATCH 1/2] fix: proper encoding for `any('.')` --- .../__tests__/character-class.test.ts | 43 ++++++++++++-- src/constructs/character-class.ts | 58 +++++++++++++------ 2 files changed, 78 insertions(+), 23 deletions(-) diff --git a/src/constructs/__tests__/character-class.test.ts b/src/constructs/__tests__/character-class.test.ts index d2ddb9f..6a58d57 100644 --- a/src/constructs/__tests__/character-class.test.ts +++ b/src/constructs/__tests__/character-class.test.ts @@ -15,6 +15,7 @@ import { word, zeroOrMore, } from '../..'; +import { notDigit } from '../../../lib/typescript/src'; test('`any` character class', () => { expect(any).toEqualRegex(/./); @@ -89,6 +90,25 @@ test('`charClass` throws on negated arguments', () => { ); }); +test('`charClass` joins character escapes', () => { + expect(charClass(any)).toEqualRegex(/./); + expect(charClass(word)).toEqualRegex(/\w/); + expect(charClass(digit)).toEqualRegex(/\d/); + expect(charClass(whitespace)).toEqualRegex(/\s/); + expect(charClass(nonWord)).toEqualRegex(/\W/); + expect(charClass(nonDigit)).toEqualRegex(/\D/); + expect(charClass(nonWhitespace)).toEqualRegex(/\S/); + + expect(charClass(any, whitespace)).toEqualRegex(/[.\s]/); + expect(charClass(any, nonWhitespace)).toEqualRegex(/[.\S]/); + + expect(charClass(word, whitespace)).toEqualRegex(/[\w\s]/); + expect(charClass(any, word, digit)).toEqualRegex(/[.\w\d]/); + + expect(charClass(word, digit, whitespace)).toEqualRegex(/[\w\d\s]/); + expect(charClass(any, word, digit, whitespace)).toEqualRegex(/[.\w\d\s]/); +}); + test('`charRange` pattern', () => { expect(charRange('a', 'z')).toEqualRegex(/[a-z]/); expect(['x', charRange('0', '9')]).toEqualRegex(/x[0-9]/); @@ -108,8 +128,8 @@ test('`charRange` throws on incorrect arguments', () => { }); test('`anyOf` pattern', () => { - expect(anyOf('a')).toEqualRegex(/a/); - expect(['x', anyOf('a'), 'x']).toEqualRegex(/xax/); + expect(anyOf('a')).toEqualRegex(/[a]/); + expect(['x', anyOf('a'), 'x']).toEqualRegex(/x[a]x/); expect(anyOf('ab')).toEqualRegex(/[ab]/); expect(['x', anyOf('ab')]).toEqualRegex(/x[ab]/); expect(['x', anyOf('ab'), 'x']).toEqualRegex(/x[ab]x/); @@ -129,10 +149,25 @@ test('`anyOf` pattern moves hyphen to the last position', () => { expect(anyOf('a-bc')).toEqualRegex(/[abc-]/); }); -test('`anyOf` pattern edge case caret and hyphen', () => { +test('`anyOf` pattern edge cases', () => { expect(anyOf('^-')).toEqualRegex(/[\^-]/); expect(anyOf('-^')).toEqualRegex(/[\^-]/); expect(anyOf('-^a')).toEqualRegex(/[a^-]/); + + expect(anyOf('.')).toEqualRegex(/[.]/); + expect(anyOf('*')).toEqualRegex(/[*]/); + expect(anyOf('+')).toEqualRegex(/[+]/); + expect(anyOf('?')).toEqualRegex(/[?]/); + expect(anyOf('^')).toEqualRegex(/[^]/); + expect(anyOf('$')).toEqualRegex(/[$]/); + expect(anyOf('{')).toEqualRegex(/[{]/); + expect(anyOf('}')).toEqualRegex(/[}]/); + expect(anyOf('(')).toEqualRegex(/[(]/); + expect(anyOf(')')).toEqualRegex(/[)]/); + expect(anyOf('|')).toEqualRegex(/[|]/); + expect(anyOf('[')).toEqualRegex(/[[]/); + expect(anyOf(']')).toEqualRegex(/[\]]/); + expect(anyOf('\\')).toEqualRegex(/[\\]/); }); test('`anyOf` throws on empty text', () => { @@ -147,7 +182,7 @@ test('`negated` character class pattern', () => { }); test('`negated` character class pattern double inversion', () => { - expect(negated(negated(anyOf('a')))).toEqualRegex(/a/); + expect(negated(negated(anyOf('a')))).toEqualRegex(/[a]/); expect(negated(negated(anyOf('abc')))).toEqualRegex(/[abc]/); }); diff --git a/src/constructs/character-class.ts b/src/constructs/character-class.ts index 880e52d..7f20894 100644 --- a/src/constructs/character-class.ts +++ b/src/constructs/character-class.ts @@ -3,6 +3,7 @@ import type { RegexConstruct } from '../types'; export interface CharacterClass extends RegexConstruct { type: 'characterClass'; + escape?: string; chars: string[]; ranges: CharacterRange[]; isNegated: boolean; @@ -19,7 +20,8 @@ export interface CharacterRange { export const any: CharacterClass = { type: 'characterClass', - chars: ['.'], + escape: '.', + chars: [], ranges: [], isNegated: false, encode: encodeCharacterClass, @@ -27,7 +29,8 @@ export const any: CharacterClass = { export const digit: CharacterClass = { type: 'characterClass', - chars: ['\\d'], + escape: '\\d', + chars: [], ranges: [], isNegated: false, encode: encodeCharacterClass, @@ -35,7 +38,8 @@ export const digit: CharacterClass = { export const nonDigit: CharacterClass = { type: 'characterClass', - chars: ['\\D'], + escape: '\\D', + chars: [], ranges: [], isNegated: false, encode: encodeCharacterClass, @@ -43,7 +47,8 @@ export const nonDigit: CharacterClass = { export const word: CharacterClass = { type: 'characterClass', - chars: ['\\w'], + escape: '\\w', + chars: [], ranges: [], isNegated: false, encode: encodeCharacterClass, @@ -51,7 +56,8 @@ export const word: CharacterClass = { export const nonWord: CharacterClass = { type: 'characterClass', - chars: ['\\W'], + escape: '\\W', + chars: [], ranges: [], isNegated: false, encode: encodeCharacterClass, @@ -59,7 +65,8 @@ export const nonWord: CharacterClass = { export const whitespace: CharacterClass = { type: 'characterClass', - chars: ['\\s'], + escape: '\\s', + chars: [], ranges: [], isNegated: false, encode: encodeCharacterClass, @@ -67,7 +74,8 @@ export const whitespace: CharacterClass = { export const nonWhitespace: CharacterClass = { type: 'characterClass', - chars: ['\\S'], + escape: '\\S', + chars: [], ranges: [], isNegated: false, encode: encodeCharacterClass, @@ -89,15 +97,17 @@ export const notWord = nonWord; export const notWhitespace = nonWhitespace; export function charClass(...elements: CharacterClass[]): CharacterClass { - elements.forEach((element) => { - if (element.isNegated) { - throw new Error('`charClass` should receive only non-negated character classes'); - } - }); + if (elements.some((e) => e.isNegated)) { + throw new Error('`charClass` should receive only non-negated character classes'); + } + + if (elements.length === 1) { + return elements[0]!; + } return { type: 'characterClass', - chars: elements.map((c) => c.chars).flat(), + chars: elements.map((c) => getAllChars(c)).flat(), ranges: elements.map((c) => c.ranges).flat(), isNegated: false, encode: encodeCharacterClass, @@ -158,24 +168,26 @@ export function negated(element: CharacterClass): CharacterClass { export const inverted = negated; function encodeCharacterClass(this: CharacterClass): EncodeResult { - if (this.chars.length === 0 && this.ranges.length === 0) { + if (this.escape === undefined && this.chars.length === 0 && this.ranges.length === 0) { throw new Error('Character class should contain at least one character or character range'); } // Direct rendering for single-character class - if (this.chars.length === 1 && this.ranges?.length === 0 && !this.isNegated) { + if (this.escape !== undefined && !this.chars.length && !this.ranges.length && !this.isNegated) { return { precedence: 'atom', - pattern: this.chars[0]!, + pattern: this.escape, }; } + const allChars = getAllChars(this); + // If passed characters includes hyphen (`-`) it need to be moved to // first (or last) place in order to treat it as hyphen character and not a range. // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types - const hyphen = this.chars.includes('-') ? '-' : ''; - const caret = this.chars.includes('^') ? '^' : ''; - const otherChars = this.chars.filter((c) => c !== '-' && c !== '^').join(''); + const hyphen = allChars.includes('-') ? '-' : ''; + const caret = allChars.includes('^') ? '^' : ''; + const otherChars = allChars.filter((c) => c !== '-' && c !== '^').join(''); const ranges = this.ranges.map(({ start, end }) => `${start}-${end}`).join(''); const negation = this.isNegated ? '^' : ''; @@ -191,3 +203,11 @@ function encodeCharacterClass(this: CharacterClass): EncodeResult { function escapeForCharacterClass(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } + +function getAllChars(characterClass: CharacterClass) { + if (characterClass.escape === undefined) { + return characterClass.chars; + } + + return [characterClass.escape, ...characterClass.chars]; +} From c1a8a9faef1ecdf804f25072119841c4dbafd04d Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Wed, 27 Mar 2024 14:27:49 +0100 Subject: [PATCH 2/2] chore: fix lint --- src/constructs/__tests__/character-class.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/constructs/__tests__/character-class.test.ts b/src/constructs/__tests__/character-class.test.ts index 6a58d57..9c291d4 100644 --- a/src/constructs/__tests__/character-class.test.ts +++ b/src/constructs/__tests__/character-class.test.ts @@ -15,7 +15,6 @@ import { word, zeroOrMore, } from '../..'; -import { notDigit } from '../../../lib/typescript/src'; test('`any` character class', () => { expect(any).toEqualRegex(/./);