Skip to content

Commit

Permalink
fix: rewrite string enum extractor as a token parser
Browse files Browse the repository at this point in the history
  • Loading branch information
MarshallOfSound committed Sep 23, 2024
1 parent f5d5caa commit 5a0ec10
Show file tree
Hide file tree
Showing 2 changed files with 231 additions and 17 deletions.
70 changes: 70 additions & 0 deletions src/__tests__/markdown-helpers.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,76 @@ def fn():
expect(extractStringEnum('wassup')).toBe(null);
});

// A word that is not a recognised separator between two values ("sometimes") must
// throw. The inline snapshot pins the full message: the offending character that was
// found, the text being parsed as context, and a caret line beneath that context.
it('should error helpfully on invalid value separators', () => {
expect(() => extractStringEnum('Can be `x` sometimes `y'))
.toThrowErrorMatchingInlineSnapshot(`
"Unexpected separator token while extracting string enum, expected a comma or "and" or "or" but found "s"
Context: \`x\` sometimes \`y
^"
`);
});

// The second value opens a backtick that is never closed, so the input ends while a
// value is still being read. The snapshot pins the resulting error message, which
// hints at the missing closing quote and echoes the text being parsed.
it('should error helpfully on unterminated enum strings', () => {
expect(() => extractStringEnum('Can be `x` or `y')).toThrowErrorMatchingInlineSnapshot(`
"Unexpected early termination of token sequence while extracting string enum, did you forget to close a quote?
Context: \`x\` or \`y"
`);
});

describe('mixed ticks', () => {
  // Each value here contains a quote character that differs from the one wrapping it;
  // the expected values keep that inner character verbatim.
  it('should extract an enum when mixed quotes are used', () => {
    const extracted = extractStringEnum('Can be `x"` or "`y"')!;

    expect(extracted).not.toBe(null);
    expect(extracted).toHaveLength(2);

    const [first, second] = extracted;
    expect(first.value).toBe('x"');
    expect(second.value).toBe('`y');
  });
});

describe('deprecated wrappers', () => {
  // A value wrapped in markdown strikethrough (~~) is still expected to be reported,
  // with the wrapper characters removed from the value.
  it('should handle strikethrough deprecation wrappers', () => {
    const extracted = extractStringEnum('Can be `x` or ~~`y`~~')!;

    expect(extracted).not.toBe(null);
    expect(extracted).toHaveLength(2);

    const [first, second] = extracted;
    expect(first.value).toBe('x');
    expect(second.value).toBe('y');
  });
});

describe('lead-in descriptions', () => {
  // Every case in this group lists the same two values (`x` then `y`) and then carries
  // on into free-form prose; only the punctuation that ends the list differs.
  const expectXThenY = (description: string): void => {
    const extracted = extractStringEnum(description)!;
    expect(extracted).not.toBe(null);
    expect(extracted).toHaveLength(2);
    expect(extracted[0].value).toBe('x');
    expect(extracted[1].value).toBe('y');
  };

  it('should handle value lists that smoothly lead in to prose with a comma', () => {
    expectXThenY('Can be `x` or `y`, where `x` implies that...');
  });

  it('should handle value lists that smoothly lead in to prose with a fullstop', () => {
    expectXThenY('Can be `x` or `y`. The `x` value implies that...');
  });

  it('should handle value lists that smoothly lead in to prose with a semicolon', () => {
    expectXThenY('Can be `x` or `y`; the `x` value implies that...');
  });

  it('should handle value lists that smoothly lead in to prose with a hyphen', () => {
    expectXThenY('Can be `x` or `y` - the `x` value implies that...');
  });
});

describe('with backticks', () => {
it('should extract an enum of the format "can be x"', () => {
const values = extractStringEnum('Can be `x`')!;
Expand Down
178 changes: 161 additions & 17 deletions src/markdown-helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -453,28 +453,172 @@ export enum StripReturnTypeBehavior {
DO_NOT_STRIP,
}

// All possible value separators, sorted by reverse length to ensure
// that we match the longer comma prefix variants first if they are present
const niceSeparators = [',', 'and', 'or', ', and', ', or'].sort((a, b) => b.length - a.length);
// Some string enums can also be objects, the final phrase is "or an object" and we
// should gracefully terminate in that case
const niceTerminators = [', or an Object', 'or an Object'].sort((a, b) => b.length - a.length);
// Annotations that may trail a value; they are skipped without ending the list
const suffixesToIgnore = ['(Deprecated)'];

/**
 * Extracts the list of quoted string values from a description such as
 * "Can be `x`, `y` or `z`".
 *
 * The text following "can be" / "value(s) include(s)" (matched case-insensitively) is
 * scanned one character at a time: a quote character (`, " or ') opens a value which
 * runs until the same quote character is seen again, values must be separated by one
 * of `niceSeparators`, and the list ends at `.`, `;`, `-`, one of `niceTerminators`,
 * or at prose that follows a bare comma.
 *
 * @param description - The free-form description to scan.
 * @returns One entry per extracted value (each with an empty `description`), or `null`
 *   when the lead-in phrase is absent, when the text after it does not start like a
 *   value list, or when no values were found.
 * @throws Error when the value list is malformed: an unknown separator, a mismatched
 *   `~` wrapper, an unexpected character, or a value whose quote is never closed.
 */
export const extractStringEnum = (description: string): PossibleStringValue[] | null => {
  const inlineValuesLocatorPattern = /(?:can be|values? includes?) (.+)/i;
  const locatorMatch = inlineValuesLocatorPattern.exec(description);
  if (!locatorMatch) return null;

  // Everything after the lead-in phrase; also echoed back in error messages.
  const context = locatorMatch[1];
  const quoteChars = ['"', "'", '`'];

  // First candidate (in list order) that appears in `context` starting at `start`.
  const findLiteralAt = (candidates: string[], start: number): string | undefined =>
    candidates.find((candidate) => context.startsWith(candidate, start));

  const state = {
    // Index of the NEXT character to read (already advanced past `char` below)
    position: 0,
    values: [] as string[],
    currentValue: '',
    currentQuoter: null as null | string,
    currentQuoterWrappers: [] as string[],
    expectingNiceSeparator: false,
    couldBeDone: false,
  };

  while (state.position < context.length) {
    const char = context.charAt(state.position);
    state.position++;

    if (state.currentQuoter) {
      // We should never expect a separator inside a quoted value
      if (state.expectingNiceSeparator) {
        throw new Error(
          'Unexpected parser state while extracting string enum: expecting a separator while inside a quoted value',
        );
      }
      if (char === state.currentQuoter) {
        // Closing quote: commit the value and require a separator next
        state.currentQuoter = null;
        state.values.push(state.currentValue);
        state.currentValue = '';
        state.expectingNiceSeparator = true;
      } else {
        state.currentValue += char;
      }
      continue;
    }

    // Whitespace can be skipped
    if (char === ' ') {
      continue;
    }

    // If we're between values we should be expecting one of the above "nice"
    // separators.
    if (state.expectingNiceSeparator) {
      // Before checking for a separator we need to ensure we have unwrapped any wrapping
      // chars
      if (state.currentQuoterWrappers.length) {
        const expectedUnwrap = state.currentQuoterWrappers.pop();
        if (char !== expectedUnwrap) {
          // The context line starts with `Context: "` (10 chars) and `position` is one
          // past the offending character, hence the offset of 9.
          throw new Error(
            `Unexpected token while extracting string enum. Expected an unwrapping token that matched "${expectedUnwrap}". But found token: ${char}\nContext: "${context}"\n${' '.repeat(
              9 + state.position,
            )}^`,
          );
        }
        continue;
      }

      // Sentence punctuation ends the value list; whatever follows is prose
      if (char === '.' || char === ';' || char === '-') {
        break;
      }

      // Index of `char` itself; multi-character literals are matched from here
      const charIndex = state.position - 1;

      const ignoredSuffix = findLiteralAt(suffixesToIgnore, charIndex);
      if (ignoredSuffix !== undefined) {
        // Skip the annotation; a separator is still required afterwards
        state.position += ignoredSuffix.length - 1;
        continue;
      }

      const terminator = findLiteralAt(niceTerminators, charIndex);
      if (terminator !== undefined) {
        state.position += terminator.length - 1;
        state.expectingNiceSeparator = false;
        state.couldBeDone = true;
        continue;
      }

      const separator = findLiteralAt(niceSeparators, charIndex);
      if (separator !== undefined) {
        state.position += separator.length - 1;
        state.expectingNiceSeparator = false;
        // A bare comma may equally introduce trailing prose rather than another value
        if (separator === ',') {
          state.couldBeDone = true;
        }
        continue;
      }

      // The context line starts with `Context: ` (9 chars, no quote), hence 8.
      throw new Error(
        `Unexpected separator token while extracting string enum, expected a comma or "and" or "or" but found "${char}"\nContext: ${context}\n${' '.repeat(
          8 + state.position,
        )}^`,
      );
    }

    if (quoteChars.includes(char)) {
      // Quote chars start a new value
      state.currentQuoter = char;
      // A new value has started, we no longer could be done on an invalid char
      state.couldBeDone = false;
      continue;
    }
    if (char === '~') {
      // Deprecated string enum values are wrapped with strikethrough
      state.currentQuoterWrappers.push(char);
      continue;
    }
    // If we are at the very start we should just assume our heuristic found something silly
    // and bail, 0 valid characters is skippable
    if (state.position === 1) {
      return null;
    }
    // If the last thing we parsed _could_ have been a termination character
    // let's assume an invalid character here confirms that.
    if (state.couldBeDone) {
      break;
    }
    // Anything else is unexpected
    throw new Error(
      `Unexpected token while extracting string enum. Token: ${char}\nContext: "${context}"\n${' '.repeat(
        9 + state.position,
      )}^`,
    );
  }

  // Reached the end of the description, we should check
  // if we are in a clean state (not inside a quote).
  // If so we're good, if not hard error
  if (state.currentQuoter || state.currentValue) {
    throw new Error(
      `Unexpected early termination of token sequence while extracting string enum, did you forget to close a quote?\nContext: ${context}`,
    );
  }

  // No options we should just bail, can't have a string enum with 0 options
  if (!state.values.length) {
    return null;
  }

  return state.values.map((value) => ({
    value,
    description: '',
  }));
};

export const extractReturnType = (
Expand Down

0 comments on commit 5a0ec10

Please sign in to comment.