From d0e986512da5da7e2f24bc9b103fbb70d84f4792 Mon Sep 17 00:00:00 2001 From: Samuel Attard Date: Sun, 22 Sep 2024 19:03:04 -0700 Subject: [PATCH] fix: rewrite string enum extractor as a token parser --- src/__tests__/markdown-helpers.spec.ts | 70 +++++++ src/markdown-helpers.ts | 252 +++++++++++++++++++------ 2 files changed, 269 insertions(+), 53 deletions(-) diff --git a/src/__tests__/markdown-helpers.spec.ts b/src/__tests__/markdown-helpers.spec.ts index ff1d531..e010f8c 100644 --- a/src/__tests__/markdown-helpers.spec.ts +++ b/src/__tests__/markdown-helpers.spec.ts @@ -104,6 +104,76 @@ def fn(): expect(extractStringEnum('wassup')).toBe(null); }); + it('should error helpfully on invalid value separators', () => { + expect(() => extractStringEnum('Can be `x` sometimes `y')) + .toThrowErrorMatchingInlineSnapshot(` + "Unexpected separator token while extracting string enum, expected a comma or "and" or "or" but found "s" + Context: \`x\` sometimes \`y + ^" + `); + }); + + it('should error helpfully on unterminated enum strings', () => { + expect(() => extractStringEnum('Can be `x` or `y')).toThrowErrorMatchingInlineSnapshot(` + "Unexpected early termination of token sequence while extracting string enum, did you forget to close a quote? + Context: \`x\` or \`y" + `); + }); + + describe('mixed ticks', () => { + it('should extract an enum when mixed quotes are used', () => { + const values = extractStringEnum('Can be `x"` or "`y"')!; + expect(values).not.toBe(null); + expect(values).toHaveLength(2); + expect(values[0].value).toBe('x"'); + expect(values[1].value).toBe('`y'); + }); + }); + + describe('deprecated wrappers', () => { + it('should handle strikethrough deprecation wrappers', () => { + const values = extractStringEnum('Can be `x` or ~~`y`~~')!; + expect(values).not.toBe(null); + expect(values).toHaveLength(2); + expect(values[0].value).toBe('x'); + expect(values[1].value).toBe('y'); + }); + }); + + describe('lead-in descriptions', () => { + it('should handle value lists that smoothly lead in to prose with a comma', () => { + const values = extractStringEnum('Can be `x` or `y`, where `x` implies that...')!; + expect(values).not.toBe(null); + expect(values).toHaveLength(2); + expect(values[0].value).toBe('x'); + expect(values[1].value).toBe('y'); + }); + + it('should handle value lists that smoothly lead in to prose with a fullstop', () => { + const values = extractStringEnum('Can be `x` or `y`. The `x` value implies that...')!; + expect(values).not.toBe(null); + expect(values).toHaveLength(2); + expect(values[0].value).toBe('x'); + expect(values[1].value).toBe('y'); + }); + + it('should handle value lists that smoothly lead in to prose with a semicolon', () => { + const values = extractStringEnum('Can be `x` or `y`; the `x` value implies that...')!; + expect(values).not.toBe(null); + expect(values).toHaveLength(2); + expect(values[0].value).toBe('x'); + expect(values[1].value).toBe('y'); + }); + + it('should handle value lists that smoothly lead in to prose with a hyphen', () => { + const values = extractStringEnum('Can be `x` or `y` - the `x` value implies that...')!; + expect(values).not.toBe(null); + expect(values).toHaveLength(2); + expect(values[0].value).toBe('x'); + expect(values[1].value).toBe('y'); + }); + }); + describe('with backticks', () => { it('should extract an enum of the format "can be x"', () => { const values = extractStringEnum('Can be `x`')!; diff --git a/src/markdown-helpers.ts b/src/markdown-helpers.ts index 9165a87..05e018e 100644 --- a/src/markdown-helpers.ts +++ b/src/markdown-helpers.ts @@ -35,7 +35,7 @@ export const parseHeadingTags = (tags: string | null): DocumentationTag[] => { parsedTags.push(match[1] as keyof typeof tagMap); } - return parsedTags.map(value => { + return parsedTags.map((value) => { if (tagMap[value]) return tagMap[value]; throw new Error( @@ -45,7 +45,7 @@ export const parseHeadingTags = (tags: string | null): DocumentationTag[] => { }; export const findNextList = (tokens: Token[]) => { - const start = tokens.findIndex(t => t.type === 'bullet_list_open'); + const start = tokens.findIndex((t) => t.type === 'bullet_list_open'); if (start === -1) return null; let opened = 1; let end = -1; @@ -63,7 +63,7 @@ export const findNextList = (tokens: Token[]) => { }; export const findFirstHeading = (tokens: Token[]) => { - const open = tokens.findIndex(token => token.type === 'heading_open'); + const open = tokens.findIndex((token) => token.type === 'heading_open'); expect(open).to.not.equal(-1, "expected to find a heading token but couldn't"); expect(tokens).to.have.lengthOf.at.least(open + 2); expect(tokens[open + 2].type).to.equal('heading_close'); @@ -89,16 +89,16 @@ export const findContentAfterList = (tokens: Token[], returnAllOnNoList = false) } if (start === -1) { if (!returnAllOnNoList) return []; - start = tokens.findIndex(t => t.type === 'heading_close'); + start = tokens.findIndex((t) => t.type === 'heading_close'); } - const end = tokens.slice(start).findIndex(t => t.type === 'heading_open'); + const end = tokens.slice(start).findIndex((t) => t.type === 'heading_open'); if (end === -1) return tokens.slice(start + 1); return tokens.slice(start + 1, end); }; export const findContentAfterHeadingClose = (tokens: Token[]) => { - const start = tokens.findIndex(t => t.type === 'heading_close'); - const end = tokens.slice(start).findIndex(t => t.type === 'heading_open'); + const start = tokens.findIndex((t) => t.type === 'heading_close'); + const end = tokens.slice(start).findIndex((t) => t.type === 'heading_open'); if (end === -1) return tokens.slice(start + 1); return tokens.slice(start + 1, end); }; @@ -118,14 +118,14 @@ export const headingsAndContent = (tokens: Token[]): HeadingContent[] => { const headingTokens = tokens.slice( start + 1, start + - tokens.slice(start).findIndex(t => t.type === 'heading_close' && t.level === token.level), + tokens.slice(start).findIndex((t) => t.type === 'heading_close' && t.level === token.level), ); const startLevel = parseInt(token.tag.replace('h', ''), 10); const content = tokens.slice(start + headingTokens.length); const contentEnd = content.findIndex( - t => t.type === 'heading_open' && parseInt(t.tag.replace('h', ''), 10) <= startLevel, + (t) => t.type === 'heading_open' && parseInt(t.tag.replace('h', ''), 10) <= startLevel, ); groups.push({ @@ -140,7 +140,7 @@ export const headingsAndContent = (tokens: Token[]): HeadingContent[] => { }; const getConstructorHeaderInGroups = (groups: HeadingContent[]) => { - return groups.find(group => group.heading.startsWith('`new ') && group.level === 3); + return groups.find((group) => group.heading.startsWith('`new ') && group.level === 3); }; export const findConstructorHeader = (tokens: Token[]) => { @@ -165,7 +165,7 @@ export const getContentBeforeFirstHeadingMatching = ( return groups.slice( 0, - groups.findIndex(g => matcher(g.heading)), + groups.findIndex((g) => matcher(g.heading)), ); }; @@ -175,7 +175,7 @@ export const findContentInsideHeader = ( expectedLevel: number, ) => { const group = headingsAndContent(tokens).find( - g => g.heading === expectedHeader && g.level === expectedLevel, + (g) => g.heading === expectedHeader && g.level === expectedLevel, ); if (!group) return null; return group.content; @@ -205,7 +205,7 @@ export const safelySeparateTypeStringOn = (typeString: string, targetChar: strin } } types.push(current); - return types.map(t => t.trim()).filter(t => !!t); + return types.map((t) => t.trim()).filter((t) => !!t); }; export const getTopLevelMultiTypes = (typeString: string) => { @@ -286,7 +286,7 @@ export const rawTypeToTypeInformation = ( index === multiTypes.length - 1 && !wasBracketWrapped && collection ? '[]' : '' }`, ) - .map(multiType => rawTypeToTypeInformation(multiType, relatedDescription, subTypedKeys)), + .map((multiType) => rawTypeToTypeInformation(multiType, relatedDescription, subTypedKeys)), }; } @@ -296,7 +296,7 @@ export const rawTypeToTypeInformation = ( type: 'Function', parameters: subTypedKeys && !subTypedKeys.consumed - ? consumeTypedKeysList(subTypedKeys).map(typedKey => ({ + ? consumeTypedKeysList(subTypedKeys).map((typedKey) => ({ name: typedKey.key, description: typedKey.description, required: typedKey.required, @@ -311,7 +311,7 @@ export const rawTypeToTypeInformation = ( type: 'Object', properties: subTypedKeys && !subTypedKeys.consumed - ? consumeTypedKeysList(subTypedKeys).map(typedKey => ({ + ? consumeTypedKeysList(subTypedKeys).map((typedKey) => ({ name: typedKey.key, description: typedKey.description, required: typedKey.required, @@ -326,13 +326,13 @@ export const rawTypeToTypeInformation = ( type: 'String', possibleValues: subTypedKeys && !subTypedKeys.consumed - ? consumeTypedKeysList(subTypedKeys).map(typedKey => ({ + ? consumeTypedKeysList(subTypedKeys).map((typedKey) => ({ value: typedKey.key, description: typedKey.description, })) : relatedDescription - ? extractStringEnum(relatedDescription) - : null, + ? extractStringEnum(relatedDescription) + : null, }; } @@ -340,21 +340,23 @@ export const rawTypeToTypeInformation = ( if (genericTypeMatch) { const genericTypeString = genericTypeMatch.outerType; const innerTypes = getTopLevelOrderedTypes(genericTypeMatch.genericType) - .map(t => rawTypeToTypeInformation(t.trim(), '', null)) - .map(info => { + .map((t) => rawTypeToTypeInformation(t.trim(), '', null)) + .map((info) => { if (info.type === 'Object') { return { ...info, type: 'Object', properties: subTypedKeys && !subTypedKeys.consumed - ? consumeTypedKeysList(subTypedKeys).map(typedKey => ({ - name: typedKey.key, - description: typedKey.description, - required: typedKey.required, - additionalTags: typedKey.additionalTags, - ...typedKey.type, - })) + ? consumeTypedKeysList(subTypedKeys).map( + (typedKey) => ({ + name: typedKey.key, + description: typedKey.description, + required: typedKey.required, + additionalTags: typedKey.additionalTags, + ...typedKey.type, + }), + ) : [], }; } @@ -393,7 +395,7 @@ export const rawTypeToTypeInformation = ( collection, type: 'Event', eventProperties: consumeTypedKeysList(subTypedKeys).map( - typedKey => ({ + (typedKey) => ({ name: typedKey.key, description: typedKey.description, required: typedKey.required, @@ -417,12 +419,14 @@ export const rawTypeToTypeInformation = ( // If no param types are provided in the syntax then we should fallback to the normal one genericProvidedParams.length === 0 ? subTypedKeys && !subTypedKeys.consumed - ? consumeTypedKeysList(subTypedKeys).map(typedKey => ({ - name: typedKey.key, - description: typedKey.description, - required: typedKey.required, - ...typedKey.type, - })) + ? consumeTypedKeysList(subTypedKeys).map( + (typedKey) => ({ + name: typedKey.key, + description: typedKey.description, + required: typedKey.required, + ...typedKey.type, + }), + ) : [] : (genericProvidedParams as MethodParameterDocumentation[]), returns: innerTypes[innerTypes.length - 1], @@ -453,28 +457,170 @@ export enum StripReturnTypeBehavior { DO_NOT_STRIP, } +// All possible value separators, sorted by reverse length to ensure +// that we match the longer comma prefix variants first if they are present +const niceSeparators = [',', 'and', 'or', ', and', ', or'].sort((a, b) => b.length - a.length); +// Some string enums can also be objects, the final phrase is "or an object" and we +// should gracefully terminate in that case +const niceTerminators = [', or an Object', 'or an Object'].sort((a, b) => b.length - a.length); +const suffixesToIgnore = ['(Deprecated)']; + export const extractStringEnum = (description: string): PossibleStringValue[] | null => { - const possibleValues: PossibleStringValue[] = []; - - const inlineValuesPattern = /(?:can be|values? includes?) ((?:(?:[`"'][a-zA-Z0-9-_\.:]+[`"'])(?:(, | )?))*(?:(?:or|and) [`"'][a-zA-Z0-9-_\.:]+[`"'])?)/i; - const inlineMatch = inlineValuesPattern.exec(description); - if (inlineMatch) { - const valueString = inlineMatch[1]; - const valuePattern = /[`"']([a-zA-Z0-9-_\.:]+)[`"']/g; - let value = valuePattern.exec(valueString); - - while (value) { - possibleValues.push({ - value: value[1], - description: '', - }); - value = valuePattern.exec(valueString); + const inlineValuesLocatorPattern = /(?:can be|values? includes?) (.+)/i; + const locatorMatch = inlineValuesLocatorPattern.exec(description); + if (!locatorMatch) return null; + + const valuesTokens = locatorMatch[1].split(''); + + const state = { + position: 0, + values: [] as string[], + currentValue: '', + currentQuoter: null as null | string, + currentQuoterWrappers: [] as string[], + expectingNiceSeparator: false, + couldBeDone: false, + }; + stringEnumTokenLoop: while (state.position < valuesTokens.length) { + const char = valuesTokens[state.position]; + state.position++; + + if (state.currentQuoter) { + // We should never expect a separator inside a quoted value + if (state.expectingNiceSeparator) { + throw new Error('wat'); + } + if (char === state.currentQuoter) { + state.currentQuoter = null; + state.values.push(state.currentValue); + state.currentValue = ''; + state.expectingNiceSeparator = true; + } else { + state.currentValue += char; + } + } else { + // Whitespace can be skipped + if (char === ' ') { + continue stringEnumTokenLoop; + } + + // If we're between values we should be expecting one of the above "nice" + // separators. + if (state.expectingNiceSeparator) { + // Before checking for a separator we need to ensure we have unwrapped any wrapping + // chars + if (state.currentQuoterWrappers.length) { + const expectedUnwrap = state.currentQuoterWrappers.pop(); + if (char !== expectedUnwrap) { + throw new Error( + `Unexpected token while extracting string enum. Expected an unwrapping token that matched "${expectedUnwrap}". But found token: ${char}\nContext: "${ + locatorMatch[1] + }"\n${' '.repeat(8 + state.position)}^`, + ); + } + continue stringEnumTokenLoop; + } + + if (char === '.' || char === ';' || char === '-') { + break stringEnumTokenLoop; + } + + for (const suffix of suffixesToIgnore) { + if ( + [char, ...valuesTokens.slice(state.position, state.position + suffix.length - 1)].join( + '', + ) === suffix + ) { + state.position += suffix.length - 1; + continue stringEnumTokenLoop; + } + } + + for (const niceTerminator of niceTerminators) { + if ( + [ + char, + ...valuesTokens.slice(state.position, state.position + niceTerminator.length - 1), + ].join('') === niceTerminator + ) { + state.position += niceTerminator.length - 1; + state.expectingNiceSeparator = false; + state.couldBeDone = true; + continue stringEnumTokenLoop; + } + } + + for (const niceSeparator of niceSeparators) { + if ( + [ + char, + ...valuesTokens.slice(state.position, state.position + niceSeparator.length - 1), + ].join('') === niceSeparator + ) { + state.position += niceSeparator.length - 1; + state.expectingNiceSeparator = false; + if (niceSeparator === ',') { + state.couldBeDone = true; + } + continue stringEnumTokenLoop; + } + } + throw new Error( + `Unexpected separator token while extracting string enum, expected a comma or "and" or "or" but found "${char}"\nContext: ${ + locatorMatch[1] + }\n${' '.repeat(8 + state.position)}^`, + ); + } + + if (['"', "'", '`'].includes(char)) { + // Quote chars start a new value + state.currentQuoter = char; + // A new value has started, we no longer could be done on an invalid char + state.couldBeDone = false; + continue stringEnumTokenLoop; + } + if (['~'].includes(char)) { + // Deprecated string enum values are wrapped with strikethrough + state.currentQuoterWrappers.push(char); + continue stringEnumTokenLoop; + } + // If we are at the very start we should just assume our heuristic found something silly + // and bail, 0 valid characters is skip-able + if (state.position === 1) { + return null; + } + // If the last thing we parsed _could_ have been a termination character + // let's assume an invalid character here confirms that. + if (state.couldBeDone) { + break stringEnumTokenLoop; + } + // Anything else is unexpected + throw new Error( + `Unexpected token while extracting string enum. Token: ${char}\nContext: "${ + locatorMatch[1] + }"\n${' '.repeat(9 + state.position)}^`, + ); } + } - return possibleValues.length === 0 ? null : possibleValues; + // Reached the end of the description, we should check + // if we are in a clean state (not inside a quote). + // If so we're good, if not hard error + if (state.currentQuoter || state.currentValue) { + throw new Error( + `Unexpected early termination of token sequence while extracting string enum, did you forget to close a quote?\nContext: ${locatorMatch[1]}`, + ); + } + + // No options we should just bail, can't have a string enum with 0 options + if (!state.values.length) { + return null; } - return null; + return state.values.map((value) => ({ + value, + description: '', + })); }; export const extractReturnType = ( @@ -829,7 +975,7 @@ export const findProcess = (tokens: Token[]): ProcessBlock => { renderer: false, utility: false, exported: !ptks.some( - ptk => ptk.type === 'text' && ptk.content.startsWith('This class is not exported'), + (ptk) => ptk.type === 'text' && ptk.content.startsWith('This class is not exported'), ), }; for (const ptk of ptks) {