-
Notifications
You must be signed in to change notification settings - Fork 6
Myst frontmatter support #899
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
58a21ba
582e2c6
2df0f1c
ac32304
57cf712
56b22c7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,385 @@ | ||
import { | ||
ResearchObject, | ||
ResearchObjectV1, | ||
ResearchObjectV1Author, | ||
ResearchObjectV1Component, | ||
ResearchObjectComponentType, | ||
} from '../ResearchObject'; | ||
import { BaseTransformer } from './BaseTransformer'; | ||
|
||
/** | ||
* Transformer for MyST Markdown format | ||
* | ||
* MyST Markdown is an extension of CommonMark with additional features for scientific and technical documentation. | ||
* It includes frontmatter, directives, roles, and more. | ||
*/ | ||
export class MystTransformer implements BaseTransformer { | ||
/**
 * Import a MyST Markdown string into a ResearchObject.
 *
 * The leading `---`-delimited frontmatter is read with `extractFrontmatter`; its
 * `title`, `description`, `authors`, `keywords`, `tags` and `license` entries are
 * mapped onto the manifest. Whatever follows the frontmatter is attached as a
 * single component with id `content`.
 *
 * @param input MyST Markdown string
 * @returns A version-1 ResearchObject
 * @throws Error if `input` is not a string
 */
importObject(input: string): ResearchObject {
  if (typeof input !== 'string') {
    throw new Error('MystTransformer.importObject expects a string input');
  }

  const { frontmatter, content } = this.extractFrontmatter(input);

  // Frontmatter lists can arrive in several shapes: a bare scalar
  // (`keywords: foo`), an inline array of strings, or a block list whose
  // entries the parser wraps as `{ name }` objects. Flatten all of them to
  // `string[]` and drop empty entries so the manifest never receives a string
  // or an array of objects where a list of strings is expected.
  const toStringList = (raw: unknown): string[] => {
    if (raw == null || raw === '') {
      return [];
    }
    const items: unknown[] = Array.isArray(raw) ? raw : [raw];
    return items
      .map((item) => (item !== null && typeof item === 'object' ? (item as { name?: unknown }).name : item))
      .filter((item) => item != null && item !== '')
      .map((item) => String(item));
  };

  const researchObject: ResearchObjectV1 = {
    version: 1,
    title: frontmatter.title || '',
    description: frontmatter.description || '',
    components: [],
    authors: this.parseAuthors(frontmatter.authors || []),
    keywords: toStringList(frontmatter.keywords),
    researchFields: toStringList(frontmatter.tags),
    defaultLicense: this.parseLicense(frontmatter.license),
  };

  // Only attach a component when there is body text after the frontmatter.
  if (content) {
    // NOTE(review): the Markdown body is stored with component type CODE —
    // confirm this is the intended type for prose content.
    const component: ResearchObjectV1Component = {
      id: 'content',
      name: 'Main Content',
      type: ResearchObjectComponentType.CODE,
      payload: {
        path: 'content.md',
        title: researchObject.title,
        description: researchObject.description,
        content: content,
        cid: '', // This would be populated when the content is stored
      },
    };
    researchObject.components.push(component);
  }

  return researchObject;
}
|
||
/**
 * Export a ResearchObject to MyST Markdown.
 *
 * The manifest's title, description, authors, keywords, research fields
 * (emitted as `tags`) and default license are rendered into a frontmatter
 * header by `generateFrontmatter`. The `content` of the component whose id is
 * `content`, if there is one, is appended after the header.
 *
 * @param input ResearchObject
 * @returns MyST Markdown string
 * @throws Error if `input` is falsy or not an object
 */
exportObject(input: ResearchObject): string {
  if (!input || typeof input !== 'object') {
    throw new Error('MystTransformer.exportObject expects a ResearchObject input');
  }

  const manifest = input as ResearchObjectV1;

  // Missing fields fall back to empty values so the header generator can skip them.
  const header = this.generateFrontmatter({
    title: manifest.title || '',
    description: manifest.description || '',
    authors: manifest.authors || [],
    keywords: manifest.keywords || [],
    tags: manifest.researchFields || [],
    license: manifest.defaultLicense || '',
  });

  const components = manifest.components;
  if (!components || components.length === 0) {
    return header + '';
  }

  // The body lives in the component that importObject stores under id 'content'.
  const main = components.find((c) => c.id === 'content');
  const body = main && main.payload ? main.payload.content || '' : '';

  return header + body;
}
|
||
/** | ||
* Extract frontmatter and content from MyST Markdown | ||
* | ||
* @param input MyST Markdown string | ||
* @returns Object containing frontmatter and content | ||
*/ | ||
private extractFrontmatter(input: string): { frontmatter: any; content: string } { | ||
const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n([\s\S]*)$/; | ||
const match = input.match(frontmatterRegex); | ||
|
||
if (!match) { | ||
return { frontmatter: {}, content: input }; | ||
} | ||
|
||
const frontmatterYaml = match[1]; | ||
const content = match[2]; | ||
|
||
// Parse YAML frontmatter | ||
const frontmatter: any = {}; | ||
let currentKey = ''; | ||
let currentList: any[] = []; | ||
let currentListItem: any = {}; | ||
let inList = false; | ||
let listIndent = 0; | ||
let inNestedList = false; | ||
let nestedListIndent = 0; | ||
|
||
const lines = frontmatterYaml.split('\n'); | ||
for (const line of lines) { | ||
// Skip empty lines | ||
if (!line.trim()) continue; | ||
|
||
const indent = line.search(/\S/); | ||
const trimmedLine = line.trim(); | ||
|
||
// Check if we're starting a new list item | ||
if (trimmedLine.startsWith('-')) { | ||
if (indent > listIndent && inList) { | ||
// This is a nested list item | ||
if (!inNestedList) { | ||
inNestedList = true; | ||
nestedListIndent = indent; | ||
if (!currentListItem.organizations) { | ||
currentListItem.organizations = []; | ||
} | ||
} | ||
const nestedItemContent = trimmedLine.slice(1).trim(); | ||
if (nestedItemContent.includes(':')) { | ||
const [key, value] = this.splitKeyValue(nestedItemContent); | ||
currentListItem.organizations.push({ | ||
id: this.generateId(), | ||
name: value, | ||
}); | ||
} else { | ||
currentListItem.organizations.push({ | ||
id: this.generateId(), | ||
name: nestedItemContent, | ||
}); | ||
} | ||
continue; | ||
} | ||
|
||
// If we're not in a list yet, start a new one | ||
if (!inList) { | ||
inList = true; | ||
currentList = []; | ||
listIndent = indent; | ||
} else if (indent === listIndent) { | ||
// Save previous list item if it exists | ||
if (Object.keys(currentListItem).length > 0) { | ||
currentList.push({ ...currentListItem }); | ||
currentListItem = {}; | ||
} | ||
inNestedList = false; | ||
} | ||
|
||
// Parse the list item | ||
const itemContent = trimmedLine.slice(1).trim(); | ||
if (itemContent.includes(':')) { | ||
const [key, value] = this.splitKeyValue(itemContent); | ||
currentListItem[key] = value; | ||
} else { | ||
currentListItem = { name: itemContent }; | ||
} | ||
continue; | ||
} | ||
|
||
// Handle nested properties in list items | ||
if (inList && indent > listIndent && !inNestedList) { | ||
const [key, value] = this.splitKeyValue(trimmedLine); | ||
if (key && value) { | ||
currentListItem[key] = value; | ||
} | ||
continue; | ||
} | ||
|
||
// If we're in a list but this line isn't indented enough, end the list | ||
if (inList && indent <= listIndent) { | ||
// Save the last list item if it exists | ||
if (Object.keys(currentListItem).length > 0) { | ||
currentList.push({ ...currentListItem }); | ||
} | ||
frontmatter[currentKey] = [...currentList]; | ||
inList = false; | ||
inNestedList = false; | ||
currentList = []; | ||
currentListItem = {}; | ||
} | ||
|
||
// Parse key-value pairs | ||
const keyValueMatch = trimmedLine.match(/^([^:]+):\s*(.*)$/); | ||
if (keyValueMatch) { | ||
const key = keyValueMatch[1].trim(); | ||
const value = keyValueMatch[2].trim(); | ||
|
||
// Handle arrays in square brackets | ||
if (value.startsWith('[') && value.endsWith(']')) { | ||
frontmatter[key] = value | ||
.slice(1, -1) | ||
.split(',') | ||
.map((item) => item.trim()); | ||
} else { | ||
frontmatter[key] = value; | ||
currentKey = key; | ||
} | ||
} | ||
} | ||
|
||
Comment on lines
+124
to
+233
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion: Hand-rolled YAML parser is fragile – use a YAML library (e.g. `yaml`). This 100-line loop attempts to replicate YAML parsing (lists, nested lists, scalars, URLs, etc.).
Maintaining this logic will be error-prone and slows the project. Suggested change:
-import * as yaml from 'yaml'; // at top
-// ...
-const parsed = yaml.parse(frontmatterYaml) ?? {};
-return { frontmatter: parsed, content };
+import * as yaml from 'yaml';
+// ...
+let parsed: any = {};
+try {
+ parsed = yaml.parse(frontmatterYaml) ?? {};
+} catch (err) {
+ throw new Error(`Failed to parse MyST front-matter: ${(err as Error).message}`);
+}
+return { frontmatter: parsed, content };
|
||
// Save any remaining list items | ||
if (inList && Object.keys(currentListItem).length > 0) { | ||
currentList.push({ ...currentListItem }); | ||
frontmatter[currentKey] = [...currentList]; | ||
} | ||
|
||
return { frontmatter, content }; | ||
} | ||
|
||
/**
 * Split a `key: value` line into its key and value at the FIRST colon.
 *
 * Because only the first colon is used as the separator, values that contain
 * further colons (for example `url: https://example.org`) are returned intact
 * without any special handling.
 *
 * @param line A single line of the form `key: value`
 * @returns `[key, value]`, both trimmed; `['', line]` (line untouched) when the
 *          line contains no colon
 */
private splitKeyValue(line: string): [string, string] {
  const colonIndex = line.indexOf(':');
  if (colonIndex === -1) {
    return ['', line];
  }

  const key = line.slice(0, colonIndex).trim();
  const value = line.slice(colonIndex + 1).trim();
  return [key, value];
}
|
||
/**
 * Generate a pseudo-random identifier for an organization entry.
 *
 * The result is `org-` followed by up to 13 base-36 characters derived from
 * `Math.random()`. It is not a UUID, and nothing here guarantees uniqueness.
 *
 * @returns An `org-`-prefixed random string
 */
private generateId(): string {
  const suffix = Math.random().toString(36).slice(2, 15);
  return 'org-' + suffix;
}
|
||
/**
 * Convert the frontmatter `authors` entry into ResearchObjectV1Author records.
 *
 * Each entry may be a plain string (used as the name) or an object with
 * `name`, `role`, `orcid` and `organizations` / `affiliations`. The role
 * defaults to `'Author'`. A bare ORCID iD is expanded to a full
 * `https://orcid.org/...` URL. Every organization receives a freshly
 * generated id.
 *
 * @param authors Authors from frontmatter
 * @returns ResearchObjectV1Author[]; empty when `authors` is not an array
 */
private parseAuthors(authors: any[]): ResearchObjectV1Author[] {
  if (!Array.isArray(authors)) {
    return [];
  }

  return authors.map((author) => {
    if (typeof author === 'string') {
      return { name: author, role: 'Author' };
    }

    // A null/undefined list entry is treated as an author with no details
    // instead of crashing on property access.
    const entry = author || {};

    const parsedAuthor: ResearchObjectV1Author = {
      name: entry.name || '',
      role: entry.role || 'Author',
    };

    if (entry.orcid) {
      // Coerce first: a YAML parser may hand back a non-string scalar.
      const orcid = String(entry.orcid);
      parsedAuthor.orcid = orcid.startsWith('http') ? orcid : `https://orcid.org/${orcid}`;
    }

    // Handle both organizations and affiliations fields. A single scalar
    // (`affiliations: MIT`) arrives as a string, so wrap it; calling `.map`
    // on a string would throw.
    const rawOrgs = entry.organizations || entry.affiliations || [];
    const orgs: any[] = Array.isArray(rawOrgs) ? rawOrgs : [rawOrgs];
    if (orgs.length > 0) {
      parsedAuthor.organizations = orgs.map((org: any) => ({
        id: this.generateId(),
        name: typeof org === 'string' ? org : (org && org.name) || '',
      }));
    }

    return parsedAuthor;
  });
}
|
||
/**
 * Reduce the frontmatter `license` entry to a single license string.
 *
 * Accepts either a plain string or the object form with `content` and/or
 * `code` entries. `content` is preferred; `code` is used when no content
 * license is given. Non-string values are ignored so the declared `string`
 * return type always holds.
 *
 * @param license License from frontmatter
 * @returns License string, or `''` when none can be determined
 */
private parseLicense(license: any): string {
  if (!license) {
    return '';
  }

  if (typeof license === 'string') {
    return license;
  }

  if (typeof license === 'object') {
    for (const candidate of [license.content, license.code]) {
      if (typeof candidate === 'string' && candidate) {
        return candidate;
      }
    }
  }

  return '';
}
|
||
/** | ||
* Generate frontmatter for MyST Markdown | ||
* | ||
* @param data Data to include in frontmatter | ||
* @returns Frontmatter string | ||
*/ | ||
private generateFrontmatter(data: { | ||
title: string; | ||
description: string; | ||
authors: ResearchObjectV1Author[]; | ||
keywords: string[]; | ||
tags: string[]; | ||
license: string; | ||
}): string { | ||
const { title, description, authors, keywords, tags, license } = data; | ||
|
||
let frontmatter = '---\n'; | ||
if (title) frontmatter += `title: ${title}\n`; | ||
if (description) frontmatter += `description: ${description}\n`; | ||
if (license) frontmatter += `license: ${license}\n`; | ||
|
||
if (keywords && keywords.length > 0) { | ||
frontmatter += `keywords: [${keywords.join(', ')}]\n`; | ||
} | ||
|
||
if (tags && tags.length > 0) { | ||
frontmatter += `tags: [${tags.join(', ')}]\n`; | ||
} | ||
|
||
if (authors && authors.length > 0) { | ||
frontmatter += 'authors:\n'; | ||
for (const author of authors) { | ||
frontmatter += ` - name: ${author.name}\n`; | ||
if (author.orcid) { | ||
frontmatter += ` orcid: ${author.orcid}\n`; | ||
} | ||
if (author.role) { | ||
frontmatter += ` role: ${author.role}\n`; | ||
} | ||
if (author.organizations && author.organizations.length > 0) { | ||
frontmatter += ' affiliations:\n'; | ||
for (const org of author.organizations) { | ||
frontmatter += ` - ${org.name}\n`; | ||
} | ||
} | ||
} | ||
} | ||
|
||
frontmatter += '---\n\n'; | ||
return frontmatter; | ||
Comment on lines
+350
to
+383
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion: Front-matter generation doesn’t escape YAML & loses rich metadata. Direct string concatenation will break when titles/descriptions contain characters that are significant in YAML (for example `:`, `#`, quotes, or newlines),
and drops optional fields that the manual serializer does not emit. Recommend serialising an object and letting the YAML library handle escaping:
-let frontmatter = '---\n';
-/* manual concat */
-frontmatter += '---\n\n';
-return frontmatter;
+import * as yaml from 'yaml';
+const fmObj: any = { title, description, license };
+if (keywords.length) fmObj.keywords = keywords;
+if (tags.length) fmObj.tags = tags;
+if (authors.length) fmObj.authors = authors.map(/* transform to yaml-safe */);
+return `---\n${yaml.stringify(fmObj)}---\n\n`;
|
||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
export * from './BaseTransformer'; | ||
export * from './RdfTransformer'; | ||
export * from './RoCrateTransformer'; | ||
export * from './MystTransformer'; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Regex fails on Windows line-endings or missing trailing newline
In `/^---\s*\n([\s\S]*?)\n---\s*\n([\s\S]*)$/`, `\n` only matches LF, not CRLF, and a closing `---` is valid at EOF without a following newline. Consider a more tolerant expression, for example `/^---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)([\s\S]*)$/`.
Or better, delegate parsing to a YAML/front-matter library to avoid hand-rolled edge cases.
📝 Committable suggestion