/**
 * A value produced by the frontmatter parser.
 *
 * Scalars are always kept as strings (no boolean/number coercion), sequences
 * become arrays and mappings become plain objects.
 */
type FrontmatterValue = string | FrontmatterValue[] | FrontmatterMapping;

/** A parsed YAML mapping (`key: value` pairs). */
interface FrontmatterMapping {
  [key: string]: FrontmatterValue;
}

/** One non-blank, non-comment line of the frontmatter block. */
interface FrontmatterLine {
  /** Column of the first non-whitespace character. */
  indent: number;
  /** The line with surrounding whitespace removed. */
  text: string;
}

/** Mutable read position shared by the recursive parse helpers. */
interface LineCursor {
  index: number;
}

/**
 * Matches a document that starts with a `---` fenced frontmatter block.
 *
 * Group 1 is the YAML between the fences (undefined for an empty block).
 * Group 2 is the Markdown after the closing fence with leading blank lines
 * removed (undefined when nothing follows the fence).
 */
const FRONTMATTER_PATTERN = /^---[ \t]*\r?\n(?:([\s\S]*?)\r?\n)??---\s*(?:\n([\s\S]*))?$/;

/**
 * Transformer for MyST Markdown format
 *
 * MyST Markdown is an extension of CommonMark with additional features for scientific and technical documentation.
 * It includes frontmatter, directives, roles, and more.
 *
 * Only the frontmatter is interpreted; the Markdown body is carried through verbatim as a single component.
 * The frontmatter is read with a small built-in parser that understands the YAML subset MyST files normally use:
 * nested mappings, block sequences, inline `[a, b]` sequences and single/double quoted scalars.
 * Block scalars (`|`, `>`), anchors and multi-document streams are not supported, and every scalar stays a string.
 */
export class MystTransformer implements BaseTransformer {
  /**
   * Import a MyST Markdown string into a ResearchObject
   *
   * @param input MyST Markdown string
   * @returns ResearchObject whose metadata comes from the frontmatter and whose only component holds the body
   * @throws Error when `input` is not a string
   */
  importObject(input: string): ResearchObject {
    if (typeof input !== 'string') {
      throw new Error('MystTransformer.importObject expects a string input');
    }

    const { frontmatter, content } = this.extractFrontmatter(input);

    const researchObject: ResearchObjectV1 = {
      version: 1,
      title: this.asString(frontmatter.title),
      description: this.asString(frontmatter.description),
      components: [],
      authors: this.parseAuthors(frontmatter.authors),
      keywords: this.asStringList(frontmatter.keywords),
      researchFields: this.asStringList(frontmatter.tags),
      defaultLicense: this.parseLicense(frontmatter.license),
    };

    // The Markdown body (if any) becomes the single "content" component.
    if (content) {
      const component: ResearchObjectV1Component = {
        id: 'content',
        name: 'Main Content',
        type: ResearchObjectComponentType.CODE,
        payload: {
          path: 'content.md',
          title: researchObject.title,
          description: researchObject.description,
          content: content,
          cid: '', // This would be populated when the content is stored
        },
      };
      researchObject.components.push(component);
    }

    return researchObject;
  }

  /**
   * Export a ResearchObject to MyST Markdown
   *
   * The body is taken from the component whose id is `content`; other components are not exported.
   *
   * @param input ResearchObject
   * @returns MyST Markdown string (frontmatter followed by the body)
   * @throws Error when `input` is not an object
   */
  exportObject(input: ResearchObject): string {
    if (!input || typeof input !== 'object') {
      throw new Error('MystTransformer.exportObject expects a ResearchObject input');
    }

    const researchObject = input as ResearchObjectV1;

    const frontmatter = this.generateFrontmatter({
      title: researchObject.title || '',
      description: researchObject.description || '',
      authors: researchObject.authors || [],
      keywords: researchObject.keywords || [],
      tags: researchObject.researchFields || [],
      license: researchObject.defaultLicense || '',
    });

    const mainComponent = (researchObject.components || []).find((c) => c.id === 'content');
    const content = mainComponent && mainComponent.payload ? mainComponent.payload.content || '' : '';

    return frontmatter + content;
  }

  /**
   * Extract frontmatter and content from MyST Markdown
   *
   * @param input MyST Markdown string
   * @returns The parsed frontmatter mapping (empty when the document has no frontmatter fence) and the body.
   *          Blank lines between the closing fence and the body are dropped.
   */
  private extractFrontmatter(input: string): { frontmatter: FrontmatterMapping; content: string } {
    const match = FRONTMATTER_PATTERN.exec(input);
    if (match === null) {
      return { frontmatter: {}, content: input };
    }

    const yaml = match[1] === undefined ? '' : match[1];
    const content = match[2] === undefined ? '' : match[2];
    return { frontmatter: this.parseYamlMapping(yaml), content };
  }

  /**
   * Parse the text between the frontmatter fences into a mapping.
   *
   * Blank lines and `#` comment lines are ignored; structure is derived from indentation.
   */
  private parseYamlMapping(yaml: string): FrontmatterMapping {
    const lines: FrontmatterLine[] = [];
    for (const raw of yaml.split('\n')) {
      const text = raw.trim();
      if (text === '' || text.startsWith('#')) continue;
      lines.push({ indent: raw.search(/\S/), text });
    }
    // A floor of -1 makes every line (indent >= 0) part of the top-level mapping.
    return this.parseMapping(lines, { index: 0 }, -1);
  }

  /**
   * Parse consecutive `key: value` lines that are indented deeper than `floor`.
   *
   * A key with an empty value adopts the following deeper-indented block (or a sequence written at the key's own
   * indentation) as its value. Lines that are not mapping entries are skipped so malformed input cannot stall
   * the parser.
   */
  private parseMapping(lines: FrontmatterLine[], cursor: LineCursor, floor: number): FrontmatterMapping {
    // Null prototype: keys such as "__proto__" coming from a file are stored as ordinary entries.
    const mapping: FrontmatterMapping = Object.create(null);

    for (;;) {
      const line = lines[cursor.index];
      if (line === undefined || line.indent <= floor) break;
      cursor.index++;

      const entry = this.isSequenceItem(line.text) ? undefined : this.splitEntry(line.text, true);
      if (entry === undefined) continue;

      const [key, rawValue] = entry;
      if (rawValue !== '') {
        mapping[key] = this.parseScalar(rawValue);
        continue;
      }

      const next = lines[cursor.index];
      if (next !== undefined && next.indent >= line.indent && this.isSequenceItem(next.text)) {
        mapping[key] = this.parseSequence(lines, cursor, next.indent);
      } else if (next !== undefined && next.indent > line.indent) {
        mapping[key] = this.parseMapping(lines, cursor, line.indent);
      } else {
        mapping[key] = '';
      }
    }

    return mapping;
  }

  /**
   * Parse consecutive `- item` lines written at exactly `indent`.
   *
   * Each item is a scalar, a mapping (`- key: value` plus the deeper lines that follow) or, for a bare `-`,
   * whatever block follows it.
   */
  private parseSequence(lines: FrontmatterLine[], cursor: LineCursor, indent: number): FrontmatterValue[] {
    const items: FrontmatterValue[] = [];

    for (;;) {
      const line = lines[cursor.index];
      if (line === undefined) break;
      if (line.indent > indent) {
        // Deeper lines still present here were not claimed by the previous item; drop them.
        cursor.index++;
        continue;
      }
      if (line.indent !== indent || !this.isSequenceItem(line.text)) break;

      const rest = line.text.slice(1).trim();
      if (rest === '') {
        cursor.index++;
        const next = lines[cursor.index];
        if (next !== undefined && next.indent > indent) {
          items.push(
            this.isSequenceItem(next.text)
              ? this.parseSequence(lines, cursor, next.indent)
              : this.parseMapping(lines, cursor, indent),
          );
        } else {
          items.push('');
        }
      } else if (this.splitEntry(rest, false) !== undefined) {
        // Re-read "- key: value" as the first line of a mapping that starts at the key's column.
        lines[cursor.index] = { indent: indent + (line.text.length - rest.length), text: rest };
        items.push(this.parseMapping(lines, cursor, indent));
      } else {
        cursor.index++;
        items.push(this.parseScalar(rest));
      }
    }

    return items;
  }

  /** True when the (trimmed) line is a block sequence entry: `-` alone or `-` followed by whitespace. */
  private isSequenceItem(text: string): boolean {
    return /^-(?:\s|$)/.test(text);
  }

  /**
   * Split a `key: value` line.
   *
   * The separator is the first colon that is followed by whitespace or ends the line, so URLs in keys or values
   * are not cut in half.
   *
   * @param text Trimmed line content
   * @param allowTightColon Fall back to the first colon of any kind (accepts `key:value`)
   * @returns `[key, rawValue]`, or undefined when the line is not a mapping entry
   */
  private splitEntry(text: string, allowTightColon: boolean): [string, string] | undefined {
    const quotedLength = this.quotedPrefixLength(text);
    if (quotedLength > 0) {
      // A line that is one quoted scalar is a value, not an entry, even if it contains ": ".
      if (quotedLength === text.length) return undefined;
      const afterKey = text.slice(quotedLength);
      const separator = /^\s*:(?:\s+|$)/.exec(afterKey);
      if (separator === null) return undefined;
      return [this.unquote(text.slice(0, quotedLength)), afterKey.slice(separator[0].length).trim()];
    }

    let colonIndex = text.search(/:(?:\s|$)/);
    if (colonIndex === -1 && allowTightColon) {
      colonIndex = text.indexOf(':');
    }
    if (colonIndex <= 0) return undefined;

    return [text.slice(0, colonIndex).trim(), text.slice(colonIndex + 1).trim()];
  }

  /** Interpret a raw value: `[a, b]` becomes a list of strings, anything else a (possibly unquoted) string. */
  private parseScalar(raw: string): FrontmatterValue {
    if (raw.startsWith('[') && raw.endsWith(']')) {
      return this.splitFlowItems(raw.slice(1, -1)).map((item) => this.unquote(item));
    }
    return this.unquote(raw);
  }

  /** Split the inside of an inline `[...]` sequence on commas that are not inside quotes; empty items are dropped. */
  private splitFlowItems(inner: string): string[] {
    const items: string[] = [];
    let current = '';
    let quote = '';

    for (let i = 0; i < inner.length; i++) {
      const ch = inner.charAt(i);
      if (quote !== '') {
        current += ch;
        if (quote === '"' && ch === '\\') {
          // Keep the escaped character so it cannot be mistaken for the closing quote.
          i++;
          current += inner.charAt(i);
        } else if (quote === "'" && ch === "'" && inner.charAt(i + 1) === "'") {
          i++;
          current += "'";
        } else if (ch === quote) {
          quote = '';
        }
      } else if (ch === ',') {
        items.push(current);
        current = '';
      } else {
        // A quote only opens a quoted scalar at the start of an item ("don't" stays plain).
        if ((ch === '"' || ch === "'") && current.trim() === '') {
          quote = ch;
        }
        current += ch;
      }
    }
    items.push(current);

    return items.map((item) => item.trim()).filter((item) => item !== '');
  }

  /**
   * Length of the quoted scalar (quotes included) at the start of `text`,
   * or 0 when `text` does not start with a terminated quoted scalar.
   */
  private quotedPrefixLength(text: string): number {
    const quote = text.charAt(0);
    if (quote !== '"' && quote !== "'") return 0;

    for (let i = 1; i < text.length; i++) {
      const ch = text.charAt(i);
      if (quote === '"' && ch === '\\') {
        i++;
      } else if (ch === quote) {
        if (quote === "'" && text.charAt(i + 1) === "'") {
          i++;
        } else {
          return i + 1;
        }
      }
    }
    return 0;
  }

  /** Strip the quotes from a fully quoted scalar and resolve its escapes; any other text is returned unchanged. */
  private unquote(text: string): string {
    const quotedLength = this.quotedPrefixLength(text);
    if (quotedLength === 0 || quotedLength !== text.length) return text;

    const inner = text.slice(1, -1);
    if (text.charAt(0) === "'") {
      return inner.replace(/''/g, "'");
    }
    return inner.replace(/\\(.)/g, (whole: string, ch: string) => {
      switch (ch) {
        case 'n':
          return '\n';
        case 'r':
          return '\r';
        case 't':
          return '\t';
        case '"':
        case '\\':
          return ch;
        default:
          // Unknown escape: keep it verbatim instead of failing the whole import.
          return whole;
      }
    });
  }

  /**
   * Render a string as a YAML scalar, double-quoting it only when a plain scalar would be misread.
   *
   * @param value Text to emit
   * @param inFlowSequence True when the scalar is an item of an inline `[...]` list, where commas and brackets
   *        are structural
   */
  private formatScalar(value: string, inFlowSequence = false): string {
    const needsQuotes =
      value === '' ||
      value !== value.trim() ||
      /[\r\n\t]/.test(value) ||
      /:(?:\s|$)/.test(value) ||
      /\s#/.test(value) ||
      /^[-?:](?:\s|$)/.test(value) ||
      /^[[\]{},#&*!|>'"%@`]/.test(value) ||
      (inFlowSequence && /[,[\]{}]/.test(value));

    if (!needsQuotes) return value;

    const escaped = value
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r')
      .replace(/\t/g, '\\t');
    return `"${escaped}"`;
  }

  /** Return `value` when it is a string, otherwise an empty string. */
  private asString(value: unknown): string {
    return typeof value === 'string' ? value : '';
  }

  /** Normalise a scalar or a list into a list of non-empty strings; non-string items are dropped. */
  private asStringList(value: unknown): string[] {
    const items: unknown[] = Array.isArray(value) ? value : [value];
    return items.filter((item): item is string => typeof item === 'string' && item !== '');
  }

  /** True when `value` is a parsed mapping (an object that is not an array). */
  private isMapping(value: unknown): value is FrontmatterMapping {
    return typeof value === 'object' && value !== null && !Array.isArray(value);
  }

  /**
   * Generate an identifier for an organization.
   *
   * @returns `org-` followed by a short base-36 string from `Math.random()`. This is not a UUID and is not
   *          guaranteed to be unique.
   */
  private generateId(): string {
    return 'org-' + Math.random().toString(36).substring(2, 15);
  }

  /**
   * Parse authors from frontmatter
   *
   * Accepts a list whose entries are names or mappings, or a single name. The role defaults to `Author`, a bare
   * ORCID iD is expanded to its `https://orcid.org/` URL, and organizations are read from `organizations` or,
   * when that is absent or empty, from `affiliations`.
   *
   * @param authors Authors value from frontmatter
   * @returns ResearchObjectV1Author[]
   */
  private parseAuthors(authors: unknown): ResearchObjectV1Author[] {
    const entries: unknown[] = Array.isArray(authors) ? authors : typeof authors === 'string' ? [authors] : [];
    const parsedAuthors: ResearchObjectV1Author[] = [];

    for (const entry of entries) {
      if (typeof entry === 'string') {
        if (entry !== '') {
          parsedAuthors.push({ name: entry, role: 'Author' });
        }
        continue;
      }
      if (!this.isMapping(entry)) continue;

      const parsedAuthor: ResearchObjectV1Author = {
        name: this.asString(entry.name),
        role: this.asString(entry.role) || 'Author',
      };

      const orcid = this.asString(entry.orcid);
      if (orcid) {
        parsedAuthor.orcid = orcid.startsWith('http') ? orcid : `https://orcid.org/${orcid}`;
      }

      let organizationNames = this.parseOrganizationNames(entry.organizations);
      if (organizationNames.length === 0) {
        organizationNames = this.parseOrganizationNames(entry.affiliations);
      }
      if (organizationNames.length > 0) {
        parsedAuthor.organizations = organizationNames.map((name) => ({ id: this.generateId(), name }));
      }

      parsedAuthors.push(parsedAuthor);
    }

    return parsedAuthors;
  }

  /**
   * Collect organization names from an `organizations`/`affiliations` value.
   *
   * The value may be a single name, a list of names, or a list of mappings with a `name` key; unnamed entries
   * are dropped.
   */
  private parseOrganizationNames(value: unknown): string[] {
    const items: unknown[] = Array.isArray(value) ? value : [value];
    return items
      .map((item) => (this.isMapping(item) ? this.asString(item.name) : this.asString(item)))
      .filter((name) => name !== '');
  }

  /**
   * Parse license from frontmatter
   *
   * @param license License from frontmatter: either a string or a mapping with a `content` key
   * @returns License string, or an empty string when none is given
   */
  private parseLicense(license: unknown): string {
    if (typeof license === 'string') {
      return license;
    }
    if (this.isMapping(license)) {
      return this.asString(license.content);
    }
    return '';
  }

  /**
   * Generate frontmatter for MyST Markdown
   *
   * Empty fields are omitted. Scalars are quoted when needed so the result stays valid YAML.
   *
   * @param data Data to include in frontmatter
   * @returns Frontmatter string, including both fences and a trailing blank line
   */
  private generateFrontmatter(data: {
    title: string;
    description: string;
    authors: ResearchObjectV1Author[];
    keywords: string[];
    tags: string[];
    license: string;
  }): string {
    const { title, description, authors, keywords, tags, license } = data;
    const inlineList = (items: string[]): string =>
      `[${items.map((item) => this.formatScalar(String(item), true)).join(', ')}]`;

    const lines: string[] = ['---'];
    if (title) lines.push(`title: ${this.formatScalar(title)}`);
    if (description) lines.push(`description: ${this.formatScalar(description)}`);
    if (license) lines.push(`license: ${this.formatScalar(license)}`);

    if (keywords && keywords.length > 0) {
      lines.push(`keywords: ${inlineList(keywords)}`);
    }

    if (tags && tags.length > 0) {
      lines.push(`tags: ${inlineList(tags)}`);
    }

    if (authors && authors.length > 0) {
      lines.push('authors:');
      for (const author of authors) {
        // Properties of an author are aligned with "name" (4 columns); affiliations nest one level deeper.
        lines.push(`  - name: ${this.formatScalar(author.name)}`);
        if (author.orcid) {
          lines.push(`    orcid: ${this.formatScalar(author.orcid)}`);
        }
        if (author.role) {
          lines.push(`    role: ${this.formatScalar(String(author.role))}`);
        }
        if (author.organizations && author.organizations.length > 0) {
          lines.push('    affiliations:');
          for (const org of author.organizations) {
            lines.push(`      - ${this.formatScalar(org.name)}`);
          }
        }
      }
    }

    // Closing fence followed by one blank line before the body.
    lines.push('---', '', '');
    return lines.join('\n');
  }
}
const checkers = createCheckers(ResearchObjectTi);
const transformer = new MystTransformer();

/**
 * Asserts that `researchObject` holds exactly one component and that it is the
 * Markdown body stored by `importObject` (id `content`, path `content.md`).
 */
function expectMainContentComponent(researchObject: ResearchObjectV1): void {
  expect(researchObject.components).to.have.lengthOf(1);
  expect(researchObject.components[0].id).to.equal('content');
  expect(researchObject.components[0].name).to.equal('Main Content');
  expect(researchObject.components[0].type).to.equal('code');
  expect(researchObject.components[0].payload.path).to.equal('content.md');
}

// NOTE: the YAML fixtures below rely on indentation for nesting (2 columns per
// level); the expected substrings in the export tests use the same widths.
describe('MystTransformer', () => {
  it('should import a MyST Markdown string into a ResearchObject', () => {
    const mystMarkdown = `---
title: Test Research Paper
description: A test research paper for testing the MystTransformer
authors:
  - name: John Doe
    orcid: https://orcid.org/0000-0001-2345-6789
    role: Author
  - name: Jane Smith
    role: Author
    organizations:
      - name: University of Example
keywords: [research, test, myst]
tags: [science, technology]
license: https://creativecommons.org/licenses/by/4.0/
---

# Test Research Paper

This is a test research paper written in MyST Markdown format.
`;

    const researchObject = transformer.importObject(mystMarkdown) as ResearchObjectV1;

    // Validate the output as a ResearchObject
    checkers.ResearchObjectV1.check(researchObject);

    // Check specific fields
    expect(researchObject.title).to.equal('Test Research Paper');
    expect(researchObject.description).to.equal('A test research paper for testing the MystTransformer');
    expect(researchObject.defaultLicense).to.equal('https://creativecommons.org/licenses/by/4.0/');
    expect(researchObject.keywords).to.deep.equal(['research', 'test', 'myst']);
    expect(researchObject.researchFields).to.deep.equal(['science', 'technology']);

    // Check authors
    expect(researchObject.authors).to.have.lengthOf(2);
    expect(researchObject.authors![0].name).to.equal('John Doe');
    expect(researchObject.authors![0].orcid).to.equal('https://orcid.org/0000-0001-2345-6789');
    expect(researchObject.authors![0].role).to.equal('Author');
    expect(researchObject.authors![1].name).to.equal('Jane Smith');
    expect(researchObject.authors![1].role).to.equal('Author');
    expect(researchObject.authors![1].organizations).to.have.lengthOf(1);
    expect(researchObject.authors![1].organizations![0].name).to.equal('University of Example');

    // Check content component
    expectMainContentComponent(researchObject);
    expect(researchObject.components[0].payload.title).to.equal('Test Research Paper');
    expect(researchObject.components[0].payload.description).to.equal(
      'A test research paper for testing the MystTransformer',
    );
  });

  it('should export a ResearchObject to MyST Markdown', () => {
    const researchObject = exampleNode as ResearchObjectV1;

    const mystMarkdown = transformer.exportObject(researchObject);

    // Check that the output is a string
    expect(mystMarkdown).to.be.a('string');

    // Check that it contains the expected frontmatter
    expect(mystMarkdown).to.include('---');
    expect(mystMarkdown).to.include(`title: ${researchObject.title}`);
    expect(mystMarkdown).to.include(`license: ${researchObject.defaultLicense}`);

    // Check authors
    if (researchObject.authors && researchObject.authors.length > 0) {
      expect(mystMarkdown).to.include('authors:');
      expect(mystMarkdown).to.include(`  - name: ${researchObject.authors[0].name}`);
      expect(mystMarkdown).to.include(`    orcid: ${researchObject.authors[0].orcid}`);
      expect(mystMarkdown).to.include(`    role: ${researchObject.authors[0].role}`);
    }

    // Check keywords and tags
    if (researchObject.keywords && researchObject.keywords.length > 0) {
      expect(mystMarkdown).to.include(`keywords: [${researchObject.keywords.join(', ')}]`);
    }

    if (researchObject.researchFields && researchObject.researchFields.length > 0) {
      expect(mystMarkdown).to.include(`tags: [${researchObject.researchFields.join(', ')}]`);
    }
  });

  it('should handle a ResearchObject with multiple authors', () => {
    const researchObject = exampleNodeWithAuthors as ResearchObjectV1;

    const mystMarkdown = transformer.exportObject(researchObject);

    // Check that the output is a string
    expect(mystMarkdown).to.be.a('string');

    // Check that it contains the expected frontmatter
    expect(mystMarkdown).to.include('---');
    expect(mystMarkdown).to.include(`title: ${researchObject.title}`);

    // Check authors
    if (researchObject.authors && researchObject.authors.length > 0) {
      expect(mystMarkdown).to.include('authors:');

      // Check that all authors are included
      for (const author of researchObject.authors) {
        expect(mystMarkdown).to.include(`- name: ${author.name}`);
        expect(mystMarkdown).to.include(`role: ${author.role}`);

        if (author.orcid) {
          expect(mystMarkdown).to.include(`orcid: ${author.orcid}`);
        }

        if (author.organizations && author.organizations.length > 0) {
          expect(mystMarkdown).to.include('affiliations:');
          for (const org of author.organizations) {
            expect(mystMarkdown).to.include(`- ${org.name}`);
          }
        }
      }
    }
  });

  it('should handle MyST Markdown without frontmatter', () => {
    const mystMarkdown = `# Test Research Paper

This is a test research paper written in MyST Markdown format without frontmatter.
`;

    const researchObject = transformer.importObject(mystMarkdown) as ResearchObjectV1;

    // Validate the output as a ResearchObject
    checkers.ResearchObjectV1.check(researchObject);

    // Check that default values are set
    expect(researchObject.title).to.equal('');
    expect(researchObject.description).to.equal('');
    expect(researchObject.authors).to.be.an('array').that.is.empty;
    expect(researchObject.keywords).to.be.an('array').that.is.empty;
    expect(researchObject.researchFields).to.be.an('array').that.is.empty;

    // Check content component
    expectMainContentComponent(researchObject);
  });

  it('should handle complex MyST frontmatter with nested fields', () => {
    const mystMarkdown = `---
title: Using MyST Frontmatter
subtitle: In JupyterLab
license: CC-BY-4.0
github: https://github.com/executablebooks/mystmd
subject: Tutorial
venue: MyST Markdown
biblio:
  volume: '1'
  issue: '42'
authors:
  - name: Rowan Cockett
    email: rowan@curvenote.com
    corresponding: true
    orcid: 0000-0002-7859-8394
    affiliations:
      - Curvenote
      - ExecutableBooks
date: 2023/07/05
math:
  '\\dobs': '\\mathbf{d}_\\text{obs}'
  '\\dpred': '\\mathbf{d}_\\text{pred}\\left( #1 \\right)'
  '\\mref': '\\mathbf{m}_\\text{ref}'
abbreviations:
  MyST: Markedly Structured Text
  TLA: Three Letter Acronym
---

:::{important} Objective

The goal of this quickstart is to get you up and running with MyST Markdown **Frontmatter**.

For a full guide on frontmatter see the [MyST Markdown Guide](https://mystmd.org/guide/frontmatter).
:::`;

    const researchObject = transformer.importObject(mystMarkdown) as ResearchObjectV1;

    // Validate the output as a ResearchObject
    checkers.ResearchObjectV1.check(researchObject);

    // Check specific fields
    expect(researchObject.title).to.equal('Using MyST Frontmatter');
    expect(researchObject.defaultLicense).to.equal('CC-BY-4.0');

    // Check authors
    expect(researchObject.authors).to.have.lengthOf(1);
    expect(researchObject.authors![0].name).to.equal('Rowan Cockett');
    expect(researchObject.authors![0].orcid).to.equal('https://orcid.org/0000-0002-7859-8394');

    // Check organizations
    expect(researchObject.authors![0].organizations).to.have.lengthOf(2);
    expect(researchObject.authors![0].organizations![0].name).to.equal('Curvenote');
    expect(researchObject.authors![0].organizations![1].name).to.equal('ExecutableBooks');

    // Check content component
    expectMainContentComponent(researchObject);
    expect(researchObject.components[0].payload.title).to.equal('Using MyST Frontmatter');
  });

  it('should preserve data in MyST -> RO -> MyST roundtrip', () => {
    const originalMyst = `---
title: Test Roundtrip
description: Testing roundtrip conversion from MyST to ResearchObject and back
authors:
  - name: John Doe
    orcid: https://orcid.org/0000-0001-2345-6789
    role: Author
    affiliations:
      - Test University
      - Research Institute
  - name: Jane Smith
    role: Author
    affiliations:
      - Example Labs
keywords: [test, roundtrip, conversion]
tags: [research, methodology]
license: CC-BY-4.0
---

# Introduction

This is a test of roundtrip conversion.`;

    // Convert MyST -> RO -> MyST
    const researchObject = transformer.importObject(originalMyst) as ResearchObjectV1;
    const convertedMyst = transformer.exportObject(researchObject);

    // The converted MyST should contain all the same information
    expect(convertedMyst).to.include('title: Test Roundtrip');
    expect(convertedMyst).to.include('description: Testing roundtrip conversion from MyST to ResearchObject and back');
    expect(convertedMyst).to.include('license: CC-BY-4.0');
    expect(convertedMyst).to.include('name: John Doe');
    expect(convertedMyst).to.include('orcid: https://orcid.org/0000-0001-2345-6789');
    expect(convertedMyst).to.include('role: Author');
    expect(convertedMyst).to.include('      - Test University');
    expect(convertedMyst).to.include('      - Research Institute');
    expect(convertedMyst).to.include('name: Jane Smith');
    expect(convertedMyst).to.include('      - Example Labs');
    expect(convertedMyst).to.include('keywords: [test, roundtrip, conversion]');
    expect(convertedMyst).to.include('tags: [research, methodology]');
  });

  it('should preserve data in RO -> MyST -> RO roundtrip', () => {
    const originalRO: ResearchObjectV1 = {
      version: 1,
      title: 'Test Roundtrip',
      description: 'Testing roundtrip conversion from ResearchObject to MyST and back',
      defaultLicense: 'CC-BY-4.0',
      authors: [
        {
          name: 'John Doe',
          orcid: 'https://orcid.org/0000-0001-2345-6789',
          role: 'Author',
          organizations: [
            { id: 'org-1', name: 'Test University' },
            { id: 'org-2', name: 'Research Institute' },
          ],
        },
        {
          name: 'Jane Smith',
          role: 'Author',
          organizations: [{ id: 'org-3', name: 'Example Labs' }],
        },
      ],
      keywords: ['test', 'roundtrip', 'conversion'],
      researchFields: ['research', 'methodology'],
      components: [
        {
          id: 'content',
          name: 'Main Content',
          type: ResearchObjectComponentType.CODE,
          payload: {
            path: 'content.md',
            title: 'Test Roundtrip',
            description: 'Testing roundtrip conversion from ResearchObject to MyST and back',
            content: '# Introduction\n\nThis is a test of roundtrip conversion.',
          },
        },
      ],
    };

    // Convert RO -> MyST -> RO
    const mystMarkdown = transformer.exportObject(originalRO);
    const convertedRO = transformer.importObject(mystMarkdown) as ResearchObjectV1;

    // Compare essential fields
    expect(convertedRO.title).to.equal(originalRO.title);
    expect(convertedRO.description).to.equal(originalRO.description);
    expect(convertedRO.defaultLicense).to.equal(originalRO.defaultLicense);
    expect(convertedRO.keywords).to.deep.equal(originalRO.keywords);
    expect(convertedRO.researchFields).to.deep.equal(originalRO.researchFields);

    // Compare authors
    expect(convertedRO.authors).to.have.lengthOf(originalRO.authors!.length);
    originalRO.authors!.forEach((author, i) => {
      const convertedAuthor = convertedRO.authors![i];
      expect(convertedAuthor.name).to.equal(author.name);
      expect(convertedAuthor.role).to.equal(author.role);
      expect(convertedAuthor.orcid).to.equal(author.orcid);

      if (author.organizations) {
        expect(convertedAuthor.organizations).to.have.lengthOf(author.organizations.length);
        author.organizations.forEach((org, j) => {
          expect(convertedAuthor.organizations![j].name).to.equal(org.name);
          // Note: IDs will be different as they are generated during import
        });
      }
    });

    // Verify the component was preserved
    const originalComponent = originalRO.components[0];
    const convertedComponent = convertedRO.components[0];
    expect(convertedRO.components).to.have.lengthOf(1);
    expect(convertedComponent.name).to.equal(originalComponent.name);
    expect(convertedComponent.type).to.equal(originalComponent.type);
    expect(convertedComponent.payload.path).to.equal(originalComponent.payload.path);
    expect(convertedComponent.payload.title).to.equal(originalComponent.payload.title);
    expect(convertedComponent.payload.description).to.equal(originalComponent.payload.description);
    expect(convertedComponent.payload.content).to.equal(originalComponent.payload.content);
  });
});