Skip to content

Commit

Permalink
feat: convert pdf to markdown
Browse files Browse the repository at this point in the history
  • Loading branch information
rpidanny committed Jul 12, 2024
1 parent a66be3e commit fb074a3
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 15 deletions.
20 changes: 19 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"@oclif/plugin-not-found": "^3.2.1",
"@rpidanny/google-scholar": "^3.2.0",
"@rpidanny/odysseus": "^2.6.0",
"@rpidanny/pdf2md": "^1.0.0",
"@rpidanny/quill": "^1.6.0",
"@rpidanny/quill-hooks": "^1.0.2",
"bottleneck": "^2.19.5",
Expand Down
18 changes: 9 additions & 9 deletions src/services/paper/paper.service.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,17 @@ describe('PaperService', () => {
waitOnCaptcha: !config.skipCaptcha,
throwOnCaptcha: true,
})
expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled()
expect(pdfServiceMock.getMarkdownContent).not.toHaveBeenCalled()
})

it('should get text content from pdf url when paper is a pdf', async () => {
pdfServiceMock.getTextContent.mockResolvedValueOnce('some-text')
pdfServiceMock.getMarkdownContent.mockResolvedValueOnce('some-text')

const content = await service.getTextContent(pdfPaperMetadata)

expect(content).toBe('some-text')
expect(pdfServiceMock.getTextContent).toHaveBeenCalledTimes(1)
expect(pdfServiceMock.getTextContent).toHaveBeenCalledWith(pdfPaperMetadata.source.url)
expect(pdfServiceMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(pdfServiceMock.getMarkdownContent).toHaveBeenCalledWith(pdfPaperMetadata.source.url)
expect(odysseusMock.getMarkdownContent).not.toHaveBeenCalled()
})

Expand All @@ -80,17 +80,17 @@ describe('PaperService', () => {
waitOnCaptcha: !config.skipCaptcha,
throwOnCaptcha: true,
})
expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled()
expect(pdfServiceMock.getMarkdownContent).not.toHaveBeenCalled()
})

it('should fallback to main url when pdf processing fails', async () => {
pdfServiceMock.getTextContent.mockRejectedValueOnce(new Error('Failed to process PDF'))
pdfServiceMock.getMarkdownContent.mockRejectedValueOnce(new Error('Failed to process PDF'))
odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text')

const content = await service.getTextContent(pdfPaperMetadata)

expect(content).toBe('some-text')
expect(pdfServiceMock.getTextContent).toHaveBeenCalledTimes(1)
expect(pdfServiceMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(pdfPaperMetadata.url, {
waitOnCaptcha: !config.skipCaptcha,
Expand Down Expand Up @@ -126,7 +126,7 @@ describe('PaperService', () => {

expect(content).toBe('')
expect(odysseusMock.getMarkdownContent).not.toHaveBeenCalled()
expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled()
expect(pdfServiceMock.getMarkdownContent).not.toHaveBeenCalled()
})

it('should return empty string when falling back to main url and url is empty', async () => {
Expand All @@ -137,7 +137,7 @@ describe('PaperService', () => {

expect(content).toBe('')
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled()
expect(pdfServiceMock.getMarkdownContent).not.toHaveBeenCalled()
})
})

Expand Down
2 changes: 1 addition & 1 deletion src/services/paper/paper.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ export class PaperService {
}

private async getPdfContent(url: string): Promise<string> {
return this.pdfService.getTextContent(url)
return this.pdfService.getMarkdownContent(url)
}

/*
Expand Down
7 changes: 7 additions & 0 deletions src/services/pdf/pdf.service.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,11 @@ describe('PdfService', () => {
expect(content).toContain('The new frontier of genome engineering with CRISPR-Cas9')
})
})

// Spec for PdfService.getMarkdownContent: the resolved content must include the expected title text.
describe('getMarkdownContent', () => {
  it('should get markdown content from PDF', async () => {
    // NOTE(review): presumably the download layer is stubbed in the suite setup (not visible here),
    // so this URL is only a placeholder — confirm against the spec's beforeEach.
    const markdown = await pdfService.getMarkdownContent('https://example.com')

    expect(markdown).toContain('The new frontier of genome engineering with CRISPR-Cas9')
  })
})
})
16 changes: 12 additions & 4 deletions src/services/pdf/pdf.service.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pdf2md from '@rpidanny/pdf2md'
import { Quill } from '@rpidanny/quill'
import { combinePagesIntoSingleString, parsePageItems } from 'pdf-text-reader'
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs'
Expand All @@ -15,17 +16,19 @@ export class PdfService {
private readonly logger?: Quill,
) {}

async getTextContent(url: string): Promise<string> {
private async downloadPdf(url: string): Promise<Buffer> {
this.logger?.debug(`Fetching PDF from ${url}`)

let fileContent: Buffer

try {
fileContent = await this.downloadService.getContent(url)
return await this.downloadService.getContent(url)
} catch (error) {
this.logger?.debug(`Failed to get PDF from ${url} : ${(error as Error).message}`)
throw error
}
}

async getTextContent(url: string): Promise<string> {
const fileContent = await this.downloadPdf(url)

const doc = await pdfjs.getDocument({
data: new Uint8Array(fileContent),
Expand All @@ -42,4 +45,9 @@ export class PdfService {
const parsedPages = parsePageItems(items)
return combinePagesIntoSingleString([parsedPages])
}

/**
 * Fetches the PDF at `url` and returns its content converted by `pdf2md`.
 *
 * The file content is obtained through `downloadPdf`, wrapped in a `Uint8Array`
 * and passed to `pdf2md`. Nothing is caught here, so a rejection from either
 * step propagates to the caller.
 *
 * @param url - Location of the PDF to fetch.
 * @returns The value produced by `pdf2md` for the fetched bytes.
 */
async getMarkdownContent(url: string): Promise<string> {
  const pdfBuffer = await this.downloadPdf(url)
  const pdfBytes = new Uint8Array(pdfBuffer)

  return pdf2md(pdfBytes)
}
}

0 comments on commit fb074a3

Please sign in to comment.