Skip to content

Commit

Permalink
feat: use markdown for web content processing (#78)
Browse files Browse the repository at this point in the history
* feat: use markdown for html content

* chore: update prompt template

* chore: update tests
  • Loading branch information
rpidanny authored Jul 5, 2024
1 parent fe8281f commit 522006a
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 23 deletions.
24 changes: 19 additions & 5 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"@oclif/plugin-help": "^6",
"@oclif/plugin-not-found": "^3.2.1",
"@rpidanny/google-scholar": "^3.2.0",
"@rpidanny/odysseus": "^2.5.0",
"@rpidanny/odysseus": "^2.6.0",
"@rpidanny/quill": "^1.6.0",
"@rpidanny/quill-hooks": "^1.0.2",
"bottleneck": "^2.19.5",
Expand Down
4 changes: 4 additions & 0 deletions src/services/llm/prompt-templates/summary.template.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Below you find the text content of the paper:
Total output will be a summary of the paper including the key ideas, findings of the paper as a paragraph.
If the text is about cookies, cookie policy and preferences, please ignore it.
[IMPORTANT] Only return the summary without saying anything else.
SUMMARY:
Expand All @@ -34,6 +36,8 @@ Given the new context, refine the summary to be more accurate and informative.
If the context isn't useful, return the original summary.
Total output will be a summary of the paper including the key ideas, findings of the paper as a paragraph.
If the text is about cookies, cookie policy and preferences, please ignore it and return the original summary.
[IMPORTANT] Only return the summary without saying anything else.
SUMMARY:
Expand Down
32 changes: 16 additions & 16 deletions src/services/paper/paper.service.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ describe('PaperService', () => {
downloadService,
)

odysseusMock.getTextContent.mockResolvedValueOnce('some-text')
odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text')

const content = await service.getTextContent(htmlPaperMetadata)

expect(content).toBe('some-text')
expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getTextContent).toHaveBeenCalledWith(htmlPaperMetadata.url, {
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(htmlPaperMetadata.url, {
waitOnCaptcha: !config.skipCaptcha,
throwOnCaptcha: true,
})
Expand All @@ -66,17 +66,17 @@ describe('PaperService', () => {
expect(content).toBe('some-text')
expect(pdfServiceMock.getTextContent).toHaveBeenCalledTimes(1)
expect(pdfServiceMock.getTextContent).toHaveBeenCalledWith(pdfPaperMetadata.source.url)
expect(odysseusMock.getTextContent).not.toHaveBeenCalled()
expect(odysseusMock.getMarkdownContent).not.toHaveBeenCalled()
})

it('should get text content from paper url when paper is not a pdf', async () => {
odysseusMock.getTextContent.mockResolvedValueOnce('some-text')
odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text')

const content = await service.getTextContent(htmlPaperMetadata)

expect(content).toBe('some-text')
expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getTextContent).toHaveBeenCalledWith(htmlPaperMetadata.source.url, {
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(htmlPaperMetadata.source.url, {
waitOnCaptcha: !config.skipCaptcha,
throwOnCaptcha: true,
})
Expand All @@ -85,28 +85,28 @@ describe('PaperService', () => {

it('should fallback to main url when pdf processing fails', async () => {
pdfServiceMock.getTextContent.mockRejectedValueOnce(new Error('Failed to process PDF'))
odysseusMock.getTextContent.mockResolvedValueOnce('some-text')
odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text')

const content = await service.getTextContent(pdfPaperMetadata)

expect(content).toBe('some-text')
expect(pdfServiceMock.getTextContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getTextContent).toHaveBeenCalledWith(pdfPaperMetadata.url, {
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(pdfPaperMetadata.url, {
waitOnCaptcha: !config.skipCaptcha,
throwOnCaptcha: true,
})
})

it('should fallback to main url when paper url is empty', async () => {
const metadata = getMockPaperMetadata({ source: { ...htmlPaperMetadata.source, url: '' } })
odysseusMock.getTextContent.mockResolvedValueOnce('some-text')
odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text')

const content = await service.getTextContent(metadata)

expect(content).toBe('some-text')
expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getTextContent).toHaveBeenCalledWith(metadata.url, {
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(metadata.url, {
waitOnCaptcha: !config.skipCaptcha,
throwOnCaptcha: true,
})
Expand All @@ -125,18 +125,18 @@ describe('PaperService', () => {
const content = await service.getTextContent(metadata)

expect(content).toBe('')
expect(odysseusMock.getTextContent).not.toHaveBeenCalled()
expect(odysseusMock.getMarkdownContent).not.toHaveBeenCalled()
expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled()
})

it('should return empty string when falling back to main url and url is empty', async () => {
const metadata = getMockPaperMetadata({ url: '' })
odysseusMock.getTextContent.mockRejectedValueOnce(new Error('Failed to process PDF'))
odysseusMock.getMarkdownContent.mockRejectedValueOnce(new Error('Failed to process PDF'))

const content = await service.getTextContent(metadata)

expect(content).toBe('')
expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1)
expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1)
expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled()
})
})
Expand Down
2 changes: 1 addition & 1 deletion src/services/paper/paper.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export class PaperService {
) {}

private async getWebContent(url: string): Promise<string> {
return this.odysseus.getTextContent(url, {
return this.odysseus.getMarkdownContent(url, {
waitOnCaptcha: !this.config.skipCaptcha,
throwOnCaptcha: true,
})
Expand Down

0 comments on commit 522006a

Please sign in to comment.