diff --git a/package-lock.json b/package-lock.json index 49ea378..5a2c779 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "@oclif/plugin-help": "^6", "@oclif/plugin-not-found": "^3.2.1", "@rpidanny/google-scholar": "^3.2.0", - "@rpidanny/odysseus": "^2.5.0", + "@rpidanny/odysseus": "^2.6.0", "@rpidanny/quill": "^1.6.0", "@rpidanny/quill-hooks": "^1.0.2", "bottleneck": "^2.19.5", @@ -4879,6 +4879,11 @@ "semver": "bin/semver.js" } }, + "node_modules/@mixmark-io/domino": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz", + "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==" + }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -5381,14 +5386,15 @@ } }, "node_modules/@rpidanny/odysseus": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/@rpidanny/odysseus/-/odysseus-2.5.0.tgz", - "integrity": "sha512-z1lg6p02w38QFDshbOZUR4ownitJGilvZ2iYKkgc7mxYohBtyRrlk5sgxXWvfT2MaIAow9MwT7WW3hKZlermcw==", + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@rpidanny/odysseus/-/odysseus-2.6.0.tgz", + "integrity": "sha512-Qkn+DVTFHuS0iBlneyJ51rGAnOIzE82Dg3RiwQpe90vmyxcI0XgsN+rCKV6VuS3jOAC2eLkB6At9goxGCAOYmg==", "hasInstallScript": true, "dependencies": { "html-to-text": "^9.0.5", "p-retry": "^6.2.0", - "playwright": "^1.44.1" + "playwright": "^1.44.1", + "turndown": "^7.2.0" } }, "node_modules/@rpidanny/quill": { @@ -21989,6 +21995,14 @@ "node": "*" } }, + "node_modules/turndown": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.0.tgz", + "integrity": "sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==", + "dependencies": { + "@mixmark-io/domino": "^2.2.0" + } + }, "node_modules/type-check": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", diff --git a/package.json b/package.json index 9c70d43..84e8f42 100644 --- a/package.json +++ b/package.json @@ -39,7 +39,7 @@ "@oclif/plugin-help": "^6", "@oclif/plugin-not-found": "^3.2.1", "@rpidanny/google-scholar": "^3.2.0", - "@rpidanny/odysseus": "^2.5.0", + "@rpidanny/odysseus": "^2.6.0", "@rpidanny/quill": "^1.6.0", "@rpidanny/quill-hooks": "^1.0.2", "bottleneck": "^2.19.5", diff --git a/src/services/llm/prompt-templates/summary.template.ts b/src/services/llm/prompt-templates/summary.template.ts index 51fc722..90b2b32 100644 --- a/src/services/llm/prompt-templates/summary.template.ts +++ b/src/services/llm/prompt-templates/summary.template.ts @@ -11,6 +11,8 @@ Below you find the text content of the paper: Total output will be a summary of the paper including the key ideas, findings of the paper as a paragraph. +If the text is about cookies, cookie policy and preferences, please ignore it. + [IMPORTANT] Only return the summary without saying anything else. SUMMARY: @@ -34,6 +36,8 @@ Given the new context, refine the summary to be more accurate and informative. If the context isn't useful, return the original summary. Total output will be a summary of the paper including the key ideas, findings of the paper as a paragraph. +If the text is about cookies, cookie policy and preferences, please ignore it and return the original summary. + [IMPORTANT] Only return the summary without saying anything else. SUMMARY: diff --git a/src/services/paper/paper.service.spec.ts b/src/services/paper/paper.service.spec.ts index 837c159..88831ff 100644 --- a/src/services/paper/paper.service.spec.ts +++ b/src/services/paper/paper.service.spec.ts @@ -45,13 +45,13 @@ describe('PaperService', () => { downloadService, ) - odysseusMock.getTextContent.mockResolvedValueOnce('some-text') + odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text') const content = await service.getTextContent(htmlPaperMetadata) expect(content).toBe('some-text') - expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1) - expect(odysseusMock.getTextContent).toHaveBeenCalledWith(htmlPaperMetadata.url, { + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1) + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(htmlPaperMetadata.url, { waitOnCaptcha: !config.skipCaptcha, throwOnCaptcha: true, }) @@ -66,17 +66,17 @@ describe('PaperService', () => { expect(content).toBe('some-text') expect(pdfServiceMock.getTextContent).toHaveBeenCalledTimes(1) expect(pdfServiceMock.getTextContent).toHaveBeenCalledWith(pdfPaperMetadata.source.url) - expect(odysseusMock.getTextContent).not.toHaveBeenCalled() + expect(odysseusMock.getMarkdownContent).not.toHaveBeenCalled() }) it('should get text content from paper url when paper is not a pdf', async () => { - odysseusMock.getTextContent.mockResolvedValueOnce('some-text') + odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text') const content = await service.getTextContent(htmlPaperMetadata) expect(content).toBe('some-text') - expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1) - expect(odysseusMock.getTextContent).toHaveBeenCalledWith(htmlPaperMetadata.source.url, { + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1) + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(htmlPaperMetadata.source.url, { waitOnCaptcha: !config.skipCaptcha, throwOnCaptcha: true, }) @@ -85,14 +85,14 @@ describe('PaperService', () => { it('should fallback to main url when pdf processing fails', async () => { pdfServiceMock.getTextContent.mockRejectedValueOnce(new Error('Failed to process PDF')) - odysseusMock.getTextContent.mockResolvedValueOnce('some-text') + odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text') const content = await service.getTextContent(pdfPaperMetadata) expect(content).toBe('some-text') expect(pdfServiceMock.getTextContent).toHaveBeenCalledTimes(1) - expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1) - expect(odysseusMock.getTextContent).toHaveBeenCalledWith(pdfPaperMetadata.url, { + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1) + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(pdfPaperMetadata.url, { waitOnCaptcha: !config.skipCaptcha, throwOnCaptcha: true, }) @@ -100,13 +100,13 @@ describe('PaperService', () => { it('should fallback to main url when paper url is empty', async () => { const metadata = getMockPaperMetadata({ source: { ...htmlPaperMetadata.source, url: '' } }) - odysseusMock.getTextContent.mockResolvedValueOnce('some-text') + odysseusMock.getMarkdownContent.mockResolvedValueOnce('some-text') const content = await service.getTextContent(metadata) expect(content).toBe('some-text') - expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1) - expect(odysseusMock.getTextContent).toHaveBeenCalledWith(metadata.url, { + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1) + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledWith(metadata.url, { waitOnCaptcha: !config.skipCaptcha, throwOnCaptcha: true, }) @@ -125,18 +125,18 @@ describe('PaperService', () => { const content = await service.getTextContent(metadata) expect(content).toBe('') - expect(odysseusMock.getTextContent).not.toHaveBeenCalled() + expect(odysseusMock.getMarkdownContent).not.toHaveBeenCalled() expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled() }) it('should return empty string when falling back to main url and url is empty', async () => { const metadata = getMockPaperMetadata({ url: '' }) - odysseusMock.getTextContent.mockRejectedValueOnce(new Error('Failed to process PDF')) + odysseusMock.getMarkdownContent.mockRejectedValueOnce(new Error('Failed to process PDF')) const content = await service.getTextContent(metadata) expect(content).toBe('') - expect(odysseusMock.getTextContent).toHaveBeenCalledTimes(1) + expect(odysseusMock.getMarkdownContent).toHaveBeenCalledTimes(1) expect(pdfServiceMock.getTextContent).not.toHaveBeenCalled() }) }) diff --git a/src/services/paper/paper.service.ts b/src/services/paper/paper.service.ts index fdf6335..6fbfdf9 100644 --- a/src/services/paper/paper.service.ts +++ b/src/services/paper/paper.service.ts @@ -20,7 +20,7 @@ export class PaperService { ) {} private async getWebContent(url: string): Promise { - return this.odysseus.getTextContent(url, { + return this.odysseus.getMarkdownContent(url, { waitOnCaptcha: !this.config.skipCaptcha, throwOnCaptcha: true, })