diff --git a/app/api/embedding/get/route.ts b/app/api/embedding/get/route.ts new file mode 100644 index 0000000..e05f620 --- /dev/null +++ b/app/api/embedding/get/route.ts @@ -0,0 +1,48 @@ +// app/api/embedding/get/route.ts +import EmbeddingStorage from "../../../utils/embeddingStorage"; +import { storageMethod } from "../../../utils/env"; +import { StorageMethod } from "../../../utils/types"; + +async function handleRequest(req: Request): Promise { + let db: EmbeddingStorage | undefined; + try { + const body = await req.json(); + let highlights; + + if (storageMethod === StorageMethod.sqlite) { + db = new EmbeddingStorage(); + highlights = await db.searchEmbedding(body.pdfId, body.query); + } else { + console.log("Databse not initialized"); + } + + return new Response(JSON.stringify(highlights), { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + } catch (error) { + console.error("Error in handleRequest:", error); + return new Response( + JSON.stringify({ + error: "Internal Server Error", + details: error.message, + }), + { + status: 500, + headers: { "Content-Type": "application/json" }, + } + ); + } finally { + if (db) { + try { + await db.close(); + } catch (closeError) { + console.error("Error closing database:", closeError); + } + } + } +} + +export async function POST(req: Request): Promise { + return handleRequest(req); +} diff --git a/app/api/embedding/upload/route.ts b/app/api/embedding/upload/route.ts new file mode 100644 index 0000000..ce4e512 --- /dev/null +++ b/app/api/embedding/upload/route.ts @@ -0,0 +1,57 @@ +// app/api/embedding/upload/route.ts +import EmbeddingStorage from "../../../utils/embeddingStorage"; +import { storageMethod } from "../../../utils/env"; +import { StorageMethod, StoredEmbedding } from "../../../utils/types"; + +async function handleRequest( + req: Request, + action: (body: any, db?: EmbeddingStorage) => Promise +): Promise { + let db: EmbeddingStorage | undefined; + try { + const body = await req.json(); + if (storageMethod === StorageMethod.sqlite) { + db = new EmbeddingStorage(); + } + await action(body, db); + return new Response(null, { status: 200 }); + } catch (error) { + console.error(error); + return new Response(null, { status: 500 }); + } finally { + if (db) { + await db.close(); + } + } +} + +async function saveEmbedding(body: any, db?: EmbeddingStorage): Promise { + if (db) { + if (Array.isArray(body.embeddings)) { + await db.saveBulkEmbedding(body.embeddings); + } else { + await db.saveEmbedding(body.embeddings); + } + } else { + console.log("Databse not initialized"); + } +} + +async function removeEmbedding( + body: any, + db?: EmbeddingStorage +): Promise { + if (db) { + await db.deleteEmbedding(body.pdfId, body.id); + } else { + console.log("Databse not initialized"); + } +} + +export async function POST(req: Request): Promise { + return handleRequest(req, saveEmbedding); +} + +export async function DELETE(req: Request): Promise { + return handleRequest(req, removeEmbedding); +} diff --git a/app/api/ocr/route.ts b/app/api/ocr/route.ts new file mode 100644 index 0000000..9de3715 --- /dev/null +++ b/app/api/ocr/route.ts @@ -0,0 +1,38 @@ +// app/api/ocr/route.ts + +import { googleApiKey } from "../../utils/env"; + +// API for server to call the Google Vision API +async function handleRequest(req: Request): Promise { + try { + const body = await req.json(); + + const googleResponse = await fetch("https://vision.googleapis.com/v1/images:annotate?key=" + googleApiKey, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ requests: body.requests }) + }); + const googleResponseValue = await googleResponse.json(); + + return new Response(JSON.stringify(googleResponseValue), { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + } catch (error) { + console.error("Error in handleRequest:", error); + return new Response( + JSON.stringify({ + error: "Internal Server Error", + details: error.message, + }), + { + status: 500, + headers: { "Content-Type": "application/json" }, + } + ); + } +} + +export async function POST(req: Request): Promise { + return handleRequest(req); +} \ No newline at end of file diff --git a/app/components/App.tsx b/app/components/App.tsx index 730f94b..f55f292 100644 --- a/app/components/App.tsx +++ b/app/components/App.tsx @@ -6,10 +6,10 @@ import KeywordSearch from "./KeywordSearch"; import PdfViewer from "./PdfViewer"; import { Header } from "./Header"; import Spinner from "./Spinner"; -import { convertPdfToImages, searchPdf } from "../utils/pdfUtils"; +import { convertPdfToImages, searchPdf, createSearchablePDF } from "../utils/pdfUtils"; import type { IHighlight } from "react-pdf-highlighter"; import HighlightUploader from "./HighlightUploader"; -import { StoredHighlight, StorageMethod } from "../utils/types"; +import { StoredHighlight, StorageMethod, StoredEmbedding } from "../utils/types"; import { IHighlightToStoredHighlight, StoredHighlightToIHighlight, @@ -49,17 +49,36 @@ export default function App() { // perform OCR, // convert output back to PDF // update file url with new PDF url - const i = await convertPdfToImages(file); - const worker = await createWorker("eng"); - const res = await worker.recognize( - i[0], - { pdfTitle: "ocr-out" }, - { pdf: true } - ); - const pdf = res.data.pdf; - if (pdf) { + const images = await convertPdfToImages(file); + + const base64Pages = images.map((item): string => {return item.split(',')[1]}); + + const requests = base64Pages.map(base64 => ({ + image: { content: base64 }, + features: [{ type: 'DOCUMENT_TEXT_DETECTION' }] + })); + + const fullAnnotations = []; + + // Maximum number of pages per API call is 16 + const maxPages = 16; + for (let i = 0; i < requests.length; i += maxPages) { + const googleOcrRes = await fetch("/api/ocr", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ requests: requests.slice(i, Math.min(i + maxPages, requests.length)) }), + }); + + const body = await googleOcrRes.json(); + + for (let j = 0; j < body.responses.length; ++j) { + fullAnnotations.push(body.responses[j].fullTextAnnotation); + } + } + + if (fullAnnotations.length > 0) { // Update file url if OCR success - const blob = new Blob([new Uint8Array(pdf)], { type: "application/pdf" }); + const blob = await createSearchablePDF(base64Pages, fullAnnotations); const fileOcrUrl = URL.createObjectURL(blob); setPdfOcrUrl(fileOcrUrl); diff --git a/app/globals.css b/app/globals.css index 4bc552d..79aee37 100644 --- a/app/globals.css +++ b/app/globals.css @@ -8,7 +8,7 @@ --background-end-rgb: 255, 255, 255; } -@media (prefers-color-scheme: dark) { +@media (prefers-color-scheme: light) { :root { --foreground-rgb: 255, 255, 255; --background-start-rgb: 0, 0, 0; diff --git a/app/utils/embedding.ts b/app/utils/embedding.ts new file mode 100644 index 0000000..2641c14 --- /dev/null +++ b/app/utils/embedding.ts @@ -0,0 +1,25 @@ +// app/utils/embedding.ts + +import { openAiKey } from "./env"; + +export async function embedText(text: string): Promise { + const res = await fetch("https://api.openai.com/v1/embeddings", { + method: "POST", + headers: { + Authorization: "Bearer ${process.env.}" + openAiKey, + "Content-Type": "application/json" + }, + body: JSON.stringify({ + model: "text-embedding-ada-002", + input: text + }), + }); + + const data = await res.json(); + + if (!data) { + return []; + } + + return data.data[0].embedding; +} diff --git a/app/utils/embeddingStorage.ts b/app/utils/embeddingStorage.ts new file mode 100644 index 0000000..041ca13 --- /dev/null +++ b/app/utils/embeddingStorage.ts @@ -0,0 +1,152 @@ +// app/utils/embeddingStorage.ts + +import sqlite3 from "sqlite3" +import path from "path"; +import { StoredEmbedding } from "./types"; +import { embedText } from "./embedding"; + +function euclideanDistance(a: number[], b: number[]) { + return Math.sqrt(a.reduce((sum, ai, i) => sum + Math.pow(ai - b[i], 2), 0)); +} + +class EmbeddingDatabase { + private db: sqlite3.Database; + private tableName: string = "pages"; + + private migrationPromise: Promise; + + constructor() { + this.db = new sqlite3.Database( + path.join(process.cwd(), "embeddings.db"), + sqlite3.OPEN_READWRITE | sqlite3.OPEN_CREATE, + (error) => { + if (error) { + console.error("Error opening database:", error.message); + } else { + console.log("Connected to embeddings db!"); + } + } + ); + this.migrationPromise = this.migrate(); + } + + private migrate(): Promise { + return new Promise((resolve, reject) => { + const sql = ` + CREATE TABLE ${this.tableName} ( + id UUID PRIMARY KEY, + pdfId INT, + pageNum INT, + text TEXT, + embedding VECTOR(1536) + ); + `; + this.db.run(sql, (err) => { + if (err) { + console.error("Error creating table:", err.message); + reject(err); + } else { + console.log("Highlights table created or already exists"); + resolve(); + } + }); + }); + } + + private async ensureMigrated(): Promise { + await this.migrationPromise; + } + + async saveEmbedding(embedding: StoredEmbedding): Promise { + await this.ensureMigrated(); + const sql = `INSERT OR REPLACE INTO ${this.tableName} (id, pdfId, pageNum, text, embedding) VALUES (?, ?, ?, ?, ?)`; + return new Promise((resolve, reject) => { + this.db.run(sql, Object.values(embedding), (error) => { + if (error) reject(error); + else resolve(); + }); + }); + } + + async saveBulkEmbedding(embeddings: StoredEmbedding[]): Promise { + await this.ensureMigrated(); + const sql = `INSERT OR REPLACE INTO ${this.tableName} (id, pdfId, pageNum, text, embedding) VALUES (?, ?, ?, ?, ?)`; + return new Promise((resolve, reject) => { + this.db.serialize(() => { + this.db.run("BEGIN TRANSACTION"); + const stmt = this.db.prepare(sql); + embeddings.forEach((embedding) => { + stmt.run(Object.values(embedding)); + }); + stmt.finalize((error) => { + if (error) { + this.db.run("ROLLBACK"); + reject(error); + } else { + this.db.run("COMMIT", (commitError) => { + if (commitError) reject(commitError); + else resolve(); + }); + } + }); + }); + }); + } + + async getEmbeddingForPDF(pdfId: string): Promise { + await this.ensureMigrated(); + const sql = `SELECT * FROM ${this.tableName} WHERE pdfId = ?`; + return new Promise((resolve, reject) => { + this.db.all(sql, [pdfId], (error, rows) => { + if (error) reject(error); + else resolve(rows as StoredEmbedding[]); + }); + }); + } + + async searchEmbedding(query: string) { + const embedding = await embedText(query); + + await this.ensureMigrated(); + const sql = ` + SELECT * + FROM pages + `; + + const res: StoredEmbedding[] = await new Promise((resolve, reject) => { + this.db.all(sql, [], (error, rows) => { + if (error) reject(error); + else resolve(rows as StoredEmbedding[]); + }); + }); + + return res.map(r => ({ + ...r, + distance: euclideanDistance(r.embedding, embedding) + })) + .sort((a, b) => a.distance - b.distance); + } + + async deleteEmbedding(pdfId: string, id: string): Promise { + await this.ensureMigrated(); + const sql = `DELETE FROM ${this.tableName} WHERE pdfId = ? AND id = ?`; + return new Promise((resolve, reject) => { + this.db.run(sql, [pdfId, id], (error) => { + if (error) reject(error); + else resolve(); + }); + }); + } + + async close(): Promise { + await this.ensureMigrated(); + return new Promise((resolve, reject) => { + this.db.close((error) => { + if (error) reject(error); + else resolve(); + }); + }); + } +} + +export default EmbeddingDatabase; \ No newline at end of file diff --git a/app/utils/env.ts b/app/utils/env.ts index 0031af9..2e64889 100644 --- a/app/utils/env.ts +++ b/app/utils/env.ts @@ -6,3 +6,5 @@ export const googleClientSecret = process.env.GOOGLE_CLIENT_SECRET as string; export const storageMethod = process.env.STORAGE_METHOD ? (process.env.STORAGE_METHOD as "supabase" | "sqlite") : "sqlite"; +export const googleApiKey = process.env.GOOGLE_API_KEY as string; +export const openAiKey = process.env.OPEN_AI_KEY as string; \ No newline at end of file diff --git a/app/utils/pdfUtils.ts b/app/utils/pdfUtils.ts index 17535e5..0f80e1c 100644 --- a/app/utils/pdfUtils.ts +++ b/app/utils/pdfUtils.ts @@ -1,6 +1,7 @@ // app/utils/pdfUtils.ts import { IHighlight } from "react-pdf-highlighter"; import * as pdfjs from "pdfjs-dist"; +import { PDFDocument, rgb } from "pdf-lib"; pdfjs.GlobalWorkerOptions.workerSrc = new URL( "pdfjs-dist/build/pdf.worker.min.mjs", @@ -233,3 +234,76 @@ export const convertPdfToImages = async (file: File) => { canvas.remove(); return images; }; + +// Iterate through google OCR output text +function extractWordsFromVision(fullText: any) { + const words: { text: string, boundingBox: [number, number][] }[] = []; + for (const page of fullText.pages ?? []) { + for (const block of page.blocks ?? []) { + for (const paragraph of block.paragraphs ?? []) { + for (const word of paragraph.words ?? []) { + const text = word.symbols.map((s: any) => s.text).join(""); + const bbox = word.boundingBox.vertices.map((v: any) => [v.x ?? 0, v.y ?? 0]); + words.push({ text, boundingBox: bbox as [number, number][] }); + } + } + } + } + return words; +} + +// Generate a pdf with machine readable text based on Google Vision output and original image +export async function createSearchablePDF(imageBase64: string[], visionOutput: any[]): Promise { + const pdfDoc = await PDFDocument.create(); + + for (let i = 0; i < imageBase64.length; ++i) { + // Embed image + const binaryString = atob(imageBase64[i]); // decode base64 to binary string + const len = binaryString.length; + const bytes = new Uint8Array(len); + + for (let j = 0; j < len; j++) { + bytes[j] = binaryString.charCodeAt(j); + } + + const imageBytes = await bytes.buffer; + const png = await pdfDoc.embedPng(imageBytes); + const imgDims = png.scale(1); + + const page = pdfDoc.addPage([imgDims.width, imgDims.height]); + + page.drawImage(png, { + x: 0, + y: 0, + width: imgDims.width, + height: imgDims.height + }); + + // Extract words from vision output + let words: { text: string, boundingBox: [number, number][] }[] = []; + if (visionOutput[i]) { + words = extractWordsFromVision(visionOutput[i]); + } + + for (const word of words) { + const [x, y] = word.boundingBox[0]; // top-left + const height = Math.abs(word.boundingBox[3][1] - word.boundingBox[0][1]) || 12; + + try { + page.drawText(word.text, { + x, + y: imgDims.height - y - height, // PDF-lib uses bottom-left origin + size: height, + color: rgb(0, 0, 0), + opacity: 0 // Set to >0 if you want visible text + }); + } catch (e) { + // Set to not fail if it encounters an invalid character + console.log("Invalid character encountered: " + e); + } + } + } + + const pdfBytes = await pdfDoc.save(); + return new Blob([pdfBytes], { type: "application/pdf" }); +} \ No newline at end of file diff --git a/app/utils/types.ts b/app/utils/types.ts index 3704157..9bb2711 100644 --- a/app/utils/types.ts +++ b/app/utils/types.ts @@ -14,6 +14,14 @@ export interface StoredHighlight { keyword: string; } +export interface StoredEmbedding { + id: string, + pdfId: string, + pageNum: number, + text: string, + embedding: number[] +} + export enum StorageMethod { supabase = "supabase", sqlite = "sqlite", diff --git a/package.json b/package.json index 9e3ea29..cba9ef7 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,7 @@ "lucide-react": "^0.436.0", "next": "14.2.7", "next-auth": "^5.0.0-beta.21", + "pdf-lib": "^1.17.1", "pdfjs-dist": "4.4.168", "react": "^18", "react-dom": "^18", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 068e9a7..85d93e1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -26,6 +26,9 @@ importers: next-auth: specifier: ^5.0.0-beta.21 version: 5.0.0-beta.21(next@14.2.7(react-dom@18.3.1(react@18.3.1))(react@18.3.1))(react@18.3.1) + pdf-lib: + specifier: ^1.17.1 + version: 1.17.1 pdfjs-dist: specifier: 4.4.168 version: 4.4.168(encoding@0.1.13) @@ -246,6 +249,12 @@ packages: '@panva/hkdf@1.2.1': resolution: {integrity: sha512-6oclG6Y3PiDFcoyk8srjLfVKyMfVCKJ27JwNPViuXziFpmdz+MZnZN/aKY0JGXgYuO/VghU0jcOAZgWXZ1Dmrw==} + '@pdf-lib/standard-fonts@1.0.0': + resolution: {integrity: sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==} + + '@pdf-lib/upng@1.0.1': + resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==} + '@pkgjs/parseargs@0.11.0': resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} @@ -1568,6 +1577,9 @@ packages: package-json-from-dist@1.0.0: resolution: {integrity: sha512-dATvCeZN/8wQsGywez1mzHtTlP22H8OEfPrVMLNr4/eGa+ijtLn/6M5f0dY8UKNrC2O9UCU6SSoG3qRKnt7STw==} + pako@1.0.11: + resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} + parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} @@ -1599,6 +1611,9 @@ packages: resolution: {integrity: sha512-Fl2z/BHvkTNvkuBzYTpTuirHZg6wW9z8+4SND/3mDTEcYbbNKWAy21dz9D3ePNNwrrK8pqZO5vLPZ1hLF6T7XA==} engines: {node: '>=6'} + pdf-lib@1.17.1: + resolution: {integrity: sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==} + pdfjs-dist@4.4.168: resolution: {integrity: sha512-MbkAjpwka/dMHaCfQ75RY1FXX3IewBVu6NGZOcxerRFlaBiIkZmUoR0jotX5VUzYZEXAGzSFtknWs5xRKliXPA==} engines: {node: '>=18'} @@ -2046,6 +2061,9 @@ packages: tsconfig-paths@3.15.0: resolution: {integrity: sha512-2Ac2RgzDe/cn48GvOe3M+o82pEFewD3UPbyoUHHdKasHwJKjds4fLXWf/Ux5kATBKN20oaFGu+jbElp1pos0mg==} + tslib@1.14.1: + resolution: {integrity: sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==} + tslib@2.6.2: resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==} @@ -2330,6 +2348,14 @@ snapshots: '@panva/hkdf@1.2.1': {} + '@pdf-lib/standard-fonts@1.0.0': + dependencies: + pako: 1.0.11 + + '@pdf-lib/upng@1.0.1': + dependencies: + pako: 1.0.11 + '@pkgjs/parseargs@0.11.0': optional: true @@ -3946,6 +3972,8 @@ snapshots: package-json-from-dist@1.0.0: {} + pako@1.0.11: {} + parent-module@1.0.1: dependencies: callsites: 3.1.0 @@ -3968,6 +3996,13 @@ snapshots: path2d@0.2.1: optional: true + pdf-lib@1.17.1: + dependencies: + '@pdf-lib/standard-fonts': 1.0.0 + '@pdf-lib/upng': 1.0.1 + pako: 1.0.11 + tslib: 1.14.1 + pdfjs-dist@4.4.168(encoding@0.1.13): optionalDependencies: canvas: 2.11.2(encoding@0.1.13) @@ -4511,6 +4546,8 @@ snapshots: minimist: 1.2.8 strip-bom: 3.0.0 + tslib@1.14.1: {} + tslib@2.6.2: {} tslib@2.7.0: {} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml new file mode 100644 index 0000000..bc99204 --- /dev/null +++ b/pnpm-workspace.yaml @@ -0,0 +1,5 @@ +onlyBuiltDependencies: + - bcrypt + - canvas + - sqlite3 + - tesseract.js diff --git a/public/demo.pdf b/public/demo.pdf new file mode 100644 index 0000000..e0b15fb Binary files /dev/null and b/public/demo.pdf differ diff --git a/public/p2-5.pdf b/public/p2-5.pdf new file mode 100644 index 0000000..222d6bd Binary files /dev/null and b/public/p2-5.pdf differ diff --git a/public/p3-4.pdf b/public/p3-4.pdf new file mode 100644 index 0000000..23eb10b Binary files /dev/null and b/public/p3-4.pdf differ diff --git a/public/p3.pdf b/public/p3.pdf new file mode 100644 index 0000000..4797ef6 Binary files /dev/null and b/public/p3.pdf differ