diff --git a/.env.example b/.env.example
new file mode 100644
index 00000000..919a5c64
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,47 @@
+# Since the ".env" file is gitignored, you can use the ".env.example" file to
+# build a new ".env" file when you clone the repo. Keep this file up-to-date
+# when you add new variables to `.env`.
+
+# This file will be committed to version control, so make sure not to have any
+# secrets in it. If you are cloning this repo, create a copy of this file named
+# ".env" and populate it with your secrets.
+
+# When adding additional environment variables, the schema in "/src/env.js"
+# should be updated accordingly.
+
+# Database
+DATABASE_URL=""
+
+# Docker Compose: password for PostgreSQL (used by db service)
+# POSTGRES_PASSWORD=password
+
+# Clerk Authentication (get from https://clerk.com/)
+
+NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=
+CLERK_SECRET_KEY=
+
+# OpenAI API (get from https://platform.openai.com/)
+
+OPENAI_API_KEY=
+
+# UploadThing (get from https://uploadthing.com/)
+UPLOADTHING_SECRET="your_uploadthing_secret"
+UPLOADTHING_APP_ID="your_uploadthing_app_id"
+
+# Datalab OCR API (optional - get from https://www.datalab.to/)
+# Required only if you want to enable OCR processing for scanned documents
+DATALAB_API_KEY="your_datalab_api_key"
+
+# Landing.AI OCR API (optional - get from https://www.landing.ai/)
+LANDING_AI_API_KEY="your_landing_ai_api_key"
+
+# Tavily API (optional - get from https://www.tavily.com/)
+TAVILY_API_KEY="your_tavily_api_key"
+
+# Azure Document Intelligence OCR API (optional - get from https://learn.microsoft.com/en-us/azure/applied-ai-services/document-intelligence/quickstarts/get-started-with-rest-api?pivots=programming-language-rest-api)
+AZURE_DOC_INTELLIGENCE_ENDPOINT="your_azure_doc_intelligence_endpoint"
+AZURE_DOC_INTELLIGENCE_KEY="your_azure_doc_intelligence_key"
+
+# Inngest (required for background document processing - https://inngest.com/)
+INNGEST_EVENT_KEY="dev_placeholder"
+INNGEST_SIGNING_KEY="signkey-dev-xxxxx"
diff --git a/README.md b/README.md
index a0a4f18d..493a9885 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,7 @@ Create `.env` from `.env.example` and fill required values:
- `DATABASE_URL`
- `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY`
- `CLERK_SECRET_KEY`
+- `BLOB_READ_WRITE_TOKEN` (Vercel Blob read/write token)
- `OPENAI_API_KEY`
- `INNGEST_EVENT_KEY`, as placeholder
@@ -137,6 +138,18 @@ Optional integrations:
- `LANGCHAIN_TRACING_V2`, `LANGCHAIN_API_KEY`, `LANGCHAIN_PROJECT`
- `DEBUG_PERF` (`1` or `true`) to enable dev perf logs for middleware and key auth/dashboard APIs
+### 2.1) Configure Vercel Blob Storage
+
+Vercel Blob is used for storing uploaded documents. Both **public** and **private** stores are supported -- the upload logic auto-detects which mode the store uses and adapts automatically.
+
+1. In the Vercel dashboard, go to **Storage → Blob → Create Store**.
+2. Choose either **Public** or **Private** access. Both work:
+ - **Public** stores produce URLs the browser can load directly (faster for previews).
+ - **Private** stores keep files behind authentication; the app proxies content through `/api/documents/[id]/content` and `/api/files/[id]` so previews still work.
+3. Generate a **Read/Write token** for the store and add it as `BLOB_READ_WRITE_TOKEN` in your environment (`.env` locally, or Vercel Project Settings for deploys).
+4. Redeploy so the token is available at build and runtime.
+5. Verify: sign in to the Employer Upload page, upload a small PDF, and confirm `/api/upload-local` returns a `vercel-storage.com` URL without errors.
+
### 3) Start database and apply schema
```bash
diff --git a/__tests__/api/fetchDocument/fetchDocument.test.ts b/__tests__/api/fetchDocument/fetchDocument.test.ts
index 86afa568..4281d61e 100644
--- a/__tests__/api/fetchDocument/fetchDocument.test.ts
+++ b/__tests__/api/fetchDocument/fetchDocument.test.ts
@@ -1,7 +1,8 @@
-import { POST } from "~/app/api/fetchDocument/route";
-import { auth } from "@clerk/nextjs/server";
-import { validateRequestBody } from "~/lib/validation";
-import { dbCore } from "~/server/db/core";
+jest.mock("~/server/storage/vercel-blob", () => ({
+ isPrivateBlobUrl: jest.fn(() => false),
+ fetchBlob: jest.fn(),
+ putFile: jest.fn(),
+}));
jest.mock("@clerk/nextjs/server", () => ({
auth: jest.fn(),
@@ -11,13 +12,17 @@ jest.mock("~/lib/validation", () => ({
validateRequestBody: jest.fn(),
}));
-// Route uses dbCore from core, not db from index
jest.mock("~/server/db/core", () => ({
dbCore: {
select: jest.fn(),
},
}));
+import { POST } from "~/app/api/fetchDocument/route";
+import { auth } from "@clerk/nextjs/server";
+import { validateRequestBody } from "~/lib/validation";
+import { dbCore } from "~/server/db/core";
+
describe("POST /api/fetchDocument", () => {
beforeEach(() => {
jest.clearAllMocks();
diff --git a/__tests__/api/trendSearch/web-search.pbt.test.ts b/__tests__/api/trendSearch/web-search.pbt.test.ts
index d77e8e79..6347a2f3 100644
--- a/__tests__/api/trendSearch/web-search.pbt.test.ts
+++ b/__tests__/api/trendSearch/web-search.pbt.test.ts
@@ -115,8 +115,8 @@ describe("Unit: one sub-query returns 0 results, pipeline continues", () => {
const result = await executeSearch(subQueries);
expect(fetchSpy).toHaveBeenCalledTimes(3);
- expect(result).toHaveLength(2);
- expect(result.map((r) => r.url)).toEqual(["https://b.com", "https://c.com"]);
+ expect(result.results).toHaveLength(2);
+ expect(result.results.map((r) => r.url)).toEqual(["https://b.com", "https://c.com"]);
});
});
@@ -158,7 +158,7 @@ describe("Unit: Tavily fails, retries 2 times then marks sub-query failed", () =
// 1 + 2 retries for first sub-query, then 1 for second
expect(fetchSpy).toHaveBeenCalledTimes(4);
- expect(result).toHaveLength(1);
- expect(result[0].url).toBe("https://ok.com");
+ expect(result.results).toHaveLength(1);
+ expect(result.results[0].url).toBe("https://ok.com");
});
});
diff --git a/__tests__/lib/ocr/complexity.test.ts b/__tests__/lib/ocr/complexity.test.ts
index 7134ff5e..50ac7549 100644
--- a/__tests__/lib/ocr/complexity.test.ts
+++ b/__tests__/lib/ocr/complexity.test.ts
@@ -1,3 +1,9 @@
+jest.mock("~/server/storage/vercel-blob", () => ({
+ fetchBlob: jest.fn(),
+ putFile: jest.fn(),
+ isPrivateBlobUrl: jest.fn(() => false),
+}));
+
import { selectSamplePages } from "~/lib/ocr/complexity";
describe("OCR Complexity Module", () => {
diff --git a/dev-output.log b/dev-output.log
new file mode 100644
index 00000000..3ad7f40e
Binary files /dev/null and b/dev-output.log differ
diff --git a/docs/deployment.md b/docs/deployment.md
index dda49d9a..4ef23d95 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -51,7 +51,7 @@ docker compose --env-file .env --profile dev up
1. Import repository into Vercel.
2. Configure managed PostgreSQL (Vercel Postgres, Neon, Supabase, etc.).
-3. Set `DATABASE_URL` and app environment variables.
+3. Set `DATABASE_URL`, `BLOB_READ_WRITE_TOKEN`, and the other app environment variables.
4. Deploy with Vercel defaults.
5. Apply schema once:
@@ -65,6 +65,12 @@ Optional integrations:
- LangSmith for tracing
- Sidecar (deploy separately and set `SIDECAR_URL`)
+### Verifying Blob uploads on Vercel
+
+1. After deploy, sign in to the Employer portal and open `/employer/upload`.
+2. Upload any small PDF or DOCX. The `/api/upload-local` response should return a `vercel-storage.com` URL.
+3. Paste that URL into a new tab. For a **public** Blob store the file should download directly, confirming Blob access end to end. For a **private** store a direct fetch is expected to be denied — verify instead via the in-app proxy routes (`/api/documents/[id]/content` or `/api/files/[id]`).
+
## Option 3: VPS self-hosted (Node + reverse proxy)
1. Install Node.js 18+, pnpm, Nginx, and PostgreSQL with pgvector.
@@ -89,7 +95,8 @@ Optional: Run the sidecar separately and point `SIDECAR_URL` to it.
| `CLERK_SECRET_KEY` | Yes | Clerk secret key |
| `OPENAI_API_KEY` | Yes | OpenAI API key |
| `INNGEST_EVENT_KEY` | Yes (prod) | Inngest event key for background jobs |
-| `UPLOADTHING_TOKEN` | Optional | UploadThing for cloud storage |
+| `BLOB_READ_WRITE_TOKEN` | Yes (Vercel) | Required for Vercel Blob uploads |
+| `UPLOADTHING_TOKEN` | Optional | UploadThing legacy uploader |
| `SIDECAR_URL` | Optional | Sidecar URL for reranking and Graph RAG |
| `TAVILY_API_KEY` | Optional | Web search for analysis |
| `AZURE_DOC_INTELLIGENCE_*` | Optional | OCR for scanned PDFs |
diff --git a/drizzle/0002_company_metadata.sql b/drizzle/0002_company_metadata.sql
new file mode 100644
index 00000000..ac7aa324
--- /dev/null
+++ b/drizzle/0002_company_metadata.sql
@@ -0,0 +1,39 @@
+-- Company Metadata Tables
+-- One canonical JSON row per company + append-only audit history.
+
+CREATE TABLE IF NOT EXISTS "company_metadata" (
+ "id" serial PRIMARY KEY NOT NULL,
+ "company_id" bigint NOT NULL REFERENCES "company"("id") ON DELETE CASCADE,
+ "schema_version" varchar(20) NOT NULL DEFAULT '1.0.0',
+ "metadata" jsonb NOT NULL,
+ "last_extraction_document_id" bigint REFERENCES "document"("id") ON DELETE SET NULL,
+ "created_at" timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updated_at" timestamptz
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS "company_metadata_company_id_unique"
+ ON "company_metadata" ("company_id");
+
+-- ============================================================================
+
+CREATE TABLE IF NOT EXISTS "company_metadata_history" (
+ "id" serial PRIMARY KEY NOT NULL,
+ "company_id" bigint NOT NULL REFERENCES "company"("id") ON DELETE CASCADE,
+ "document_id" bigint REFERENCES "document"("id") ON DELETE SET NULL,
+ "change_type" varchar(32) NOT NULL, -- extraction | merge | manual_override | deprecation
+ "diff" jsonb NOT NULL,
+ "changed_by" varchar(256) NOT NULL,
+ "created_at" timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS "company_metadata_history_company_id_idx"
+ ON "company_metadata_history" ("company_id");
+
+CREATE INDEX IF NOT EXISTS "company_metadata_history_document_id_idx"
+ ON "company_metadata_history" ("document_id");
+
+CREATE INDEX IF NOT EXISTS "company_metadata_history_created_at_idx"
+ ON "company_metadata_history" ("created_at");
+
+CREATE INDEX IF NOT EXISTS "company_metadata_history_change_type_idx"
+ ON "company_metadata_history" ("change_type");
diff --git a/drizzle/0002_vercel_blob.sql b/drizzle/0002_vercel_blob.sql
new file mode 100644
index 00000000..4eac39f3
--- /dev/null
+++ b/drizzle/0002_vercel_blob.sql
@@ -0,0 +1,8 @@
+ALTER TABLE "file_uploads"
+ ADD COLUMN IF NOT EXISTS "storage_provider" varchar(64) NOT NULL DEFAULT 'database',
+ ADD COLUMN IF NOT EXISTS "storage_url" varchar(1024),
+ ADD COLUMN IF NOT EXISTS "storage_pathname" varchar(1024),
+ ADD COLUMN IF NOT EXISTS "blob_checksum" varchar(128);
+
+ALTER TABLE "file_uploads"
+ ALTER COLUMN "file_data" DROP NOT NULL;
diff --git a/drizzle/0003_company_onboarding.sql b/drizzle/0003_company_onboarding.sql
new file mode 100644
index 00000000..e079bea1
--- /dev/null
+++ b/drizzle/0003_company_onboarding.sql
@@ -0,0 +1,4 @@
+-- Add onboarding profile columns to company table
+ALTER TABLE "company"
+ ADD COLUMN IF NOT EXISTS "description" text,
+ ADD COLUMN IF NOT EXISTS "industry" varchar(256);
diff --git a/next.config.ts b/next.config.ts
index 4e021d8e..2a4ca279 100644
--- a/next.config.ts
+++ b/next.config.ts
@@ -82,6 +82,9 @@ const config: NextConfig = {
"@img/sharp-libvips-linuxmusl-x64",
"@img/sharp-libvips-linux-x64",
"pdf-lib",
+ "jszip",
+ "readable-stream",
+ "mammoth",
],
};
diff --git a/package.json b/package.json
index 567dfd7c..d132af48 100644
--- a/package.json
+++ b/package.json
@@ -72,6 +72,7 @@
"@tiptap/starter-kit": "^3.20.0",
"@uploadthing/react": "^7.3.3",
"@vercel/analytics": "^1.6.1",
+ "@vercel/blob": "^2.3.0",
"cheerio": "^1.2.0",
"class-variance-authority": "^0.7.1",
"clsx": "*",
@@ -90,6 +91,7 @@
"jszip": "^3.10.1",
"katex": "^0.16.25",
"langchain": "^0.3.33",
+ "lru-cache": "^11.2.6",
"lucide-react": "^0.487.0",
"mammoth": "^1.11.0",
"marked": "^17.0.3",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ca0e2f55..897b739a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -158,6 +158,9 @@ importers:
'@vercel/analytics':
specifier: ^1.6.1
version: 1.6.1(next@15.5.7(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1))(react@18.3.1)
+ '@vercel/blob':
+ specifier: ^2.3.0
+ version: 2.3.0
cheerio:
specifier: ^1.2.0
version: 1.2.0
@@ -212,6 +215,9 @@ importers:
langchain:
specifier: ^0.3.33
version: 0.3.33(4c63b96815301c04536f0c3ba2bb9f23)
+ lru-cache:
+ specifier: ^11.2.6
+ version: 11.2.6
lucide-react:
specifier: ^0.487.0
version: 0.487.0(react@18.3.1)
@@ -4418,6 +4424,10 @@ packages:
vue-router:
optional: true
+ '@vercel/blob@2.3.0':
+ resolution: {integrity: sha512-oYWiJbWRQ7gz9Mj0X/NHFJ3OcLMOBzq/2b3j6zeNrQmtFo6dHwU8FAwNpxVIYddVMd+g8eqEi7iRueYx8FtM0Q==}
+ engines: {node: '>=20.0.0'}
+
'@xmldom/xmldom@0.8.11':
resolution: {integrity: sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw==}
engines: {node: '>=10.0.0'}
@@ -4559,6 +4569,9 @@ packages:
resolution: {integrity: sha512-hsU18Ae8CDTR6Kgu9DYf0EbCr/a5iGL0rytQDobUcdpYOKokk8LEjVphnXkDkgpi0wYVsqrXuP0bZxJaTqdgoA==}
engines: {node: '>= 0.4'}
+ async-retry@1.3.3:
+ resolution: {integrity: sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw==}
+
asynckit@0.4.0:
resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
@@ -6037,6 +6050,10 @@ packages:
resolution: {integrity: sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A==}
engines: {node: '>= 0.4'}
+ is-buffer@2.0.5:
+ resolution: {integrity: sha512-i2R6zNFDwgEHJyQUtJEk0XFi1i0dPFn/oqjK3/vPCcDeJvW5NQ83V8QbicfF1SupOaB0h8ntgBC2YiE7dfyctQ==}
+ engines: {node: '>=4'}
+
is-bun-module@2.0.0:
resolution: {integrity: sha512-gNCGbnnnnFAUGKeZ9PdbyeGYJqewpmc2aKHUEMO5nQPWU9lOmv7jcmQIv+qHD8fXW6W7qfuCwX4rY9LNRjXrkQ==}
@@ -6094,6 +6111,9 @@ packages:
resolution: {integrity: sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==}
engines: {node: '>= 0.4'}
+ is-node-process@1.2.0:
+ resolution: {integrity: sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==}
+
is-number-object@1.1.1:
resolution: {integrity: sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==}
engines: {node: '>= 0.4'}
@@ -6590,6 +6610,10 @@ packages:
lru-cache@10.4.3:
resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==}
+ lru-cache@11.2.6:
+ resolution: {integrity: sha512-ESL2CrkS/2wTPfuend7Zhkzo2u0daGJ/A2VucJOgQ/C48S/zB8MMeMHSGKYpXhIjbPxfuezITkaBH1wqv00DDQ==}
+ engines: {node: 20 || >=22}
+
lru-cache@5.1.1:
resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
@@ -8106,6 +8130,10 @@ packages:
thenify@3.3.1:
resolution: {integrity: sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==}
+ throttleit@2.1.0:
+ resolution: {integrity: sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==}
+ engines: {node: '>=18'}
+
tiny-invariant@1.3.3:
resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==}
@@ -8253,6 +8281,10 @@ packages:
undici-types@7.10.0:
resolution: {integrity: sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==}
+ undici@6.23.0:
+ resolution: {integrity: sha512-VfQPToRA5FZs/qJxLIinmU59u0r7LXqoJkCzinq3ckNJp3vKEh7jTWN589YQ5+aoAC/TGRLyJLCPKcLQbM8r9g==}
+ engines: {node: '>=18.17'}
+
undici@7.22.0:
resolution: {integrity: sha512-RqslV2Us5BrllB+JeiZnK4peryVTndy9Dnqq62S3yYRRTj0tFQCwEniUy2167skdGOy3vqRzEvl1Dm4sV2ReDg==}
engines: {node: '>=20.18.1'}
@@ -12624,6 +12656,14 @@ snapshots:
next: 15.5.7(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
react: 18.3.1
+ '@vercel/blob@2.3.0':
+ dependencies:
+ async-retry: 1.3.3
+ is-buffer: 2.0.5
+ is-node-process: 1.2.0
+ throttleit: 2.1.0
+ undici: 6.23.0
+
'@xmldom/xmldom@0.8.11': {}
abort-controller-x@0.4.3: {}
@@ -12775,6 +12815,10 @@ snapshots:
async-function@1.0.0: {}
+ async-retry@1.3.3:
+ dependencies:
+ retry: 0.13.1
+
asynckit@0.4.0: {}
autoprefixer@10.4.21(postcss@8.5.6):
@@ -14326,7 +14370,7 @@ snapshots:
isstream: 0.1.2
jsonwebtoken: 9.0.3
mime-types: 2.1.35
- retry-axios: 2.6.0(axios@1.7.4)
+ retry-axios: 2.6.0(axios@1.7.4(debug@4.4.3))
tough-cookie: 4.1.4
transitivePeerDependencies:
- supports-color
@@ -14461,6 +14505,8 @@ snapshots:
call-bound: 1.0.4
has-tostringtag: 1.0.2
+ is-buffer@2.0.5: {}
+
is-bun-module@2.0.0:
dependencies:
semver: 7.7.2
@@ -14511,6 +14557,8 @@ snapshots:
is-negative-zero@2.0.3: {}
+ is-node-process@1.2.0: {}
+
is-number-object@1.1.1:
dependencies:
call-bound: 1.0.4
@@ -15182,6 +15230,8 @@ snapshots:
lru-cache@10.4.3: {}
+ lru-cache@11.2.6: {}
+
lru-cache@5.1.1:
dependencies:
yallist: 3.1.1
@@ -16615,7 +16665,7 @@ snapshots:
path-parse: 1.0.7
supports-preserve-symlinks-flag: 1.0.0
- retry-axios@2.6.0(axios@1.7.4):
+ retry-axios@2.6.0(axios@1.7.4(debug@4.4.3)):
dependencies:
axios: 1.7.4(debug@4.4.3)
@@ -17106,6 +17156,8 @@ snapshots:
dependencies:
any-promise: 1.3.0
+ throttleit@2.1.0: {}
+
tiny-invariant@1.3.3: {}
tinyglobby@0.2.14:
@@ -17252,6 +17304,8 @@ snapshots:
undici-types@7.10.0: {}
+ undici@6.23.0: {}
+
undici@7.22.0: {}
unicode-canonical-property-names-ecmascript@2.0.1: {}
diff --git a/public/images/reddit-snoo.png b/public/images/reddit-snoo.png
new file mode 100644
index 00000000..4f267084
Binary files /dev/null and b/public/images/reddit-snoo.png differ
diff --git a/scripts/create-metadata-tables.ts b/scripts/create-metadata-tables.ts
new file mode 100644
index 00000000..66f180ec
--- /dev/null
+++ b/scripts/create-metadata-tables.ts
@@ -0,0 +1,57 @@
+import "dotenv/config";
+import { db } from "../src/server/db";
+import { sql } from "drizzle-orm";
+
+async function createTables() {
+ try {
+ console.log("Creating company_metadata table...");
+
+ // Create company_metadata table
+ await db.execute(sql`
+ CREATE TABLE IF NOT EXISTS pdr_ai_v2_company_metadata (
+ id SERIAL PRIMARY KEY,
+ company_id BIGINT NOT NULL REFERENCES pdr_ai_v2_company(id) ON DELETE CASCADE,
+ schema_version VARCHAR(20) NOT NULL DEFAULT '1.0.0',
+ metadata JSONB NOT NULL,
+ last_extraction_document_id BIGINT REFERENCES pdr_ai_v2_document(id) ON DELETE SET NULL,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMPTZ
+ )
+ `);
+
+ // Create unique index on company_id
+ await db.execute(sql`
+ CREATE UNIQUE INDEX IF NOT EXISTS company_metadata_company_id_unique
+ ON pdr_ai_v2_company_metadata(company_id)
+ `);
+
+ console.log("Creating company_metadata_history table...");
+
+ // Create company_metadata_history table
+ await db.execute(sql`
+ CREATE TABLE IF NOT EXISTS pdr_ai_v2_company_metadata_history (
+ id SERIAL PRIMARY KEY,
+ company_id BIGINT NOT NULL REFERENCES pdr_ai_v2_company(id) ON DELETE CASCADE,
+ document_id BIGINT REFERENCES pdr_ai_v2_document(id) ON DELETE SET NULL,
+ change_type VARCHAR(32) NOT NULL,
+ diff JSONB NOT NULL,
+ changed_by VARCHAR(256) NOT NULL,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP
+ )
+ `);
+
+ // Create indexes for history table
+ await db.execute(sql`CREATE INDEX IF NOT EXISTS company_metadata_history_company_id_idx ON pdr_ai_v2_company_metadata_history(company_id)`);
+ await db.execute(sql`CREATE INDEX IF NOT EXISTS company_metadata_history_document_id_idx ON pdr_ai_v2_company_metadata_history(document_id)`);
+ await db.execute(sql`CREATE INDEX IF NOT EXISTS company_metadata_history_created_at_idx ON pdr_ai_v2_company_metadata_history(created_at)`);
+ await db.execute(sql`CREATE INDEX IF NOT EXISTS company_metadata_history_change_type_idx ON pdr_ai_v2_company_metadata_history(change_type)`);
+
+ console.log("✅ Tables created successfully!");
+ process.exit(0);
+ } catch (error) {
+ console.error("Error creating tables:", error);
+ process.exit(1);
+ }
+}
+
+createTables();
diff --git a/scripts/ensure-pgvector.mjs b/scripts/ensure-pgvector.mjs
index 2e29f08c..d77d7b23 100644
--- a/scripts/ensure-pgvector.mjs
+++ b/scripts/ensure-pgvector.mjs
@@ -1,4 +1,5 @@
-import 'dotenv/config';
+import dotenv from "dotenv";
+dotenv.config();
import postgres from "postgres";
const url = process.env.DATABASE_URL;
diff --git a/scripts/show-company-metadata.ts b/scripts/show-company-metadata.ts
new file mode 100644
index 00000000..7dcaf596
--- /dev/null
+++ b/scripts/show-company-metadata.ts
@@ -0,0 +1,107 @@
+/**
+ * Terminal display script for company metadata.
+ * Queries the database and pretty-prints the stored metadata JSON.
+ *
+ * Usage:
+ * npx tsx scripts/show-company-metadata.ts [companyId]
+ *
+ * Examples:
+ * npx tsx scripts/show-company-metadata.ts # Show all companies
+ * npx tsx scripts/show-company-metadata.ts 1 # Show metadata for company ID 1
+ *
+ * Required env vars (reads from .env automatically via dotenv):
+ * DATABASE_URL
+ */
+
+import "dotenv/config";
+
+// Skip the full env validation so we don't need Clerk/Inngest keys
+process.env.SKIP_ENV_VALIDATION = "true";
+
+import { db } from "~/server/db";
+import { companyMetadata } from "~/server/db/schema/company-metadata";
+import { company } from "~/server/db/schema/base";
+import { eq } from "drizzle-orm";
+
+async function main() {
+ const companyIdArg = process.argv[2];
+
+ if (companyIdArg) {
+ // Show metadata for a specific company
+ const companyId = BigInt(companyIdArg);
+
+ const [result] = await db
+ .select({
+ id: companyMetadata.id,
+ companyId: companyMetadata.companyId,
+ companyName: company.name,
+ schemaVersion: companyMetadata.schemaVersion,
+ metadata: companyMetadata.metadata,
+ createdAt: companyMetadata.createdAt,
+ updatedAt: companyMetadata.updatedAt,
+ })
+ .from(companyMetadata)
+ .leftJoin(company, eq(companyMetadata.companyId, company.id))
+ .where(eq(companyMetadata.companyId, companyId));
+
+ if (!result) {
+ console.log(`No metadata found for company ID: ${companyIdArg}`);
+ process.exit(0);
+ }
+
+ console.log("═══════════════════════════════════════════════════════════════");
+ console.log(` Company: ${result.companyName ?? "Unknown"} (ID: ${result.companyId})`);
+ console.log(` Schema Version: ${result.schemaVersion}`);
+ console.log(` Created: ${result.createdAt?.toISOString()}`);
+ console.log(` Updated: ${result.updatedAt?.toISOString() ?? "Never"}`);
+ console.log("═══════════════════════════════════════════════════════════════");
+ console.log("\n─── Metadata ───\n");
+ console.log(JSON.stringify(result.metadata, null, 2));
+ } else {
+ // Show all companies with metadata
+ const results = await db
+ .select({
+ id: companyMetadata.id,
+ companyId: companyMetadata.companyId,
+ companyName: company.name,
+ schemaVersion: companyMetadata.schemaVersion,
+ metadata: companyMetadata.metadata,
+ createdAt: companyMetadata.createdAt,
+ updatedAt: companyMetadata.updatedAt,
+ })
+ .from(companyMetadata)
+ .leftJoin(company, eq(companyMetadata.companyId, company.id));
+
+ if (results.length === 0) {
+ console.log("No company metadata found in the database.");
+ console.log("\nTo generate metadata:");
+ console.log(" 1. Start the dev server: pnpm dev");
+ console.log(" 2. Upload documents through the employer flow");
+ console.log(" 3. Call the extraction API: POST /api/company/metadata/extract");
+ process.exit(0);
+ }
+
+ console.log(`Found ${results.length} company/companies with metadata:\n`);
+
+ for (const result of results) {
+ console.log("═══════════════════════════════════════════════════════════════");
+ console.log(` Company: ${result.companyName ?? "Unknown"} (ID: ${result.companyId})`);
+ console.log(` Schema Version: ${result.schemaVersion}`);
+ console.log(` Created: ${result.createdAt?.toISOString()}`);
+ console.log(` Updated: ${result.updatedAt?.toISOString() ?? "Never"}`);
+ console.log("═══════════════════════════════════════════════════════════════");
+ console.log("\n─── Metadata ───\n");
+ console.log(JSON.stringify(result.metadata, null, 2));
+ console.log("\n");
+ }
+ }
+}
+
+main()
+ .catch((err) => {
+ console.error("Failed to fetch company metadata:", err);
+ process.exit(1);
+ })
+ .finally(() => {
+ process.exit(0);
+ });
diff --git a/scripts/test-platform-apis.ts b/scripts/test-platform-apis.ts
new file mode 100644
index 00000000..0ca74333
--- /dev/null
+++ b/scripts/test-platform-apis.ts
@@ -0,0 +1,150 @@
+/**
+ * Test script for Marketing Pipeline Platform API integrations
+ *
+ * Usage:
+ * npx tsx scripts/test-platform-apis.ts
+ *
+ * Required env vars:
+ * REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT
+ * TWITTER_BEARER_TOKEN
+ * LINKEDIN_ACCESS_TOKEN
+ * BLUESKY_HANDLE, BLUESKY_APP_PASSWORD
+ */
+
+import "dotenv/config";
+
+// Skip the full env validation so we don't need DB/Clerk/Inngest keys
+process.env.SKIP_ENV_VALIDATION = "true";
+
+import { redditClient } from "~/lib/tools/marketing-pipeline/clients/reddit";
+import { twitterClient } from "~/lib/tools/marketing-pipeline/clients/twitter";
+import { linkedinClient } from "~/lib/tools/marketing-pipeline/clients/linkedin";
+import { blueskyClient } from "~/lib/tools/marketing-pipeline/clients/bluesky";
+import { researchPlatformTrends } from "~/lib/tools/marketing-pipeline/research";
+import type { MarketingPlatform } from "~/lib/tools/marketing-pipeline/types";
+
+async function testRedditAPI() {
+ console.log("\n🔴 Testing Reddit API...");
+ try {
+ const results = await redditClient.searchTrendingPosts("AI technology", 5);
+ console.log(`✅ Reddit: Found ${results.length} trending posts`);
+ if (results.length > 0 && results[0]) {
+ console.log(` 📝 Sample: "${results[0].title}"`);
+ }
+ return true;
+ } catch (error) {
+ console.log(`❌ Reddit failed: ${error instanceof Error ? error.message : error}`);
+ return false;
+ }
+}
+
+async function testTwitterAPI() {
+ console.log("\n🐦 Testing Twitter/X API...");
+ try {
+ const results = await twitterClient.searchTrendingTweets("AI trends", 5);
+ console.log(`✅ Twitter: Found ${results.length} trending tweets`);
+ if (results.length > 0 && results[0]) {
+ console.log(` 📝 Sample: "${results[0].title}"`);
+ }
+ return true;
+ } catch (error) {
+ console.log(`❌ Twitter failed: ${error instanceof Error ? error.message : error}`);
+ return false;
+ }
+}
+
+async function testLinkedInAPI() {
+ console.log("\n💼 Testing LinkedIn API...");
+ try {
+ const results = await linkedinClient.searchTrendingPosts("business technology", 5);
+ console.log(`✅ LinkedIn: Found ${results.length} trending posts`);
+ if (results.length > 0 && results[0]) {
+ console.log(` 📝 Sample: "${results[0].title}"`);
+ }
+ return results.length > 0; // LinkedIn might return 0 due to API restrictions
+ } catch (error) {
+ console.log(`⚠️ LinkedIn failed: ${error instanceof Error ? error.message : error}`);
+ console.log(" Note: LinkedIn API has strict access requirements");
+ return false;
+ }
+}
+
+async function testBlueskyAPI() {
+ console.log("\n🦋 Testing Bluesky API...");
+ try {
+ const results = await blueskyClient.searchTrendingPosts("technology", 5);
+ console.log(`✅ Bluesky: Found ${results.length} trending posts`);
+ if (results.length > 0 && results[0]) {
+ console.log(` 📝 Sample: "${results[0].title}"`);
+ }
+ return true;
+ } catch (error) {
+ console.log(`❌ Bluesky failed: ${error instanceof Error ? error.message : error}`);
+ return false;
+ }
+}
+
+async function testIntegratedPipeline() {
+ console.log("\n🔄 Testing Integrated Marketing Pipeline...");
+
+ const platforms: MarketingPlatform[] = ["reddit", "x", "linkedin", "bluesky"];
+ let successCount = 0;
+
+ for (const platform of platforms) {
+ try {
+ console.log(`\n Testing ${platform} integration...`);
+ const results = await researchPlatformTrends({
+ platform,
+ prompt: "AI marketing tools",
+ companyName: "TechCorp",
+ companyContext: "Company Name: TechCorp. Knowledge Base Signals: None.",
+ maxResults: 3,
+ });
+
+ console.log(` ✅ ${platform}: ${results.length} results (${results.length > 0 ? 'API' : 'fallback'} mode)`);
+ successCount++;
+ } catch (error) {
+ console.log(` ❌ ${platform}: ${error instanceof Error ? error.message : error}`);
+ }
+ }
+
+ console.log(`\n📊 Integration Results: ${successCount}/${platforms.length} platforms working`);
+}
+
+async function main() {
+ console.log("🚀 Testing Marketing Pipeline Platform APIs\n");
+ console.log("=" .repeat(50));
+
+ // Test individual platform APIs
+ const redditOk = await testRedditAPI();
+ const twitterOk = await testTwitterAPI();
+ const linkedinOk = await testLinkedInAPI();
+ const blueskyOk = await testBlueskyAPI();
+
+ // Test integrated pipeline
+ await testIntegratedPipeline();
+
+ console.log("\n" + "=".repeat(50));
+ console.log("📋 Summary:");
+ console.log(` Reddit API: ${redditOk ? '✅' : '❌'}`);
+ console.log(` Twitter API: ${twitterOk ? '✅' : '❌'}`);
+ console.log(` LinkedIn API: ${linkedinOk ? '✅' : '⚠️ '} ${linkedinOk ? '' : '(Limited access)'}`);
+ console.log(` Bluesky API: ${blueskyOk ? '✅' : '❌'}`);
+
+ const workingAPIs = [redditOk, twitterOk, linkedinOk, blueskyOk].filter(Boolean).length;
+ console.log(`\n🎯 ${workingAPIs}/4 platform APIs are working!`);
+
+ if (workingAPIs === 0) {
+ console.log("\n❗ No APIs are working. Check your API credentials in .env");
+ process.exit(1);
+ } else if (workingAPIs < 4) {
+ console.log("\n⚠️ Some APIs need setup. See setup instructions above.");
+ } else {
+ console.log("\n🎉 All platform APIs are working perfectly!");
+ }
+}
+
+main().catch((err) => {
+ console.error("💥 Test failed:", err);
+ process.exit(1);
+});
\ No newline at end of file
diff --git a/scripts/test-trend-search.ts b/scripts/test-trend-search.ts
index d63c02c1..791cd048 100644
--- a/scripts/test-trend-search.ts
+++ b/scripts/test-trend-search.ts
@@ -75,7 +75,8 @@ Running pipeline (plan → search → synthesize)…
}
*/
-import "dotenv/config";
+import dotenv from "dotenv";
+dotenv.config();
// Skip the full env validation so we don't need DB/Clerk/Inngest keys
process.env.SKIP_ENV_VALIDATION = "true";
diff --git a/src/app/api/agents/predictive-document-analysis/agent.ts b/src/app/api/agents/predictive-document-analysis/agent.ts
index c13f5ba1..4eb30123 100644
--- a/src/app/api/agents/predictive-document-analysis/agent.ts
+++ b/src/app/api/agents/predictive-document-analysis/agent.ts
@@ -9,5 +9,8 @@ export type {
PredictiveAnalysisResult,
MissingDocumentPrediction,
ResolvedReference,
- SearchResult
+ SearchResult,
+ DocumentInsight,
+ InsightCategory,
+ InsightSeverity,
} from "~/app/api/agents/predictive-document-analysis/types";
\ No newline at end of file
diff --git a/src/app/api/agents/predictive-document-analysis/index.ts b/src/app/api/agents/predictive-document-analysis/index.ts
index 2a34f499..306e3a59 100644
--- a/src/app/api/agents/predictive-document-analysis/index.ts
+++ b/src/app/api/agents/predictive-document-analysis/index.ts
@@ -3,7 +3,7 @@ export { analyzeDocumentChunks } from "~/app/api/agents/predictive-document-anal
export { extractReferences } from "~/app/api/agents/predictive-document-analysis/services/referenceExtractor";
export { findSuggestedCompanyDocuments } from "~/app/api/agents/predictive-document-analysis/services/documentMatcher";
export { getEmbeddings, batchGetEmbeddings } from "~/app/api/agents/predictive-document-analysis/utils/embeddings";
-export { groupContentFromChunks, cleanText, hasSpecificIdentifier } from "~/app/api/agents/predictive-document-analysis/utils/content";
+export { groupContentFromChunks, cleanText, hasSpecificIdentifier, isValidReference } from "~/app/api/agents/predictive-document-analysis/utils/content";
// Type exports
export type {
diff --git a/src/app/api/agents/predictive-document-analysis/route.ts b/src/app/api/agents/predictive-document-analysis/route.ts
index 12b9ff4d..4c08696b 100644
--- a/src/app/api/agents/predictive-document-analysis/route.ts
+++ b/src/app/api/agents/predictive-document-analysis/route.ts
@@ -3,7 +3,8 @@ import { db } from "~/server/db/index";
import { eq, sql, and, gt, desc, ne } from "drizzle-orm";
import { analyzeDocumentChunks } from "~/app/api/agents/predictive-document-analysis/agent";
import type { PredictiveAnalysisResult } from "~/app/api/agents/predictive-document-analysis/agent";
-import { predictiveDocumentAnalysisResults, document, pdfChunks } from "~/server/db/schema";
+import { predictiveDocumentAnalysisResults, document, pdfChunks, documentContextChunks, documentStructure } from "~/server/db/schema";
+import { sanitizeErrorMessage } from "~/app/api/agents/predictive-document-analysis/utils/logging";
import {
ANALYSIS_BATCH_CONFIG,
ANALYSIS_TYPES,
@@ -46,6 +47,7 @@ type PredictiveAnalysisOutput = {
totalMissingDocuments: number;
highPriorityItems: number;
totalRecommendations: number;
+ totalInsights: number;
totalSuggestedRelated: number;
analysisTimestamp: string;
};
@@ -180,17 +182,46 @@ export async function POST(request: Request) {
}, { status: HTTP_STATUS.NOT_FOUND });
}
- const chunksResults = await db
+ // Read from RLM table first (with structure headings), fall back to legacy pdfChunks
+ const rlmChunks = await db
.select({
- id: pdfChunks.id,
- content: pdfChunks.content,
- page: pdfChunks.page
+ id: documentContextChunks.id,
+ content: documentContextChunks.content,
+ page: documentContextChunks.pageNumber,
+ sectionHeading: documentStructure.title,
})
- .from(pdfChunks)
- .where(eq(pdfChunks.documentId, BigInt(documentId)))
- .orderBy(pdfChunks.id);
+ .from(documentContextChunks)
+ .leftJoin(
+ documentStructure,
+ eq(documentContextChunks.structureId, documentStructure.id)
+ )
+ .where(eq(documentContextChunks.documentId, BigInt(documentId)))
+ .orderBy(documentContextChunks.id);
+
+ let chunks: PdfChunk[];
+
+ if (rlmChunks.length > 0) {
+ chunks = rlmChunks.map(c => ({
+ id: c.id,
+ content: c.content,
+ page: c.page ?? 1,
+ sectionHeading: c.sectionHeading,
+ }));
+ } else {
+ const legacyChunks = await db
+ .select({
+ id: pdfChunks.id,
+ content: pdfChunks.content,
+ page: pdfChunks.page
+ })
+ .from(pdfChunks)
+ .where(eq(pdfChunks.documentId, BigInt(documentId)))
+ .orderBy(pdfChunks.id);
+
+ chunks = legacyChunks;
+ }
- if (chunksResults.length === 0) {
+ if (chunks.length === 0) {
recordResult("error");
return NextResponse.json({
success: false,
@@ -199,8 +230,6 @@ export async function POST(request: Request) {
}, { status: HTTP_STATUS.NOT_FOUND });
}
- const chunks: PdfChunk[] = chunksResults;
-
let existingDocuments: string[] = [];
if (includeRelatedDocs) {
const currentDoc = await db.select({ companyId: document.companyId })
@@ -252,6 +281,7 @@ export async function POST(request: Request) {
totalMissingDocuments: analysisResult.missingDocuments.length,
highPriorityItems: analysisResult.missingDocuments.filter(doc => doc.priority === 'high').length,
totalRecommendations: analysisResult.recommendations.length,
+ totalInsights: analysisResult.insights?.length ?? 0,
totalSuggestedRelated: analysisResult.suggestedRelatedDocuments?.length ?? 0,
analysisTimestamp: new Date().toISOString()
},
@@ -273,7 +303,7 @@ export async function POST(request: Request) {
fromCache: false
}, { status: HTTP_STATUS.OK });
} catch (error: unknown) {
- console.error("Predictive Document Analysis Error:", error);
+ console.error("Predictive Document Analysis Error:", sanitizeErrorMessage(error));
let status: number = HTTP_STATUS.INTERNAL_SERVER_ERROR;
let message = "Failed to perform predictive document analysis";
diff --git a/src/app/api/agents/predictive-document-analysis/services/analysisEngine.ts b/src/app/api/agents/predictive-document-analysis/services/analysisEngine.ts
index 5416c351..f156ef92 100644
--- a/src/app/api/agents/predictive-document-analysis/services/analysisEngine.ts
+++ b/src/app/api/agents/predictive-document-analysis/services/analysisEngine.ts
@@ -7,6 +7,7 @@ import type {
AnalysisSpecification,
PredictiveAnalysisResult,
MissingDocumentPrediction,
+ DocumentInsight,
SearchResult
} from "~/app/api/agents/predictive-document-analysis/types";
import { ANALYSIS_TYPES } from "~/app/api/agents/predictive-document-analysis/types";
@@ -14,6 +15,8 @@ import { groupContentFromChunks } from "~/app/api/agents/predictive-document-ana
import { createChunkBatches } from "~/app/api/agents/predictive-document-analysis/utils/batching";
import { extractReferences, deduplicateReferences } from "~/app/api/agents/predictive-document-analysis/services/referenceExtractor";
import { findSuggestedCompanyDocuments } from "~/app/api/agents/predictive-document-analysis/services/documentMatcher";
+import { extractDeterministicInsights } from "~/app/api/agents/predictive-document-analysis/utils/insightExtractors";
+import { sanitizeErrorMessage } from "~/app/api/agents/predictive-document-analysis/utils/logging";
import pLimit from "p-limit";
import { db } from "~/server/db/index";
import { document } from "~/server/db/schema";
@@ -35,7 +38,7 @@ async function withRetry(
lastError = error instanceof Error ? error : new Error(String(error));
if (attempt < maxRetries) {
- console.warn(`Attempt ${attempt} failed, retrying in ${delayMs}ms...`, lastError.message);
+ console.warn(`Attempt ${attempt} failed, retrying in ${delayMs}ms...`, sanitizeErrorMessage(lastError));
await new Promise(resolve => setTimeout(resolve, delayMs));
delayMs *= 2;
}
@@ -52,9 +55,23 @@ const MissingDocumentSchema = z.object({
priority: z.enum(['high', 'medium', 'low']).describe('The priority of the missing document')
});
+const InsightSchema = z.object({
+ category: z.enum(['deadline', 'resource', 'action-item', 'caveat']).describe(
+ 'deadline = due dates/exams/submissions; resource = suggested videos/readings/tools; action-item = tasks/follow-ups; caveat = policies/restrictions/conditions'
+ ),
+ severity: z.enum(['note', 'warning']).describe('warning for time-sensitive or critical items, note for informational'),
+ title: z.string().describe('Short scannable label, e.g. "Homework 3 due Feb 20" or "Watch: Design Sprint video"'),
+ detail: z.string().describe('Full context sentence from the document'),
+ page: z.number().describe('Page number where this insight appears'),
+ sourceQuote: z.string().optional().describe('Exact quote from the document'),
+ url: z.string().optional().describe('URL if this insight references an external resource'),
+ date: z.string().optional().describe('Raw date text for deadline-type insights'),
+});
+
const AnalysisResultSchema = z.object({
missingDocuments: z.array(MissingDocumentSchema).describe('The missing documents found in the document'),
- recommendations: z.array(z.string()).describe('The recommendations for handling the missing documents')
+ recommendations: z.array(z.string()).describe('The recommendations for handling the missing documents'),
+ insights: z.array(InsightSchema).describe('Notable items found IN the document that deserve attention: deadlines, suggested resources, action items, or important caveats. Max 5 per batch.'),
});
function createAnalysisPrompt(
@@ -66,31 +83,37 @@ function createAnalysisPrompt(
existingDocsStr = `\nExisting documents (do not suggest these as missing): ${specification.existingDocuments.join(', ')}.`;
}
- const guidanceByType = {
+ const guidanceByType: Record<string, string> = {
contract: `Focus on contractual references like exhibits, schedules, addendums, and supporting agreements that are mentioned but not present.`,
financial: `Focus on financial references like balance sheets, income statements, audit reports, and supporting financial documentation that are mentioned but not present.`,
technical: `Focus on technical references like specifications, manuals, diagrams, and project deliverables that are mentioned but not present.`,
compliance: `Focus on compliance references like regulatory filings, policy documents, certifications, and legal requirements that are mentioned but not present.`,
+ educational: `Focus on referenced course materials, syllabi, handouts, assignment templates, readings, and linked resources (URLs, videos) that are mentioned but not included.`,
+ hr: `Focus on referenced policies, forms, benefits documents, org charts, employee handbooks, and compliance materials that are mentioned but not included.`,
+ research: `Focus on cited papers, datasets, supplementary materials, methodology documents, and referenced figures or tables that are mentioned but not included.`,
general: `Focus on any document references, attachments, or supporting materials that are mentioned but not present in the current document.`
};
- // Prevent hallucinations
const analysisInstructions = `
IMPORTANT: Base your analysis ONLY on what is explicitly mentioned in the document content.
Do not assume or infer missing documents that aren't clearly referenced.
Reference indicators to look for:
- • Direct mentions of specific documents by name
+ • Direct mentions of specific documents by name (e.g., "syllabus", "handbook", "Exhibit A")
• References to attachments, exhibits, schedules, or appendices
• Cross-references to other sections or documents
- • Mentions of supporting documentation
- • References to external files or resources
+ • Mentions of supporting documentation, forms, or templates
+ • Directives like "please see", "refer to", "posted on", "available at"
For each potential missing document, verify:
✓ Is it explicitly mentioned in the text?
✓ Is the reference clear and specific?
✓ Is it actually missing (not just referenced)?
✓ What is its importance to understanding this document?
+
+ TONE REQUIREMENT:
+ Recommendations must be direct and actionable. Do NOT use conditional language like "If you need...", "If this is meant to...", or "If this content is expected to...".
+ State findings as facts. Example: "The syllabus is referenced on page 1 but not included in this document." NOT "If you need the syllabus, consider looking for it."
`;
return `
@@ -99,14 +122,47 @@ function createAnalysisPrompt(
Analyze the document content step-by-step to find missing referenced documents${specification.includeRelatedDocs ? ' considering broader related document context and potential online searches for templates' : ''}.
Chain of Thought:
- 1. Scan the content for explicit references to other documents (e.g., "see Exhibit A", "as per Schedule 3").
+ 1. Scan the content for explicit references to other documents (e.g., "see Exhibit A", "please see syllabus", "refer to the handbook", "as per Schedule 3", "posted on Canvas").
2. For each reference, check if it's likely missing (not included in this content or existing documents).
3. Classify: Name, type, reason (concise), page where referenced, priority (high if critical, medium if supportive, low if optional).
- 4. Generate 2-3 recommendations for handling missing items, including searching online for templates.
+ 4. Generate 1-2 actionable recommendations ONLY for the missing items you actually found. If you found no missing items, return an empty recommendations array.
5. Avoid duplicates or suggestions for existing documents.
6. Focus on explicit references; be concise and accurate.
- ${guidanceByType[specification.type] || guidanceByType.general}
+ CRITICAL RULES:
+ - If no missing documents are found, return EMPTY arrays for both missingDocuments and recommendations. Do NOT generate entries that say "no missing documents were identified" — that is not a finding.
+ - Do NOT generate recommendations about "maintaining" the status quo or generic document management advice. Only recommend concrete actions for concrete findings.
+ - Each recommendation must reference a specific missing document by name. Generic advice like "run a final pass" or "preserve page numbers" is not useful.
+
+ INSIGHTS EXTRACTION (separate from missing documents):
+ In addition to missing documents, extract up to 5 notable items found IN the content that deserve the reader's attention.
+
+ Categories:
+ - deadline: Upcoming homework due dates, exam dates, submission deadlines, project milestones
+ - resource: Suggested videos, readings, tools, or papers the author recommends reviewing — include the URL if present
+ - action-item: Tasks the reader should complete, follow-ups, things to prepare, even if no explicit date is given
+ - caveat: Important policies, restrictions, conditions, or warnings stated in the content
+
+ For educational content, actively look for:
+ - Assignments, activities, entrance tickets, or quizzes mentioned as upcoming work
+ - Videos or readings the instructor recommends (especially URLs to YouTube, Vimeo, or other platforms)
+ - Policy statements about academic integrity, AI usage, attendance, or late work
+ - Platform-specific tasks like "post on Courselore" or "submit via Canvas"
+
+ EXAMPLES of good insights:
+ {"category":"resource","severity":"note","title":"Watch: Design Sprint Overview","detail":"The instructor recommends watching the Design Sprint methodology video before next class.","page":5,"url":"https://youtu.be/x-DLQp9xb20"}
+ {"category":"action-item","severity":"warning","title":"Post self-introduction on Courselore","detail":"Students must post a self-introduction on Courselore by end of the first week.","page":3}
+ {"category":"caveat","severity":"warning","title":"Academic Integrity Code applies","detail":"All work must comply with the university's academic integrity code. Violations result in a failing grade.","page":2,"url":"https://cs.jhu.edu/academic-integrity-code"}
+
+ ANTI-PATTERNS — do NOT produce these:
+ - Section headings or topic names as insights (e.g., "Heuristic Evaluation", "Common Violations")
+ - Table-of-contents entries or slide titles
+ - Generic observations like "The document discusses design principles"
+ - Items already covered in missingDocuments
+
+ If there are no notable insights, return an empty insights array.
+
+ ${guidanceByType[specification.type] ?? guidanceByType.general}
${analysisInstructions}
@@ -134,7 +190,7 @@ async function performWebSearch(query: string, maxResults = 5): Promise<SearchResult[]> {
+ const highPriority = predictions.filter(p => p.priority === 'high');
+ if (highPriority.length === 0) return predictions;
+
+ const fullContent = groupContentFromChunks(allChunks);
+ const contentWindow = fullContent.slice(0, 30000);
+
+ const chat = new ChatOpenAI({
+ openAIApiKey: process.env.OPENAI_API_KEY,
+ modelName: "gpt-5.2",
+ temperature: 0.0,
+ });
+
+ const structuredModel = chat.withStructuredOutput(VerificationResultSchema, {
+ name: "verification_result"
+ });
+
+ const limit = pLimit(5);
+ const verifiedSet = new Set<string>();
+ const removedSet = new Set<string>();
+
+ await Promise.all(
+ highPriority.map(prediction =>
+ limit(async () => {
+ try {
+ const verificationPrompt = `You are a fact-checking assistant. A previous analysis claimed that the following document is referenced but missing.
+
+Claimed missing document: "${prediction.documentName}" (type: ${prediction.documentType})
+Claimed reference location: Page ${prediction.page}
+Claimed reason: ${prediction.reason}
+
+Your task: Search the source text below and determine if this document is actually referenced.
+- If you find the reference, quote the EXACT sentence.
+- If the name is slightly wrong, provide the corrected name.
+- If you cannot find any reference to this document, mark verified as false.
+
+SOURCE TEXT:
+${contentWindow}`;
+
+ const timeoutPromise = new Promise<never>((_, reject) => {
+ setTimeout(() => reject(new Error('Verification timed out')), Math.min(timeoutMs, 15000));
+ });
+
+ const result = await Promise.race([
+ structuredModel.invoke([
+ new SystemMessage("Verify document references with exact quotes. Be strict: only mark verified if you find a clear reference."),
+ new HumanMessage(verificationPrompt)
+ ]),
+ timeoutPromise,
+ ]);
+
+ const key = prediction.documentName.toLowerCase().trim();
+ if (result.verified) {
+ verifiedSet.add(key);
+ if (result.correctedName) {
+ prediction.documentName = result.correctedName;
+ }
+ if (result.adjustedPriority) {
+ prediction.priority = result.adjustedPriority;
+ }
+ } else {
+ removedSet.add(key);
+ }
+ } catch {
+ // On timeout/error, keep the prediction (conservative)
+ }
+ })
+ )
+ );
+
+ if (removedSet.size > 0) {
+ console.log(`[PDA Verification] Removed ${removedSet.size} unverified high-priority predictions`);
+ }
+
+ return predictions.filter(p => {
+ const key = p.documentName.toLowerCase().trim();
+ return !removedSet.has(key);
+ });
+}
+
+// ---------------------------------------------------------------------------
+// URL extraction: find linked external resources without an LLM call
+// ---------------------------------------------------------------------------
+
+const URL_REGEX = /https?:\/\/[^\s<>"')\]},]+/gi;
+
+function extractURLReferences(allChunks: PdfChunk[]): MissingDocumentPrediction[] {
+ const seen = new Set<string>();
+ const results: MissingDocumentPrediction[] = [];
+
+ for (const chunk of allChunks) {
+ const urls = chunk.content?.match(URL_REGEX);
+ if (!urls) continue;
+
+ for (const rawUrl of urls) {
+ const url = rawUrl.replace(/[.,;:!?)]+$/, '');
+ if (seen.has(url)) continue;
+ seen.add(url);
+
+ let domain: string;
+ let displayPath = '';
+ try {
+ const parsed = new URL(url);
+ domain = parsed.hostname.replace(/^www\./, '');
+ const path = parsed.pathname.replace(/\/+$/, '');
+ if (path && path !== '/') {
+ displayPath = path.length > 40 ? path.slice(0, 37) + '…' : path;
+ }
+ } catch {
+ domain = url.slice(0, 40);
+ }
+
+ const displayName = displayPath ? `${domain}${displayPath}` : domain;
+
+ results.push({
+ documentName: `External: ${displayName}`,
+ documentType: 'external-resource',
+ reason: `External resource linked on page ${chunk.page}`,
+ page: chunk.page,
+ priority: 'low',
+ suggestedLinks: [{ title: displayName, url, snippet: url }],
+ });
+ }
+ }
+
+ return results;
+}
+
export type AnalysisRunStats = {
aiCalls: number;
batches: number;
@@ -223,21 +423,62 @@ export async function analyzeDocumentChunks(
);
try {
- const chunkResults = await Promise.all(chunkPromises);
+ const [chunkResults, deterministicInsights] = await Promise.all([
+ Promise.all(chunkPromises),
+ Promise.resolve(extractDeterministicInsights(allChunks)),
+ ]);
+
+ const llmInsights: DocumentInsight[] = chunkResults.flatMap(
+ result => result.insights ?? [],
+ );
const combinedResult: PredictiveAnalysisResult = {
- missingDocuments: chunkResults.flatMap(result => result.missingDocuments || []),
- recommendations: chunkResults.flatMap(result => result.recommendations || []),
+ missingDocuments: chunkResults.flatMap(result => result.missingDocuments ?? []),
+ recommendations: chunkResults.flatMap(result => result.recommendations ?? []),
};
+ combinedResult.missingDocuments = filterNonFindingDocuments(combinedResult.missingDocuments);
combinedResult.missingDocuments = deduplicateMissingDocuments(combinedResult.missingDocuments);
+
+ combinedResult.recommendations = filterBoilerplateRecommendations(combinedResult.recommendations);
combinedResult.recommendations = deduplicateRecommendations(combinedResult.recommendations);
+ const urlRefs = extractURLReferences(allChunks);
+ combinedResult.missingDocuments.push(...urlRefs);
+
+ combinedResult.missingDocuments = promoteStrongRecommendations(
+ combinedResult.missingDocuments,
+ combinedResult.recommendations,
+ );
+
+ const MAX_RECOMMENDATIONS = 5;
+ if (combinedResult.recommendations.length > MAX_RECOMMENDATIONS) {
+ combinedResult.recommendations = combinedResult.recommendations.slice(0, MAX_RECOMMENDATIONS);
+ }
+
+ if (combinedResult.missingDocuments.some(d => d.priority === 'high')) {
+ combinedResult.missingDocuments = await verifyPredictions(
+ combinedResult.missingDocuments,
+ allChunks,
+ timeoutMs,
+ );
+ }
+
if (specification.includeRelatedDocs && combinedResult.missingDocuments.length > 0) {
await enhanceWithCompanyDocuments(combinedResult, allChunks, specification, timeoutMs);
await enhanceWithWebSearch(combinedResult, specification);
}
+ combinedResult.insights = mergeAndDeduplicateInsights(
+ deterministicInsights,
+ llmInsights,
+ );
+
+ combinedResult.missingDocuments = fuseInsightsWithExternalLinks(
+ combinedResult.insights,
+ combinedResult.missingDocuments,
+ );
+
const totalCharacters = allChunks.reduce((sum, chunk) => sum + (chunk.content?.length ?? 0), 0);
const stats: AnalysisRunStats = {
aiCalls: batches.length,
@@ -252,7 +493,7 @@ export async function analyzeDocumentChunks(
stats
};
} catch (error) {
- console.error("Batch analysis error:", error);
+ console.error("Batch analysis error:", sanitizeErrorMessage(error));
throw error;
}
}
@@ -290,7 +531,7 @@ async function enhanceWithCompanyDocuments(
missing.suggestedCompanyDocuments = suggestions;
}
} catch (error) {
- console.error(`Error finding suggestions for ${missing.documentName}:`, error);
+ console.error(`Error finding suggestions for ${missing.documentName}:`, sanitizeErrorMessage(error));
}
}
}
@@ -316,19 +557,88 @@ async function enhanceWithWebSearch(
}
-function deduplicateMissingDocuments(docs: MissingDocumentPrediction[]): MissingDocumentPrediction[] {
- const seen = new Set();
+const PRIORITY_RANK: Record<string, number> = { high: 3, medium: 2, low: 1 };
+
+const NON_FINDING_PATTERNS = [
+ /no\s+(?:missing|explicitly\s+referenced|referenced)\s+(?:external\s+)?documents?\s+(?:were|was|are)\s+(?:identified|found|detected)/i,
+ /no\s+explicit(?:ly)?\s+referenced/i,
+ /there\s+are\s+no\s+missing/i,
+ /does\s+not\s+(?:explicitly\s+)?reference\s+any/i,
+ /no\s+exhibits?,?\s+(?:appendix|appendices)/i,
+ /content\s+does\s+not\s+(?:explicitly\s+)?reference/i,
+];
+
+const BOILERPLATE_PATTERNS = [
+ /maintain\s+this\s+(?:status|section|approach)/i,
+ /continue\s+(?:monitoring|the\s+same\s+scan)/i,
+ /keep\s+(?:external\s+links|the\s+full|this\s+section)/i,
+ /run\s+a\s+final\s+pass/i,
+ /preserve\s+page\s+(?:numbers|headers)/i,
+ /when\s+exporting\s+(?:or\s+compiling|future)/i,
+ /when\s+adding\s+future\s+references/i,
+ /no\s+(?:missing|explicitly)\s+referenced/i,
+];
+
+const GARBAGE_DOC_NAMES = new Set([
+ 'content', 'document', 'text', 'page', 'file', 'section', 'material',
+ 'information', 'data', 'n/a', 'none', 'unknown', 'the document',
+]);
+
+function filterNonFindingDocuments(docs: MissingDocumentPrediction[]): MissingDocumentPrediction[] {
return docs.filter(doc => {
- const key = doc.documentName.toLowerCase().trim();
- if (seen.has(key)) {
- return false;
- }
- seen.add(key);
+ const name = doc.documentName.toLowerCase().trim();
+ if (GARBAGE_DOC_NAMES.has(name) || name.length < 3) return false;
+
+ const reason = doc.reason.toLowerCase();
+ if (NON_FINDING_PATTERNS.some(p => p.test(reason))) return false;
+ if (NON_FINDING_PATTERNS.some(p => p.test(name))) return false;
+
+ return true;
+ });
+}
+
+function filterBoilerplateRecommendations(recs: string[]): string[] {
+ return recs.filter(rec => {
+ if (NON_FINDING_PATTERNS.some(p => p.test(rec))) return false;
+ if (BOILERPLATE_PATTERNS.some(p => p.test(rec))) return false;
return true;
});
+}
+
+function deduplicateMissingDocuments(docs: MissingDocumentPrediction[], threshold = 0.75): MissingDocumentPrediction[] {
+ const unique: MissingDocumentPrediction[] = [];
+
+ for (const doc of docs) {
+ const docName = doc.documentName.toLowerCase().trim();
+ let mergedInto: MissingDocumentPrediction | null = null;
+
+ for (const existing of unique) {
+ const existingName = existing.documentName.toLowerCase().trim();
+ if (
+ existingName === docName ||
+ existingName.includes(docName) ||
+ docName.includes(existingName) ||
+ stringSimilarity(docName, existingName) > threshold
+ ) {
+ mergedInto = existing;
+ break;
+ }
+ }
+
+ if (mergedInto) {
+ if ((PRIORITY_RANK[doc.priority] ?? 0) > (PRIORITY_RANK[mergedInto.priority] ?? 0)) {
+ mergedInto.priority = doc.priority;
+ mergedInto.reason = doc.reason;
+ }
+ } else {
+ unique.push({ ...doc });
+ }
+ }
+
+ return unique;
}
-function deduplicateRecommendations(recommendations: string[], threshold = 0.8): string[] {
+function deduplicateRecommendations(recommendations: string[], threshold = 0.6): string[] {
const unique = [];
for (const rec of recommendations) {
@@ -345,3 +655,141 @@ function deduplicateRecommendations(recommendations: string[], threshold = 0.8):
}
return unique;
}
+
+// ---------------------------------------------------------------------------
+// Insight merge / dedup / sort / cap
+// ---------------------------------------------------------------------------
+
+const SEVERITY_RANK: Record<string, number> = { warning: 2, note: 1 };
+const MAX_INSIGHTS = 10;
+
+function mergeAndDeduplicateInsights(
+ deterministic: DocumentInsight[],
+ llmGenerated: DocumentInsight[],
+ threshold = 0.6,
+): DocumentInsight[] {
+ const all = [...deterministic, ...llmGenerated];
+ const unique: DocumentInsight[] = [];
+
+ for (const insight of all) {
+ const titleLower = insight.title.toLowerCase();
+ let isDuplicate = false;
+ for (const existing of unique) {
+ if (stringSimilarity(titleLower, existing.title.toLowerCase()) > threshold) {
+ isDuplicate = true;
+ if ((SEVERITY_RANK[insight.severity] ?? 0) > (SEVERITY_RANK[existing.severity] ?? 0)) {
+ existing.severity = insight.severity;
+ }
+ break;
+ }
+ }
+ if (!isDuplicate) {
+ unique.push({ ...insight });
+ }
+ }
+
+ unique.sort((a, b) => {
+ const sevDiff = (SEVERITY_RANK[b.severity] ?? 0) - (SEVERITY_RANK[a.severity] ?? 0);
+ if (sevDiff !== 0) return sevDiff;
+ return a.page - b.page;
+ });
+
+ return unique.slice(0, MAX_INSIGHTS);
+}
+
+// ---------------------------------------------------------------------------
+// Insight-Link Fusion: remove external-resource entries from missingDocuments
+// when the same URL has already been promoted to a richer resource insight.
+// ---------------------------------------------------------------------------
+
+function fuseInsightsWithExternalLinks(
+ insights: DocumentInsight[],
+ missingDocs: MissingDocumentPrediction[],
+): MissingDocumentPrediction[] {
+ const insightUrls = new Set<string>();
+ for (const insight of insights) {
+ if (insight.url) {
+ try {
+ insightUrls.add(new URL(insight.url).href.replace(/\/+$/, ''));
+ } catch {
+ insightUrls.add(insight.url);
+ }
+ }
+ }
+
+ if (insightUrls.size === 0) return missingDocs;
+
+ return missingDocs.filter(doc => {
+ if (doc.documentType !== 'external-resource') return true;
+ const docUrl = doc.suggestedLinks?.[0]?.url;
+ if (!docUrl) return true;
+
+ let normalized: string;
+ try {
+ normalized = new URL(docUrl).href.replace(/\/+$/, '');
+ } catch {
+ normalized = docUrl;
+ }
+
+ return !insightUrls.has(normalized);
+ });
+}
+
+// ---------------------------------------------------------------------------
+// Promotion: upgrade recommendations that describe missing references into
+// structured missingDocument entries so they appear in the "Missing References"
+// panel rather than buried in free-text suggestions.
+// ---------------------------------------------------------------------------
+
+const PROMOTION_PHRASES = [
+ 'referenced but not included',
+ 'referenced but not attached',
+ 'referenced but not provided',
+ 'not included in this document',
+ 'not attached',
+ 'not provided',
+ 'is missing',
+ 'should be attached',
+ 'should be included',
+ 'was not found',
+ 'does not appear',
+];
+
+const NAMED_DOC_PATTERN = /["']([^"']+)["']|(?:the|a)\s+([\w\s]+?)\s+(?:is|was|should|does)/i;
+const PAGE_PATTERN = /page\s+(\d+)/i;
+
+function promoteStrongRecommendations(
+ existingDocs: MissingDocumentPrediction[],
+ recommendations: string[],
+): MissingDocumentPrediction[] {
+ const promoted = [...existingDocs];
+ const existingNames = new Set(
+ existingDocs.map(d => d.documentName.toLowerCase().trim()),
+ );
+
+ for (const rec of recommendations) {
+ const lower = rec.toLowerCase();
+ const matchesPhrase = PROMOTION_PHRASES.some(p => lower.includes(p));
+ if (!matchesPhrase) continue;
+
+ const nameMatch = NAMED_DOC_PATTERN.exec(rec);
+ const docName = (nameMatch?.[1] ?? nameMatch?.[2] ?? '').trim();
+ if (!docName || docName.length < 3) continue;
+
+ if (existingNames.has(docName.toLowerCase())) continue;
+
+ const pageMatch = PAGE_PATTERN.exec(rec);
+ const page = pageMatch ? parseInt(pageMatch[1]!, 10) : 1;
+
+ promoted.push({
+ documentName: docName,
+ documentType: 'other',
+ reason: rec,
+ page,
+ priority: 'medium',
+ });
+ existingNames.add(docName.toLowerCase());
+ }
+
+ return promoted;
+}
diff --git a/src/app/api/agents/predictive-document-analysis/services/annOptimizer.ts b/src/app/api/agents/predictive-document-analysis/services/annOptimizer.ts
index 245163c3..6e395b78 100644
--- a/src/app/api/agents/predictive-document-analysis/services/annOptimizer.ts
+++ b/src/app/api/agents/predictive-document-analysis/services/annOptimizer.ts
@@ -1,9 +1,10 @@
import { db } from "~/server/db/index";
import { and, eq, inArray, sql } from "drizzle-orm";
-import { documentSections, pdfChunks } from "~/server/db/schema";
+import { documentSections, pdfChunks, documentRetrievalChunks } from "~/server/db/schema";
+import { sanitizeErrorMessage } from "~/app/api/agents/predictive-document-analysis/utils/logging";
interface ANNConfig {
- strategy: 'hnsw' | 'ivf' | 'hybrid' | 'prefiltered';
+ strategy: 'hnsw' | 'ivf' | 'hybrid' | 'prefiltered' | 'matryoshka';
probeCount?: number;
efSearch?: number;
maxCandidates?: number;
@@ -57,6 +58,9 @@ export class ANNOptimizer {
case 'prefiltered':
return this.prefilteredSearch(queryEmbedding, documentIds, limit, distanceThreshold);
+
+ case 'matryoshka':
+ return this.matryoshkaSearch(queryEmbedding, documentIds, limit, distanceThreshold);
case 'hybrid':
default:
@@ -70,142 +74,96 @@ export class ANNOptimizer {
limit: number,
threshold: number
): Promise {
-
- const embeddingStr = `[${queryEmbedding.join(',')}]`;
-
- const approximateLimit = Math.min(limit * 5, 100);
-
- const results = await db.select({
- id: documentSections.id,
- content: documentSections.content,
- page: documentSections.pageNumber,
- documentId: documentSections.documentId,
- distance: sql`${documentSections.embedding} <=> ${embeddingStr}::vector`,
- })
- .from(documentSections)
- .where(inArray(documentSections.documentId, documentIds.map(id => BigInt(id))))
- .orderBy(sql`${documentSections.embedding} <=> ${embeddingStr}::vector`)
- .limit(approximateLimit);
-
- let rows: ANNRow[] = results.map(r => ({
- id: r.id,
- content: r.content,
- page: r.page ?? 0,
- documentId: Number(r.documentId),
- distance: Number(r.distance ?? 1),
- }));
-
- // Fallback to legacy table
- if (rows.length === 0 && documentIds.length === 1) {
- const legacyResults = await db.select({
- id: pdfChunks.id,
- content: pdfChunks.content,
- page: pdfChunks.page,
- documentId: pdfChunks.documentId,
- distance: sql`${pdfChunks.embedding} <=> ${embeddingStr}::vector`,
+ try {
+ const embeddingStr = `[${queryEmbedding.join(',')}]`;
+
+ const approximateLimit = Math.min(limit * 5, 100);
+
+ const results = await db.select({
+ id: documentSections.id,
+ content: documentSections.content,
+ page: documentSections.pageNumber,
+ documentId: documentSections.documentId,
+ distance: sql`${documentSections.embedding} <=> ${embeddingStr}::vector`,
})
- .from(pdfChunks)
- .where(eq(pdfChunks.documentId, BigInt(documentIds[0]!)))
- .orderBy(sql`${pdfChunks.embedding} <=> ${embeddingStr}::vector`)
+ .from(documentSections)
+ .where(inArray(documentSections.documentId, documentIds.map(id => BigInt(id))))
+ .orderBy(sql`${documentSections.embedding} <=> ${embeddingStr}::vector`)
.limit(approximateLimit);
- rows = legacyResults.map(r => ({
+ let rows: ANNRow[] = results.map(r => ({
id: r.id,
content: r.content,
- page: r.page,
+ page: r.page ?? 0,
documentId: Number(r.documentId),
distance: Number(r.distance ?? 1),
}));
- }
-
- const refinedResults = rows
- .map(row => ({
- ...row,
- confidence: Math.max(0, 1 - row.distance)
- }))
- .filter(r => r.distance <= threshold)
- .sort((a, b) => a.distance - b.distance)
- .slice(0, limit);
-
- return refinedResults as ANNResult[];
- }
-
- private async ivfSearch(
- queryEmbedding: number[],
- documentIds: number[],
- limit: number,
- threshold: number
- ): Promise {
- const relevantClusters = await this.findRelevantDocumentClusters(
- queryEmbedding,
- documentIds,
- this.config.probeCount ?? 3
- );
- if (relevantClusters.length === 0) {
- return this.hnswSearch(queryEmbedding, documentIds, limit, threshold);
- }
+ // Fallback to legacy table
+ if (rows.length === 0 && documentIds.length === 1) {
+ const legacyResults = await db.select({
+ id: pdfChunks.id,
+ content: pdfChunks.content,
+ page: pdfChunks.page,
+ documentId: pdfChunks.documentId,
+ distance: sql`${pdfChunks.embedding} <=> ${embeddingStr}::vector`,
+ })
+ .from(pdfChunks)
+ .where(eq(pdfChunks.documentId, BigInt(documentIds[0]!)))
+ .orderBy(sql`${pdfChunks.embedding} <=> ${embeddingStr}::vector`)
+ .limit(approximateLimit);
+
+ rows = legacyResults.map(r => ({
+ id: r.id,
+ content: r.content,
+ page: r.page,
+ documentId: Number(r.documentId),
+ distance: Number(r.distance ?? 1),
+ }));
+ }
- const clusterChunkIds = relevantClusters.flatMap(c => c.chunkIds);
-
- if (clusterChunkIds.length === 0) {
+ const refinedResults = rows
+ .map(row => ({
+ ...row,
+ confidence: Math.max(0, 1 - row.distance)
+ }))
+ .filter(r => r.distance <= threshold)
+ .sort((a, b) => a.distance - b.distance)
+ .slice(0, limit);
+
+ return refinedResults as ANNResult[];
+ } catch (error) {
+ console.warn("HNSW search failed:", sanitizeErrorMessage(error));
return [];
}
-
- const embeddingStr = `[${queryEmbedding.join(',')}]`;
-
- const results = await db.select({
- id: documentSections.id,
- content: documentSections.content,
- page: documentSections.pageNumber,
- documentId: documentSections.documentId,
- distance: sql`${documentSections.embedding} <=> ${embeddingStr}::vector`,
- })
- .from(documentSections)
- .where(and(
- inArray(documentSections.id, clusterChunkIds),
- sql`${documentSections.embedding} <=> ${embeddingStr}::vector <= ${threshold}`,
- ))
- .orderBy(sql`${documentSections.embedding} <=> ${embeddingStr}::vector`)
- .limit(limit);
-
- return results.map(row => ({
- id: row.id,
- content: row.content,
- page: row.page ?? 0,
- documentId: Number(row.documentId),
- distance: Number(row.distance ?? 1),
- confidence: Math.max(0, 1 - Number(row.distance ?? 1)),
- }));
}
-
- private async prefilteredSearch(
+ private async ivfSearch(
queryEmbedding: number[],
documentIds: number[],
limit: number,
threshold: number
): Promise {
-
- const docScores = await this.calculateDocumentRelevanceScores(queryEmbedding, documentIds);
-
- const sortedDocIds = docScores
- .filter(d => d.score > (this.config.prefilterThreshold ?? 0.3))
- .sort((a, b) => b.score - a.score)
- .map(d => d.documentId);
-
- if (sortedDocIds.length === 0) {
- return this.hnswSearch(queryEmbedding, documentIds, limit, threshold);
- }
-
- const results: ANNResult[] = [];
- const embeddingStr = `[${queryEmbedding.join(',')}]`;
-
- for (const docId of sortedDocIds) {
- if (results.length >= limit) break;
+ try {
+ const relevantClusters = await this.findRelevantDocumentClusters(
+ queryEmbedding,
+ documentIds,
+ this.config.probeCount ?? 3
+ );
+
+ if (relevantClusters.length === 0) {
+ return this.hnswSearch(queryEmbedding, documentIds, limit, threshold);
+ }
- const remaining = limit - results.length;
- const docResults = await db.select({
+ const clusterChunkIds = relevantClusters.flatMap(c => c.chunkIds);
+
+ if (clusterChunkIds.length === 0) {
+ return [];
+ }
+
+ const embeddingStr = `[${queryEmbedding.join(',')}]`;
+
+ const results = await db.select({
id: documentSections.id,
content: documentSections.content,
page: documentSections.pageNumber,
@@ -214,13 +172,13 @@ export class ANNOptimizer {
})
.from(documentSections)
.where(and(
- eq(documentSections.documentId, BigInt(docId)),
+ inArray(documentSections.id, clusterChunkIds),
sql`${documentSections.embedding} <=> ${embeddingStr}::vector <= ${threshold}`,
))
.orderBy(sql`${documentSections.embedding} <=> ${embeddingStr}::vector`)
- .limit(remaining * 2);
+ .limit(limit);
- const mappedResults: ANNResult[] = docResults.map(row => ({
+ return results.map(row => ({
id: row.id,
content: row.content,
page: row.page ?? 0,
@@ -228,14 +186,161 @@ export class ANNOptimizer {
distance: Number(row.distance ?? 1),
confidence: Math.max(0, 1 - Number(row.distance ?? 1)),
}));
-
- results.push(...mappedResults.slice(0, remaining));
+ } catch (error) {
+ console.warn("IVF search failed:", sanitizeErrorMessage(error));
+ return [];
}
+ }
+
+
+ private async prefilteredSearch(
+ queryEmbedding: number[],
+ documentIds: number[],
+ limit: number,
+ threshold: number
+ ): Promise {
+ try {
+ const docScores = await this.calculateDocumentRelevanceScores(queryEmbedding, documentIds);
+
+ const sortedDocIds = docScores
+ .filter(d => d.score > (this.config.prefilterThreshold ?? 0.3))
+ .sort((a, b) => b.score - a.score)
+ .map(d => d.documentId);
+
+ if (sortedDocIds.length === 0) {
+ return this.hnswSearch(queryEmbedding, documentIds, limit, threshold);
+ }
+
+ const results: ANNResult[] = [];
+ const embeddingStr = `[${queryEmbedding.join(',')}]`;
+
+ for (const docId of sortedDocIds) {
+ if (results.length >= limit) break;
+
+ const remaining = limit - results.length;
+ const docResults = await db.select({
+ id: documentSections.id,
+ content: documentSections.content,
+ page: documentSections.pageNumber,
+ documentId: documentSections.documentId,
+ distance: sql`${documentSections.embedding} <=> ${embeddingStr}::vector`,
+ })
+ .from(documentSections)
+ .where(and(
+ eq(documentSections.documentId, BigInt(docId)),
+ sql`${documentSections.embedding} <=> ${embeddingStr}::vector <= ${threshold}`,
+ ))
+ .orderBy(sql`${documentSections.embedding} <=> ${embeddingStr}::vector`)
+ .limit(remaining * 2);
+
+ const mappedResults: ANNResult[] = docResults.map(row => ({
+ id: row.id,
+ content: row.content,
+ page: row.page ?? 0,
+ documentId: Number(row.documentId),
+ distance: Number(row.distance ?? 1),
+ confidence: Math.max(0, 1 - Number(row.distance ?? 1)),
+ }));
+
+ results.push(...mappedResults.slice(0, remaining));
+ }
- return results.sort((a, b) => a.distance - b.distance);
+ return results.sort((a, b) => a.distance - b.distance);
+ } catch (error) {
+ console.warn("Prefiltered search failed:", sanitizeErrorMessage(error));
+ return [];
+ }
}
+  /**
+   * Matryoshka coarse-to-fine retrieval: use 512-dim truncated embeddings
+   * from document_retrieval_chunks (HNSW-indexed) for fast candidate
+   * filtering, then re-rank the surviving candidates with the full
+   * 1536-dim embeddings.
+   *
+   * Falls back to hnswSearch when the retrieval-chunk table has no rows
+   * for the given documents; returns [] on any database error.
+   */
+  private async matryoshkaSearch(
+    queryEmbedding: number[],
+    documentIds: number[],
+    limit: number,
+    threshold: number
+  ): Promise<ANNResult[]> {
+    try {
+      // Matryoshka property: the first 512 dims of the full vector form a
+      // usable low-resolution embedding on their own.
+      const shortDim = 512;
+      const queryShort = queryEmbedding.slice(0, shortDim);
+      const shortStr = `[${queryShort.join(',')}]`;
+
+      // Over-fetch in the coarse pass so the fine re-rank has room to
+      // reorder; capped to keep the HNSW scan bounded.
+      const coarseCandidateCount = Math.min(limit * 6, 120);
+
+      const coarseResults = await db.select({
+        id: documentRetrievalChunks.id,
+        content: documentRetrievalChunks.content,
+        documentId: documentRetrievalChunks.documentId,
+        contextChunkId: documentRetrievalChunks.contextChunkId,
+        shortDistance: sql<number>`${documentRetrievalChunks.embeddingShort} <=> ${shortStr}::vector`,
+      })
+        .from(documentRetrievalChunks)
+        .where(inArray(documentRetrievalChunks.documentId, documentIds.map(id => BigInt(id))))
+        .orderBy(sql`${documentRetrievalChunks.embeddingShort} <=> ${shortStr}::vector`)
+        .limit(coarseCandidateCount);
+
+      if (coarseResults.length === 0) {
+        // No short-embedding rows for these documents; fall back to the
+        // standard full-dimension search path.
+        return this.hnswSearch(queryEmbedding, documentIds, limit, threshold);
+      }
+
+      const candidateIds = coarseResults.map(r => r.id);
+      const fullStr = `[${queryEmbedding.join(',')}]`;
+
+      // Fine pass: exact full-dimension distances over the candidate set only.
+      const refinedResults = await db.select({
+        id: documentRetrievalChunks.id,
+        content: documentRetrievalChunks.content,
+        documentId: documentRetrievalChunks.documentId,
+        distance: sql<number>`${documentRetrievalChunks.embedding} <=> ${fullStr}::vector`,
+      })
+        .from(documentRetrievalChunks)
+        .where(inArray(documentRetrievalChunks.id, candidateIds))
+        .orderBy(sql`${documentRetrievalChunks.embedding} <=> ${fullStr}::vector`)
+        .limit(limit);
+
+      // Resolve page numbers through the linked context chunks.
+      // BUG FIX: contextChunkId may be null, and Number(null) is 0 (not NaN),
+      // so the old `.filter(id => !isNaN(id))` never removed null links and
+      // fabricated chunk id 0. Use an explicit null check instead.
+      const contextIdMap = new Map<number, number>();
+      for (const r of coarseResults) {
+        if (r.contextChunkId != null) {
+          contextIdMap.set(r.id, Number(r.contextChunkId));
+        }
+      }
+
+      const contextChunkIds = Array.from(new Set(contextIdMap.values()));
+
+      const pageMap = new Map<number, number>();
+      if (contextChunkIds.length > 0) {
+        const pages = await db.select({
+          id: documentSections.id,
+          page: documentSections.pageNumber,
+        })
+          .from(documentSections)
+          .where(inArray(documentSections.id, contextChunkIds));
+
+        for (const p of pages) {
+          pageMap.set(p.id, p.page ?? 1);
+        }
+      }
+
+      return refinedResults
+        .map(row => {
+          const dist = Number(row.distance ?? 1);
+          const ctxId = contextIdMap.get(row.id);
+          return {
+            id: row.id,
+            content: row.content,
+            // Explicit undefined check: a context chunk id of 0 is falsy
+            // but would still be a legitimate pageMap key.
+            page: ctxId !== undefined ? (pageMap.get(ctxId) ?? 1) : 1,
+            documentId: Number(row.documentId),
+            distance: dist,
+            confidence: Math.max(0, 1 - dist),
+          };
+        })
+        .filter(r => r.distance <= threshold);
+    } catch (error) {
+      console.warn("Matryoshka search failed:", sanitizeErrorMessage(error));
+      return [];
+    }
+  }
+
private async hybridSearch(
queryEmbedding: number[],
documentIds: number[],
@@ -251,7 +356,8 @@ export class ANNOptimizer {
return this.prefilteredSearch(queryEmbedding, documentIds, limit, threshold);
}
- return this.ivfSearch(queryEmbedding, documentIds, limit, threshold);
+ // For large document sets, use Matryoshka coarse-to-fine
+ return this.matryoshkaSearch(queryEmbedding, documentIds, limit, threshold);
}
private async calculateDocumentRelevanceScores(
diff --git a/src/app/api/agents/predictive-document-analysis/services/documentMatcher.ts b/src/app/api/agents/predictive-document-analysis/services/documentMatcher.ts
index 4b754de0..53437be3 100644
--- a/src/app/api/agents/predictive-document-analysis/services/documentMatcher.ts
+++ b/src/app/api/agents/predictive-document-analysis/services/documentMatcher.ts
@@ -9,7 +9,9 @@ import type {
} from "~/app/api/agents/predictive-document-analysis/types";
import { getEmbeddings } from "~/app/api/agents/predictive-document-analysis/utils/embeddings";
import { cleanText, truncateText } from "~/app/api/agents/predictive-document-analysis/utils/content";
+import { sanitizeErrorMessage } from "~/app/api/agents/predictive-document-analysis/utils/logging";
import ANNOptimizer from "~/app/api/agents/predictive-document-analysis/services/annOptimizer";
+import { hybridSearchWithRRF } from "~/app/api/agents/predictive-document-analysis/services/hybridSearch";
type MatchCandidate = {
documentId: number;
@@ -77,9 +79,23 @@ export async function findSuggestedCompanyDocuments(
const highConfidenceMatches = Array.from(matchCandidates.values()).filter(m => m.confidence > 0.7);
if (highConfidenceMatches.length < 2) {
- const contextMatches = await findOptimizedContextualMatches(missingDoc, otherDocIds);
+ // Run ANN vector search and BM25+vector hybrid search in parallel
+ const searchQuery = `${missingDoc.documentType} ${missingDoc.documentName}`;
+ const [contextMatches, hybridMatches] = await Promise.all([
+ findOptimizedContextualMatches(missingDoc, otherDocIds),
+ hybridSearchWithRRF(searchQuery, otherDocIds, 6).catch(() => [] as DocumentMatch[]),
+ ]);
+
+ const allContextMatches = [...contextMatches, ...hybridMatches];
+ const bestByDoc = new Map();
+ for (const m of allContextMatches) {
+ const existing = bestByDoc.get(m.documentId);
+ if (!existing || m.similarity > existing.similarity) {
+ bestByDoc.set(m.documentId, m);
+ }
+ }
- for (const match of contextMatches) {
+ for (const match of bestByDoc.values()) {
const existing = matchCandidates.get(match.documentId);
if (!existing || (match.similarity > existing.confidence && match.similarity > 0.5)) {
const validatedMatch = await validateContextualMatch(missingDoc, match);
@@ -90,7 +106,7 @@ export async function findSuggestedCompanyDocuments(
page: match.page,
snippet: validatedMatch.snippet,
reasons: validatedMatch.reasons,
- matchTypes: existing ? [...(existing.matchTypes ?? []), 'contextual-ann'] : ['contextual-ann'],
+ matchTypes: existing ? [...(existing.matchTypes ?? []), 'contextual-hybrid'] : ['contextual-hybrid'],
finalScore: validatedMatch.confidence * 0.9
});
}
@@ -118,7 +134,7 @@ export async function findSuggestedCompanyDocuments(
return finalSuggestions;
} catch (error) {
- console.error("Error finding suggested company documents:", error);
+ console.error("Error finding suggested company documents:", sanitizeErrorMessage(error));
return [];
}
}
@@ -255,7 +271,7 @@ async function findOptimizedContextualMatches(
});
}
} catch (error) {
- console.warn(`ANN search failed for query "${query}", falling back to traditional search:`, error);
+ console.warn(`ANN search failed for query "${query}", falling back to traditional search:`, sanitizeErrorMessage(error));
const fallbackMatches = await findTraditionalContextualMatches(query, queryEmbedding, docIds);
allMatches.push(...fallbackMatches);
@@ -279,30 +295,35 @@ async function findTraditionalContextualMatches(
queryEmbedding: number[],
docIds: number[]
): Promise {
- const distanceSql = sql`embedding <=> ${`[${queryEmbedding.join(',')}]`}::vector`;
- const results = await db.select({
- id: documentSections.id,
- content: documentSections.content,
- page: documentSections.pageNumber,
- documentId: documentSections.documentId,
- distance: distanceSql
- }).from(documentSections).where(and(
- inArray(documentSections.documentId, docIds.map(id => BigInt(id))),
- sql`${distanceSql} < 0.3`
- )).orderBy(distanceSql).limit(5);
+ try {
+ const distanceSql = sql`embedding <=> ${`[${queryEmbedding.join(',')}]`}::vector`;
+ const results = await db.select({
+ id: documentSections.id,
+ content: documentSections.content,
+ page: documentSections.pageNumber,
+ documentId: documentSections.documentId,
+ distance: distanceSql
+ }).from(documentSections).where(and(
+ inArray(documentSections.documentId, docIds.map(id => BigInt(id))),
+ sql`${distanceSql} < 0.3`
+ )).orderBy(distanceSql).limit(5);
- return results.map(result => {
- const distance = Number(result.distance) ?? 1;
- const similarity = Math.max(0, (1 - distance) * 0.7);
+ return results.map(result => {
+ const distance = Number(result.distance) ?? 1;
+ const similarity = Math.max(0, (1 - distance) * 0.7);
- return {
- documentId: Number(result.documentId),
- page: result.page ?? 1,
- snippet: truncateText(result.content, 150),
- similarity,
- content: result.content
- };
- });
+ return {
+ documentId: Number(result.documentId),
+ page: result.page ?? 1,
+ snippet: truncateText(result.content, 150),
+ similarity,
+ content: result.content
+ };
+ });
+ } catch (error) {
+ console.warn("Traditional contextual search failed:", sanitizeErrorMessage(error));
+ return [];
+ }
}
async function validateContextualMatch(
diff --git a/src/app/api/agents/predictive-document-analysis/services/hybridSearch.ts b/src/app/api/agents/predictive-document-analysis/services/hybridSearch.ts
new file mode 100644
index 00000000..7eb90b5f
--- /dev/null
+++ b/src/app/api/agents/predictive-document-analysis/services/hybridSearch.ts
@@ -0,0 +1,162 @@
+import { db } from "~/server/db/index";
+import { and, inArray, sql } from "drizzle-orm";
+import { documentSections } from "~/server/db/schema";
+import { getEmbeddings } from "~/app/api/agents/predictive-document-analysis/utils/embeddings";
+import { truncateText } from "~/app/api/agents/predictive-document-analysis/utils/content";
+import type { DocumentMatch } from "~/app/api/agents/predictive-document-analysis/types";
+
+interface RankedResult {
+ documentId: number;
+ page: number;
+ content: string;
+ rank: number;
+}
+
+/**
+ * Keyword full-text search over document sections using PostgreSQL's
+ * to_tsvector / to_tsquery with ts_rank ordering (lexical retrieval leg
+ * of the hybrid search).
+ *
+ * @param query  free-text query; tokenized on whitespace, non-alphanumeric
+ *               characters stripped, remaining terms OR-ed together
+ * @param docIds documents to search within; empty input short-circuits to []
+ * @param limit  maximum number of ranked rows to return
+ * @returns rows carrying a 1-based rank (1 = best ts_rank match)
+ */
+async function bm25Search(
+  query: string,
+  docIds: number[],
+  limit = 10,
+): Promise<RankedResult[]> {
+  if (docIds.length === 0) return [];
+
+  // BUG FIX: sanitize FIRST, then apply the length filter. Stripping
+  // punctuation can shrink a token (e.g. "a)" -> "a"), and single-character
+  // terms should be dropped after sanitization, not before. Stripping
+  // non-alphanumerics also prevents tsquery syntax injection; duplicates
+  // are removed since repeated OR terms add nothing.
+  const tsQuery = Array.from(new Set(
+    query
+      .split(/\s+/)
+      .map(w => w.replace(/[^a-zA-Z0-9]/g, ''))
+      .filter(w => w.length > 1)
+  )).join(' | ');
+
+  if (!tsQuery) return [];
+
+  const results = await db.select({
+    id: documentSections.id,
+    content: documentSections.content,
+    page: documentSections.pageNumber,
+    documentId: documentSections.documentId,
+    rank: sql<number>`ts_rank(to_tsvector('english', ${documentSections.content}), to_tsquery('english', ${tsQuery}))`,
+  })
+    .from(documentSections)
+    .where(and(
+      inArray(documentSections.documentId, docIds.map(id => BigInt(id))),
+      sql`to_tsvector('english', ${documentSections.content}) @@ to_tsquery('english', ${tsQuery})`,
+    ))
+    .orderBy(sql`ts_rank(to_tsvector('english', ${documentSections.content}), to_tsquery('english', ${tsQuery})) DESC`)
+    .limit(limit);
+
+  // Rank is positional: row order already reflects descending ts_rank.
+  return results.map((r, idx) => ({
+    documentId: Number(r.documentId),
+    page: r.page ?? 1,
+    content: r.content,
+    rank: idx + 1,
+  }));
+}
+
+/**
+ * Dense semantic search: ranks document sections by cosine distance
+ * (pgvector `<=>`) between the embedded query and each stored embedding.
+ * Rows at or beyond `threshold` are excluded; results carry a 1-based rank.
+ */
+async function vectorSearch(
+  query: string,
+  docIds: number[],
+  limit = 10,
+  threshold = 0.4,
+): Promise<RankedResult[]> {
+  if (docIds.length === 0) return [];
+
+  const embedding = await getEmbeddings(query);
+  if (embedding.length === 0) return [];
+
+  // pgvector expects the query vector as a bracketed literal.
+  const vectorLiteral = `[${embedding.join(',')}]`;
+  const distanceExpr = sql<number>`${documentSections.embedding} <=> ${vectorLiteral}::vector`;
+
+  const rows = await db.select({
+    id: documentSections.id,
+    content: documentSections.content,
+    page: documentSections.pageNumber,
+    documentId: documentSections.documentId,
+    distance: distanceExpr,
+  })
+    .from(documentSections)
+    .where(and(
+      inArray(documentSections.documentId, docIds.map(id => BigInt(id))),
+      sql`${documentSections.embedding} <=> ${vectorLiteral}::vector < ${threshold}`,
+    ))
+    .orderBy(distanceExpr)
+    .limit(limit);
+
+  // Rank is positional: rows arrive sorted by ascending distance.
+  return rows.map((row, position) => ({
+    documentId: Number(row.documentId),
+    page: row.page ?? 1,
+    content: row.content,
+    rank: position + 1,
+  }));
+}
+
+/**
+ * Reciprocal Rank Fusion: merges ranked lists from different retrieval
+ * methods into a single score per (document, page) pair.
+ *
+ * RRF(d) = sum over lists i of 1 / (k + rank_i(d)) for each list that
+ * contains d. k = 60 is the standard constant from Cormack et al.
+ */
+function reciprocalRankFusion(
+  lists: RankedResult[][],
+  k = 60,
+): Map<string, { score: number; documentId: number; page: number; content: string }> {
+  const fused = new Map<string, { score: number; documentId: number; page: number; content: string }>();
+
+  for (const rankedList of lists) {
+    for (const entry of rankedList) {
+      const key = `${entry.documentId}:${entry.page}`;
+      const contribution = 1 / (k + entry.rank);
+      const accumulated = fused.get(key);
+
+      if (accumulated === undefined) {
+        fused.set(key, {
+          score: contribution,
+          documentId: entry.documentId,
+          page: entry.page,
+          content: entry.content,
+        });
+        continue;
+      }
+
+      accumulated.score += contribution;
+      // Keep the longest snippet seen for this (document, page) pair.
+      if (entry.content.length > accumulated.content.length) {
+        accumulated.content = entry.content;
+      }
+    }
+  }
+
+  return fused;
+}
+
+/**
+ * Hybrid retrieval: runs lexical (full-text) and dense (vector) search in
+ * parallel, fuses the two ranked lists with Reciprocal Rank Fusion, and
+ * returns the top fused matches as DocumentMatch records.
+ */
+export async function hybridSearchWithRRF(
+  query: string,
+  docIds: number[],
+  limit = 8,
+): Promise<DocumentMatch[]> {
+  if (docIds.length === 0) return [];
+
+  // Over-fetch from each retriever so fusion has overlap to work with.
+  // Either retriever failing degrades gracefully to an empty list.
+  const candidateCount = limit * 2;
+  const [lexical, dense] = await Promise.all([
+    bm25Search(query, docIds, candidateCount).catch(() => [] as RankedResult[]),
+    vectorSearch(query, docIds, candidateCount).catch(() => [] as RankedResult[]),
+  ]);
+
+  if (lexical.length === 0 && dense.length === 0) return [];
+
+  const fusedScores = reciprocalRankFusion([lexical, dense]);
+  const ranked = Array.from(fusedScores.values()).sort((a, b) => b.score - a.score);
+
+  return ranked.slice(0, limit).map(r => ({
+    documentId: r.documentId,
+    page: r.page,
+    snippet: truncateText(r.content, 150),
+    // Map the RRF score onto a similarity-like scale, capped below 1.
+    similarity: Math.min(r.score * 60, 0.95),
+    content: r.content,
+  }));
+}
diff --git a/src/app/api/agents/predictive-document-analysis/services/referenceExtractor.ts b/src/app/api/agents/predictive-document-analysis/services/referenceExtractor.ts
index 8f5e688e..892c7863 100644
--- a/src/app/api/agents/predictive-document-analysis/services/referenceExtractor.ts
+++ b/src/app/api/agents/predictive-document-analysis/services/referenceExtractor.ts
@@ -2,7 +2,8 @@ import { ChatOpenAI } from "@langchain/openai";
import { HumanMessage, SystemMessage } from "@langchain/core/messages";
import { z } from "zod";
import type { PdfChunk, DocumentReference } from "~/app/api/agents/predictive-document-analysis/types";
-import { groupContentFromChunks, hasSpecificIdentifier } from "~/app/api/agents/predictive-document-analysis/utils/content";
+import { groupContentFromChunks, isValidReference } from "~/app/api/agents/predictive-document-analysis/utils/content";
+import { sanitizeErrorMessage } from "~/app/api/agents/predictive-document-analysis/utils/logging";
const ReferenceExtractionSchema = z.object({
references: z.array(z.object({
@@ -15,20 +16,25 @@ const ReferenceExtractionSchema = z.object({
function createReferenceExtractionPrompt(content: string): string {
return `
- You are an expert in extracting references from documents.
+ You are an expert at extracting document references from any type of document.
- Extract ONLY clear, explicit references to separate documents that should be attached or included (e.g., "See Exhibit A", "Schedule 1 attached", "Refer to Addendum B").
+ Extract clear, explicit references to separate documents, resources, or materials that the reader is expected to consult but that are NOT included in the current content.
+
+ Examples of references to extract:
+ • "See Exhibit A", "Schedule 1 attached", "Refer to Addendum B"
+ • "Please see syllabus", "refer to the handbook", "as described in the user guide"
+ • "See the policy document", "complete Form W-9", "review the template"
+ • "Posted on Canvas", "available on the course website"
- IMPORTANT RULES:
- - Only extract references that use specific document identifiers (Exhibit A, Schedule 1, Attachment B, etc.)
- - Ignore general mentions like "other documents", "additional forms", "related materials"
- - Ignore references to external documents that are clearly not part of this document set
- - Only include references where the document is expected to be attached or included
- - Be very conservative - when in doubt, don't extract it
+ RULES:
+ - Extract any named document, form, guide, syllabus, handbook, manual, template, or policy that is referenced
+ - Include references where the reader is directed to consult another document ("see", "refer to", "please review", "posted on")
+ - Ignore vague generic mentions like "other documents", "various materials", "related items"
+ - Do NOT extract URLs themselves (those are handled separately)
For each valid reference, provide:
- - name: The specific document identifier (e.g., "Exhibit A", "Schedule 1")
- - type: The document type (exhibit, schedule, attachment, addendum)
+ - name: The document name (e.g., "Exhibit A", "Syllabus", "Employee Handbook", "Form W-9")
+ - type: The document type (exhibit, schedule, attachment, addendum, syllabus, handbook, policy, form, template, guide, manual, other)
- page: The page number where referenced
- contextSnippet: 15-30 words around the reference showing why it should be included
@@ -69,12 +75,12 @@ export async function extractReferences(
const references = response.references;
const filteredReferences = references.filter(ref =>
- hasSpecificIdentifier(ref.documentName)
+ isValidReference(ref.documentName)
);
return filteredReferences;
} catch (error) {
- console.error("Reference extraction error:", error);
+ console.error("Reference extraction error:", sanitizeErrorMessage(error));
return [];
}
}
diff --git a/src/app/api/agents/predictive-document-analysis/stream/route.ts b/src/app/api/agents/predictive-document-analysis/stream/route.ts
new file mode 100644
index 00000000..db5291df
--- /dev/null
+++ b/src/app/api/agents/predictive-document-analysis/stream/route.ts
@@ -0,0 +1,180 @@
+import { NextResponse } from "next/server";
+import { db } from "~/server/db/index";
+import { eq, and, gt, desc, sql } from "drizzle-orm";
+import {
+ predictiveDocumentAnalysisResults,
+ document,
+ documentContextChunks,
+} from "~/server/db/schema";
+import { inngest } from "~/server/inngest/client";
+import { validateRequestBody, PredictiveAnalysisSchema } from "~/lib/validation";
+import { CACHE_CONFIG, ERROR_TYPES, HTTP_STATUS, type AnalysisType } from "~/lib/constants";
+
+export const runtime = "nodejs";
+export const maxDuration = 300;
+
+/**
+ * SSE endpoint for predictive document analysis.
+ *
+ * Dispatches the analysis to Inngest for async processing, then streams
+ * progress updates back to the client via Server-Sent Events. The client
+ * receives:
+ * - "status" events with progress updates
+ * - a "result" event with the full analysis when complete
+ * - an "error" event if the job fails
+ */
+export async function POST(request: Request) {
+ const validation = await validateRequestBody(request, PredictiveAnalysisSchema);
+ if (!validation.success) {
+ return validation.response;
+ }
+
+ const {
+ documentId,
+ analysisType,
+ includeRelatedDocs,
+ timeoutMs,
+ forceRefresh,
+ } = validation.data;
+
+ const typedAnalysisType: AnalysisType = analysisType ?? "general";
+ const typedIncludeRelatedDocs = includeRelatedDocs ?? false;
+
+ // Check cache first (unless forceRefresh)
+ if (!forceRefresh) {
+ const cached = await db
+ .select({ resultJson: predictiveDocumentAnalysisResults.resultJson })
+ .from(predictiveDocumentAnalysisResults)
+ .where(
+ and(
+ eq(predictiveDocumentAnalysisResults.documentId, BigInt(documentId)),
+ eq(predictiveDocumentAnalysisResults.analysisType, typedAnalysisType),
+ eq(predictiveDocumentAnalysisResults.includeRelatedDocs, typedIncludeRelatedDocs),
+ gt(
+ predictiveDocumentAnalysisResults.createdAt,
+ sql`NOW() - INTERVAL '${sql.raw(`${CACHE_CONFIG.TTL_HOURS} hours`)}'`
+ )
+ )
+ )
+ .orderBy(desc(predictiveDocumentAnalysisResults.createdAt))
+ .limit(1);
+
+ if (cached[0]?.resultJson) {
+ const encoder = new TextEncoder();
+ const stream = new ReadableStream({
+ start(controller) {
+ controller.enqueue(encoder.encode(`data: ${JSON.stringify({ type: "result", data: cached[0]!.resultJson, fromCache: true })}\n\n`));
+ controller.close();
+ },
+ });
+ return new Response(stream, {
+ headers: {
+ "Content-Type": "text/event-stream",
+ "Cache-Control": "no-cache",
+ Connection: "keep-alive",
+ },
+ });
+ }
+ }
+
+ // Verify document exists and has chunks
+ const docCheck = await db
+ .select({ id: document.id })
+ .from(document)
+ .where(eq(document.id, documentId))
+ .limit(1);
+
+ if (docCheck.length === 0) {
+ return NextResponse.json(
+ { success: false, message: "Document not found.", errorType: ERROR_TYPES.VALIDATION },
+ { status: HTTP_STATUS.NOT_FOUND }
+ );
+ }
+
+ const chunkCount = await db
+ .select({ count: sql`count(*)` })
+ .from(documentContextChunks)
+ .where(eq(documentContextChunks.documentId, BigInt(documentId)));
+
+ const totalChunks = Number(chunkCount[0]?.count ?? 0);
+ if (totalChunks === 0) {
+ return NextResponse.json(
+ { success: false, message: "No chunks found for document.", errorType: ERROR_TYPES.VALIDATION },
+ { status: HTTP_STATUS.NOT_FOUND }
+ );
+ }
+
+ // Dispatch to Inngest
+ const jobId = `pda-${documentId}-${Date.now()}`;
+ await inngest.send({
+ name: "predictive-analysis/run.requested",
+ data: {
+ documentId,
+ analysisType: typedAnalysisType,
+ includeRelatedDocs: typedIncludeRelatedDocs,
+ timeoutMs,
+ jobId,
+ },
+ });
+
+ // Stream progress via SSE by polling the result table
+ const encoder = new TextEncoder();
+ const maxWaitMs = timeoutMs ?? 120000;
+ const pollIntervalMs = 3000;
+
+ const stream = new ReadableStream({
+ async start(controller) {
+ const send = (event: string, data: unknown) => {
+ controller.enqueue(encoder.encode(`data: ${JSON.stringify({ type: event, data })}\n\n`));
+ };
+
+ send("status", { phase: "queued", totalChunks, jobId });
+
+ const startTime = Date.now();
+ let found = false;
+
+ while (Date.now() - startTime < maxWaitMs) {
+ await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
+
+ const result = await db
+ .select({ resultJson: predictiveDocumentAnalysisResults.resultJson })
+ .from(predictiveDocumentAnalysisResults)
+ .where(
+ and(
+ eq(predictiveDocumentAnalysisResults.documentId, BigInt(documentId)),
+ eq(predictiveDocumentAnalysisResults.analysisType, typedAnalysisType),
+ gt(
+ predictiveDocumentAnalysisResults.createdAt,
+ sql`NOW() - INTERVAL '5 minutes'`
+ )
+ )
+ )
+ .orderBy(desc(predictiveDocumentAnalysisResults.createdAt))
+ .limit(1);
+
+ if (result[0]?.resultJson) {
+ send("result", { ...result[0].resultJson, fromCache: false });
+ found = true;
+ break;
+ }
+
+ const elapsed = Math.round((Date.now() - startTime) / 1000);
+ send("status", { phase: "processing", elapsed, totalChunks, jobId });
+ }
+
+ if (!found) {
+ send("error", { message: "Analysis timed out. Results will be cached when complete." });
+ }
+
+ controller.close();
+ },
+ });
+
+ return new Response(stream, {
+ headers: {
+ "Content-Type": "text/event-stream",
+ "Cache-Control": "no-cache",
+ Connection: "keep-alive",
+ },
+ });
+}
diff --git a/src/app/api/agents/predictive-document-analysis/types.ts b/src/app/api/agents/predictive-document-analysis/types.ts
index efe9669c..ece55243 100644
--- a/src/app/api/agents/predictive-document-analysis/types.ts
+++ b/src/app/api/agents/predictive-document-analysis/types.ts
@@ -2,6 +2,7 @@ export type PdfChunk = {
id: number;
content: string;
page: number;
+ sectionHeading?: string | null;
};
export type AnalysisSpecification = {
@@ -52,9 +53,30 @@ export type ResolvedReference = {
priority: 'high' | 'medium' | 'low';
};
+export type InsightCategory =
+ | 'deadline'
+ | 'resource'
+ | 'key-reference'
+ | 'action-item'
+ | 'caveat';
+
+export type InsightSeverity = 'note' | 'warning';
+
+export type DocumentInsight = {
+ category: InsightCategory;
+ severity: InsightSeverity;
+ title: string;
+ detail: string;
+ page: number;
+ sourceQuote?: string;
+ url?: string;
+ date?: string;
+};
+
export type PredictiveAnalysisResult = {
missingDocuments: MissingDocumentPrediction[];
recommendations: string[];
+ insights?: DocumentInsight[];
suggestedRelatedDocuments?: SearchResult[];
resolvedDocuments?: ResolvedReference[];
};
@@ -90,5 +112,8 @@ export const ANALYSIS_TYPES = {
financial: `You are an expert in analyzing financial documents to identify missing reports, statements, and supporting documentation.`,
technical: `You are an expert in analyzing technical documents to identify missing specifications, manuals, and project deliverables.`,
compliance: `You are an expert in analyzing compliance documents to identify missing regulatory filings and policy documents.`,
+ educational: `You are an expert in analyzing educational materials to identify missing referenced course documents, syllabi, handouts, readings, and linked resources.`,
+ hr: `You are an expert in analyzing HR and employee documents to identify missing referenced policies, forms, benefits materials, and compliance documents.`,
+ research: `You are an expert in analyzing research documents to identify missing cited papers, datasets, supplementary materials, and methodology references.`,
general: `You are an expert in analyzing documents to identify any missing referenced or implied documents.`
} as const;
\ No newline at end of file
diff --git a/src/app/api/agents/predictive-document-analysis/utils/batching.ts b/src/app/api/agents/predictive-document-analysis/utils/batching.ts
index 5adada6e..06b35a2f 100644
--- a/src/app/api/agents/predictive-document-analysis/utils/batching.ts
+++ b/src/app/api/agents/predictive-document-analysis/utils/batching.ts
@@ -6,7 +6,13 @@ export type ChunkBatchingOptions = {
};
/**
- * Groups sequential chunks to keep OpenAI round trips bounded.
+ * Groups chunks into batches, keeping semantically related chunks together.
+ *
+ * Strategy:
+ * 1. Group by section heading (when available from document structure).
+ * 2. Within each section group, chunks stay in page order.
+ * 3. Section groups are packed into batches respecting size limits.
+ * 4. Falls back to sequential batching when no structure data exists.
*/
export function createChunkBatches(
chunks: PdfChunk[],
@@ -21,6 +27,72 @@ export function createChunkBatches(
throw new Error("maxCharactersPerCall must be greater than zero");
}
+ const hasStructure = chunks.some(c => c.sectionHeading);
+ if (!hasStructure) {
+ return sequentialBatch(chunks, maxChunksPerCall, maxCharactersPerCall);
+ }
+
+ // Group by section heading, preserving insertion order
+ const sectionGroups: PdfChunk[][] = [];
+ const groupMap = new Map();
+
+ for (const chunk of chunks) {
+ const key = chunk.sectionHeading ?? `__page_${chunk.page}`;
+ let group = groupMap.get(key);
+ if (!group) {
+ group = [];
+ groupMap.set(key, group);
+ sectionGroups.push(group);
+ }
+ group.push(chunk);
+ }
+
+ // Pack section groups into batches
+ const batches: PdfChunk[][] = [];
+ let currentBatch: PdfChunk[] = [];
+ let currentCharCount = 0;
+
+ for (const group of sectionGroups) {
+ const groupCharCount = group.reduce((s, c) => s + (c.content?.length ?? 0), 0);
+
+ // If this entire group fits in the current batch, add it
+ const wouldExceedChunks = currentBatch.length + group.length > maxChunksPerCall;
+ const wouldExceedChars = currentCharCount + groupCharCount > maxCharactersPerCall;
+
+ if (currentBatch.length > 0 && (wouldExceedChunks || wouldExceedChars)) {
+ batches.push(currentBatch);
+ currentBatch = [];
+ currentCharCount = 0;
+ }
+
+ // If the group itself is too large, split it sequentially
+ if (group.length > maxChunksPerCall || groupCharCount > maxCharactersPerCall) {
+ if (currentBatch.length > 0) {
+ batches.push(currentBatch);
+ currentBatch = [];
+ currentCharCount = 0;
+ }
+ const subBatches = sequentialBatch(group, maxChunksPerCall, maxCharactersPerCall);
+ batches.push(...subBatches);
+ continue;
+ }
+
+ currentBatch.push(...group);
+ currentCharCount += groupCharCount;
+ }
+
+ if (currentBatch.length > 0) {
+ batches.push(currentBatch);
+ }
+
+ return batches;
+}
+
+function sequentialBatch(
+ chunks: PdfChunk[],
+ maxChunksPerCall: number,
+ maxCharactersPerCall: number,
+): PdfChunk[][] {
const batches: PdfChunk[][] = [];
let currentBatch: PdfChunk[] = [];
let currentCharCount = 0;
diff --git a/src/app/api/agents/predictive-document-analysis/utils/content.ts b/src/app/api/agents/predictive-document-analysis/utils/content.ts
index 30eb045e..3c18baf6 100644
--- a/src/app/api/agents/predictive-document-analysis/utils/content.ts
+++ b/src/app/api/agents/predictive-document-analysis/utils/content.ts
@@ -2,7 +2,12 @@ import type { PdfChunk } from "~/app/api/agents/predictive-document-analysis/typ
export function groupContentFromChunks(chunks: PdfChunk[]): string {
return chunks
- .map((chunk) => `=== Page ${chunk.page} ===\n${chunk.content}`)
+ .map((chunk) => {
+ const header = chunk.sectionHeading
+ ? `=== Page ${chunk.page} | Section: ${chunk.sectionHeading} ===`
+ : `=== Page ${chunk.page} ===`;
+ return `${header}\n${chunk.content}`;
+ })
.join("\n\n");
}
@@ -59,12 +64,44 @@ export function truncateText(text: string, maxLength = 200): string {
return text.slice(0, maxLength - 3) + '...';
}
+/**
+ * @deprecated Use isValidReference() instead. Kept for backward compatibility.
+ */
export function hasSpecificIdentifier(documentName: string): boolean {
- const identifierPatterns = [
- /^(exhibit|schedule|attachment|addendum|appendix)\s+[a-z0-9]+$/i,
- /^[a-z]+\s+(exhibit|schedule|attachment|addendum|appendix)$/i,
- /^(section|clause|article)\s+[0-9]+(\.[0-9]+)*$/i
+ return isValidReference(documentName);
+}
+
+const GENERIC_REJECT_PHRASES = new Set([
+ 'other documents', 'additional forms', 'related materials',
+ 'various materials', 'related items', 'supporting documents',
+ 'other files', 'additional documents', 'various documents',
+ 'relevant documents', 'necessary documents', 'required documents',
+]);
+
+export function isValidReference(documentName: string): boolean {
+ const name = documentName.trim();
+ if (name.length < 2 || name.length > 200) return false;
+
+ if (GENERIC_REJECT_PHRASES.has(name.toLowerCase())) return false;
+
+ const validPatterns = [
+ // Legal/contract identifiers (original)
+ /^(exhibit|schedule|attachment|addendum|appendix)\s+[a-z0-9]+/i,
+ /^[a-z]+\s+(exhibit|schedule|attachment|addendum|appendix)/i,
+ /^(section|clause|article)\s+[0-9]+(\.[0-9]+)*/i,
+
+ // Named document types (educational, HR, general)
+ /\b(syllabus|handbook|manual|policy|guidelines?|template|curriculum|prospectus)\b/i,
+ /\b(form|certificate|license|permit|charter|bylaws?|constitution)\b/i,
+ /\b(report|statement|agreement|contract|memo|memorandum|notice)\b/i,
+ /\b(plan|proposal|specification|diagram|blueprint|schematic)\b/i,
+ /\b(agenda|minutes|transcript|roster|directory|catalog|brochure)\b/i,
+ /\b(guide|tutorial|worksheet|workbook|checklist|rubric)\b/i,
+
+ // Numbered / coded references
+ /\b(form|table|figure|chart)\s+[a-z0-9\-\.]+/i,
+ /^[A-Z]{1,5}[-\s]?\d+/,
];
-
- return identifierPatterns.some(pattern => pattern.test(documentName.trim()));
+
+ return validPatterns.some(pattern => pattern.test(name));
}
\ No newline at end of file
diff --git a/src/app/api/agents/predictive-document-analysis/utils/embeddings.ts b/src/app/api/agents/predictive-document-analysis/utils/embeddings.ts
index 058364d5..48f7d32b 100644
--- a/src/app/api/agents/predictive-document-analysis/utils/embeddings.ts
+++ b/src/app/api/agents/predictive-document-analysis/utils/embeddings.ts
@@ -1,16 +1,24 @@
import { OpenAIEmbeddings } from "@langchain/openai";
+import { LRUCache } from "lru-cache";
+import { sanitizeErrorMessage } from "~/app/api/agents/predictive-document-analysis/utils/logging";
-const embeddingCache = new Map();
+const EMBEDDING_MODEL = "text-embedding-3-large";
+const MAX_CACHE_ENTRIES = 500;
+
+const embeddingCache = new LRUCache({
+ max: MAX_CACHE_ENTRIES,
+});
export async function getEmbeddings(text: string): Promise {
- if (embeddingCache.has(text)) {
- return embeddingCache.get(text)!;
+ const cached = embeddingCache.get(text);
+ if (cached) {
+ return cached;
}
try {
const embeddings = new OpenAIEmbeddings({
openAIApiKey: process.env.OPENAI_API_KEY,
- modelName: "text-embedding-ada-002",
+ modelName: EMBEDDING_MODEL,
});
const [embedding] = await embeddings.embedDocuments([text]);
@@ -19,7 +27,7 @@ export async function getEmbeddings(text: string): Promise {
embeddingCache.set(text, result);
return result;
} catch (error) {
- console.error("Error getting embeddings:", error);
+ console.error("Error getting embeddings:", sanitizeErrorMessage(error));
return [];
}
}
@@ -30,7 +38,7 @@ export async function batchGetEmbeddings(texts: string[]): Promise {
try {
const embeddings = new OpenAIEmbeddings({
openAIApiKey: process.env.OPENAI_API_KEY,
- modelName: "text-embedding-ada-002",
+ modelName: EMBEDDING_MODEL,
});
const results = await embeddings.embedDocuments(uniqueTexts);
@@ -42,7 +50,7 @@ export async function batchGetEmbeddings(texts: string[]): Promise {
return texts.map(text => embeddingMap.get(text) ?? []);
} catch (error) {
- console.error("Error getting batch embeddings:", error);
+ console.error("Error getting batch embeddings:", sanitizeErrorMessage(error));
return texts.map(() => []);
}
}
@@ -54,6 +62,6 @@ export function clearEmbeddingCache(): void {
export function getEmbeddingCacheStats() {
return {
size: embeddingCache.size,
- entries: Array.from(embeddingCache.keys()).slice(0, 5) // First 5 keys for debugging
+ maxSize: MAX_CACHE_ENTRIES,
};
-}
\ No newline at end of file
+}
diff --git a/src/app/api/agents/predictive-document-analysis/utils/insightExtractors.ts b/src/app/api/agents/predictive-document-analysis/utils/insightExtractors.ts
new file mode 100644
index 00000000..a85512bd
--- /dev/null
+++ b/src/app/api/agents/predictive-document-analysis/utils/insightExtractors.ts
@@ -0,0 +1,537 @@
+import type { PdfChunk, DocumentInsight } from "~/app/api/agents/predictive-document-analysis/types";
+import stringSimilarity from 'string-similarity-js';
+
+// ---------------------------------------------------------------------------
+// Layer 1 — Deterministic insight extraction (zero LLM cost)
+// ---------------------------------------------------------------------------
+
+// ── Document format detection ─────────────────────────────────
+
+export type DocumentFormat = 'slides' | 'prose' | 'mixed';
+
+export function detectDocumentFormat(chunks: PdfChunk[]): DocumentFormat {
+ if (chunks.length === 0) return 'mixed';
+
+ const totalChars = chunks.reduce((s, c) => s + (c.content?.length ?? 0), 0);
+ const avgChunkLen = totalChars / chunks.length;
+ const totalPages = new Set(chunks.map(c => c.page)).size;
+
+ const bulletPattern = /[●•○◦▪▸▹►–—]\s|^\s*[-*]\s/m;
+ const bulletChunks = chunks.filter(c => bulletPattern.test(c.content ?? ''));
+ const bulletRatio = bulletChunks.length / chunks.length;
+
+ if (avgChunkLen < 500 && totalPages > 15 && bulletRatio > 0.25) return 'slides';
+ if (avgChunkLen > 1200 && bulletRatio < 0.15) return 'prose';
+ return 'mixed';
+}
+
+// ── Sentence boundary helper ──────────────────────────────────
+
/**
 * Expands a regex match to the sentence that contains it.
 *
 * Looks up to 300 chars backwards for the previous sentence break
 * ('.'/'!'/'?' followed by whitespace, or a blank line) and up to 300 chars
 * forward for the next one; when no break is found within the window, falls
 * back to a hard 300-char cut on that side. Returns the trimmed slice.
 */
function extractSurroundingSentence(text: string, matchIndex: number, matchLength: number): string {
  const sentenceBreak = /[.!?]\s+|[\n\r]{2,}/;

  // Backwards: split the preceding window on sentence breaks; the last piece
  // is the partial sentence that leads into the match, so its length is the
  // distance from the previous break to matchIndex.
  let start = matchIndex;
  const searchBack = text.slice(Math.max(0, matchIndex - 300), matchIndex);
  const backParts = searchBack.split(sentenceBreak);
  if (backParts.length > 1) {
    start = matchIndex - (backParts[backParts.length - 1]?.length ?? 0);
  } else {
    // No break in the window — take the whole 300-char window.
    start = Math.max(0, matchIndex - 300);
  }

  // Forwards: extend to just past the first sentence break after the match.
  let end = matchIndex + matchLength;
  const searchForward = text.slice(end, end + 300);
  const fwdMatch = sentenceBreak.exec(searchForward);
  if (fwdMatch?.index !== undefined) {
    end = end + fwdMatch.index + fwdMatch[0].length;
  } else {
    end = Math.min(text.length, end + 300);
  }

  return text.slice(start, end).trim();
}
+
+// ── extractDeadlines ──────────────────────────────────────────
+
+const DEADLINE_PATTERNS = [
+ /\b(?:due|deadline)\s*(?:on|by|:)?\s*(.{3,40}?\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b[^.!?\n]{0,30})/gi,
+ /\b(?:due|deadline)\s*(?:on|by|:)?\s*(\d{1,2}[\/\-]\d{1,2}(?:[\/\-]\d{2,4})?)/gi,
+ /\b(?:submit|turn\s*in|hand\s*in)\s+(?:by|before)\s+(.{3,60})/gi,
+ /\b(homework\s+\d+[^.!?\n]{0,60}(?:due|deadline)[^.!?\n]{0,40})/gi,
+ /\b(assignment\s+\d+[^.!?\n]{0,60}(?:due|deadline)[^.!?\n]{0,40})/gi,
+ /\b(?:homework|assignment|project|paper|essay|lab)\s+\d*\s*(?:is\s+)?due\b[^.!?\n]{0,60}/gi,
+ /\b(?:quiz|exam|test|midterm|final)\s+(?:on|:)\s*(.{3,60})/gi,
+ /\b(midterm|final\s+exam|final\s+project)[^.!?\n]{0,80}/gi,
+];
+
+export function extractDeadlines(chunks: PdfChunk[]): DocumentInsight[] {
+ const seen = new Set();
+ const results: DocumentInsight[] = [];
+
+ for (const chunk of chunks) {
+ const text = chunk.content ?? '';
+ if (!text) continue;
+
+ for (const pattern of DEADLINE_PATTERNS) {
+ pattern.lastIndex = 0;
+ let match: RegExpExecArray | null;
+ while ((match = pattern.exec(text)) !== null) {
+ const matchedText = match[0].trim();
+ const normalizedKey = matchedText.toLowerCase().replace(/\s+/g, ' ').slice(0, 60);
+ if (seen.has(normalizedKey)) continue;
+ seen.add(normalizedKey);
+
+ const sentence = extractSurroundingSentence(text, match.index, match[0].length);
+ const title = matchedText.length > 60
+ ? matchedText.slice(0, 57) + '...'
+ : matchedText;
+
+ results.push({
+ category: 'deadline',
+ severity: 'warning',
+ title,
+ detail: sentence,
+ page: chunk.page,
+ sourceQuote: matchedText,
+ date: match[1]?.trim(),
+ });
+ }
+ }
+ }
+
+ return results;
+}
+
+// ── extractRecurringReferences (rewritten with anti-header heuristics) ─────
+
+const CAPITALIZED_PHRASE = /(?:[A-Z][a-zA-Z]+(?:'s)?(?:[\s,&]+|[-])){1,4}[A-Z][a-zA-Z]+/g;
+const QUOTED_TITLE = /"([^"]{5,80})"/g;
+
+const STOPLIST = new Set([
+ 'the', 'this', 'that', 'these', 'those', 'page', 'slide',
+ 'please', 'note', 'see', 'also', 'section', 'chapter',
+ 'figure', 'table', 'part', 'item', 'class', 'lecture',
+ 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
+ 'saturday', 'sunday', 'january', 'february', 'march', 'april',
+ 'may', 'june', 'july', 'august', 'september', 'october',
+ 'november', 'december', 'spring', 'fall', 'summer', 'winter',
+ 'common', 'design', 'overview', 'introduction', 'review',
+ 'summary', 'conclusion', 'questions', 'discussion', 'agenda',
+ 'outline', 'objectives', 'learning', 'today', 'next',
+ 'evaluation', 'prototype', 'prototyping', 'testing',
+ 'analysis', 'methodology', 'framework', 'approach',
+ 'project', 'assignment', 'activity', 'exercise',
+ 'high', 'low', 'mid', 'fidelity',
+]);
+
+function isStopPhrase(phrase: string): boolean {
+ const words = phrase.toLowerCase().split(/\s+/);
+ if (words.length < 2) return true;
+ if (words.every(w => STOPLIST.has(w))) return true;
+ if (words.length === 2 && words.some(w => STOPLIST.has(w ?? ''))) return true;
+ if (words[0] && STOPLIST.has(words[0]) && words.length <= 3) return true;
+ if (/^(In|On|At|By|For|To|The|This|That|If|As|Or|An?|What|Why|How|When|Where)\s/i.test(phrase)) return true;
+ return false;
+}
+
+type PhraseInfo = {
+ phrase: string;
+ pages: Set;
+ firstPage: number;
+ contexts: string[];
+};
+
+function computeMaxGap(pages: Set): number {
+ const sorted = [...pages].sort((a, b) => a - b);
+ let maxGap = 0;
+ for (let i = 1; i < sorted.length; i++) {
+ maxGap = Math.max(maxGap, (sorted[i] ?? 0) - (sorted[i - 1] ?? 0));
+ }
+ return maxGap;
+}
+
/**
 * True when at least one later context's similarity to the first context
 * falls below `threshold` (comparing the first 200 lowercase chars of each).
 * A phrase whose every occurrence sits in near-identical text is likely a
 * repeated header/footer rather than a genuine reference, and returns false.
 * Fewer than two contexts can never be diverse.
 */
function contextsAreDiverse(contexts: string[], threshold = 0.8): boolean {
  if (contexts.length <= 1) return false;
  for (let i = 1; i < contexts.length; i++) {
    const sim = stringSimilarity(
      (contexts[0] ?? '').toLowerCase().slice(0, 200),
      (contexts[i] ?? '').toLowerCase().slice(0, 200),
    );
    if (sim < threshold) return true;
  }
  return false;
}
+
/**
 * Finds phrases (Title-Case runs and quoted titles) recurring across many
 * pages and emits them as 'key-reference' insights.
 *
 * Filters, in order: stop-phrase rejection; phrases fuzzily matching a known
 * section heading (structure, not references); minimum distinct-page count
 * and minimum max-gap between pages (both stricter for slide decks, which
 * repeat boilerplate); context diversity (repeated identical context is
 * usually a header/footer); and a first-page heuristic that drops phrases
 * appearing on the first page AND on >8% of all pages (likely the document's
 * own title/branding). Returns the top 5 by distinct-page count.
 */
export function extractRecurringReferences(
  chunks: PdfChunk[],
  format: DocumentFormat = 'mixed',
): DocumentInsight[] {
  // Normalized section headings, used to reject structural phrases below.
  const sectionHeaders = new Set();
  for (const chunk of chunks) {
    if (chunk.sectionHeading) {
      sectionHeaders.add(chunk.sectionHeading.toLowerCase().replace(/\s+/g, ' ').trim());
    }
  }

  // normalized phrase -> PhraseInfo accumulator
  const frequencyMap = new Map();

  for (const chunk of chunks) {
    const text = chunk.content ?? '';
    if (!text) continue;

    // Collect candidate phrases: capitalized runs plus quoted titles.
    const phrases: string[] = [];

    CAPITALIZED_PHRASE.lastIndex = 0;
    let m: RegExpExecArray | null;
    while ((m = CAPITALIZED_PHRASE.exec(text)) !== null) {
      phrases.push(m[0].trim());
    }

    QUOTED_TITLE.lastIndex = 0;
    while ((m = QUOTED_TITLE.exec(text)) !== null) {
      if (m[1]) phrases.push(m[1].trim());
    }

    for (const raw of phrases) {
      if (isStopPhrase(raw)) continue;

      const key = raw.toLowerCase().replace(/\s+/g, ' ');

      // Drop phrases that look like section headings (fuzzy or substring match).
      let matchesHeader = false;
      for (const header of sectionHeaders) {
        if (stringSimilarity(key, header) > 0.7 || header.includes(key) || key.includes(header)) {
          matchesHeader = true;
          break;
        }
      }
      if (matchesHeader) continue;

      const existing = frequencyMap.get(key);
      if (existing) {
        existing.pages.add(chunk.page);
        // Keep at most 4 sample contexts per phrase for the diversity check.
        if (existing.contexts.length < 4) {
          const idx = text.indexOf(raw);
          if (idx >= 0) {
            existing.contexts.push(extractSurroundingSentence(text, idx, raw.length));
          }
        }
      } else {
        const idx = text.indexOf(raw);
        const context = idx >= 0
          ? extractSurroundingSentence(text, idx, raw.length)
          : raw;

        frequencyMap.set(key, {
          phrase: raw,
          pages: new Set([chunk.page]),
          firstPage: chunk.page,
          contexts: [context],
        });
      }
    }
  }

  // Slide decks repeat boilerplate, so require more pages and a wider spread.
  const minPages = format === 'slides' ? 5 : 3;
  const minGap = format === 'slides' ? 10 : 3;

  const allPages = new Set(chunks.map(c => c.page));
  const totalPageCount = allPages.size;
  // Lowercased content of the document's first page (for the title heuristic).
  const firstPageContent = chunks
    .filter(c => c.page === Math.min(...allPages))
    .map(c => (c.content ?? '').toLowerCase())
    .join(' ');

  const recurring = [...frequencyMap.values()]
    .filter(info => {
      if (info.pages.size < minPages) return false;
      if (computeMaxGap(info.pages) < minGap) return false;
      if (!contextsAreDiverse(info.contexts)) return false;

      // Present on page 1 and on >8% of pages => likely the document's own
      // title or branding, not an external reference.
      const key = info.phrase.toLowerCase().replace(/\s+/g, ' ');
      if (firstPageContent.includes(key) && info.pages.size / totalPageCount > 0.08) {
        return false;
      }
      return true;
    })
    .sort((a, b) => b.pages.size - a.pages.size)
    .slice(0, 5);

  return recurring.map(info => ({
    category: 'key-reference' as const,
    severity: 'note' as const,
    title: info.phrase,
    detail: `Referenced on ${info.pages.size} pages (${[...info.pages].sort((a, b) => a - b).join(', ')})`,
    page: info.firstPage,
    sourceQuote: info.contexts[0]
      ? (info.contexts[0].length > 200 ? info.contexts[0].slice(0, 197) + '...' : info.contexts[0])
      : undefined,
  }));
}
+
+// ── extractResourceSuggestions ─────────────────────────────────
+
+const URL_REGEX = /https?:\/\/[^\s<>"')\]},]+/gi;
+
+const VIDEO_DOMAINS = new Set([
+ 'youtube.com', 'youtu.be', 'vimeo.com', 'dailymotion.com',
+ 'twitch.tv', 'wistia.com', 'loom.com',
+]);
+
+const RESOURCE_ACTION_RE = /\b(?:watch|view|see|check\s*out|review|read|visit|explore|look\s*at|refer\s*to|go\s*to)\b/i;
+const RESOURCE_FRAMING_RE = /\b(?:recommended|suggested|required|optional|useful|helpful|important|reference|supplementary|additional)\s+(?:reading|viewing|video|resource|material|link)/i;
+
+function isVideoDomain(hostname: string): boolean {
+ const cleaned = hostname.replace(/^www\./, '');
+ return VIDEO_DOMAINS.has(cleaned);
+}
+
+const TRAILING_JUNK_RE = /\s+(?:et|and|or|the|a|an|in|on|of|for|by|to|at|is|are|was|with|from)\s*$/i;
+
+function cleanSnippet(words: string[], maxWords: number): string {
+ let snippet = words.slice(0, maxWords).join(' ');
+ snippet = snippet.replace(TRAILING_JUNK_RE, '');
+ if (snippet.length < 4 && words.length > maxWords) {
+ snippet = words.slice(0, maxWords + 2).join(' ').replace(TRAILING_JUNK_RE, '');
+ }
+ return snippet;
+}
+
/**
 * Builds a short display title for a linked resource, trying in order:
 * 1. video domains → "Watch: <context snippet>";
 * 2. an action verb in the context → "<Verb>: <text after the verb>";
 * 3. the URL's path → "Resource: <domain><path>";
 * 4. fallback → "Resource: <domain>".
 */
function buildResourceTitle(url: string, context: string, hostname: string): string {
  if (isVideoDomain(hostname)) {
    // Strip URLs and bullet glyphs from the context, then snip a few words.
    const contextClean = context.replace(/https?:\/\/[^\s]+/g, '').trim();
    const words = contextClean.split(/\s+/).filter(w => w.length > 1 && !/^[●•○▪►–—*]$/.test(w));
    const snippet = cleanSnippet(words, 6);
    if (snippet.length > 5) return `Watch: ${snippet}`;
    return `Watch: Video on ${hostname.replace(/^www\./, '')}`;
  }

  // Use the text right after an action verb ("see", "read", …) as the title.
  const actionMatch = RESOURCE_ACTION_RE.exec(context);
  if (actionMatch) {
    const afterAction = context.slice((actionMatch.index ?? 0) + actionMatch[0].length).trim();
    const clean = afterAction.replace(/https?:\/\/[^\s]+/g, '').trim();
    const words = clean.split(/\s+/).filter(w => w.length > 1);
    const snippet = cleanSnippet(words, 6);
    if (snippet.length > 5) {
      const verb = actionMatch[0].charAt(0).toUpperCase() + actionMatch[0].slice(1).toLowerCase();
      return `${verb}: ${snippet}`;
    }
  }

  // Fall back to domain + (truncated) path, or just the domain.
  const domain = hostname.replace(/^www\./, '');
  try {
    const parsed = new URL(url);
    const path = parsed.pathname.replace(/\/+$/, '');
    if (path && path !== '/') {
      const pathSnippet = path.length > 30 ? path.slice(0, 27) + '...' : path;
      return `Resource: ${domain}${pathSnippet}`;
    }
  } catch { /* skip */ }

  return `Resource: ${domain}`;
}
+
+export function extractResourceSuggestions(chunks: PdfChunk[]): DocumentInsight[] {
+ const seen = new Set();
+ const results: DocumentInsight[] = [];
+
+ for (const chunk of chunks) {
+ const text = chunk.content ?? '';
+ if (!text) continue;
+
+ URL_REGEX.lastIndex = 0;
+ let urlMatch: RegExpExecArray | null;
+ while ((urlMatch = URL_REGEX.exec(text)) !== null) {
+ const rawUrl = urlMatch[0].replace(/[.,;:!?)]+$/, '');
+ if (seen.has(rawUrl)) continue;
+ seen.add(rawUrl);
+
+ let hostname: string;
+ try {
+ hostname = new URL(rawUrl).hostname;
+ } catch {
+ continue;
+ }
+
+ const contextStart = Math.max(0, urlMatch.index - 200);
+ const contextEnd = Math.min(text.length, urlMatch.index + urlMatch[0].length + 200);
+ const contextWindow = text.slice(contextStart, contextEnd);
+
+ const hasActionVerb = RESOURCE_ACTION_RE.test(contextWindow);
+ const hasFraming = RESOURCE_FRAMING_RE.test(contextWindow);
+ const isVideo = isVideoDomain(hostname);
+
+ if (!hasActionVerb && !hasFraming && !isVideo) continue;
+
+ const sentence = extractSurroundingSentence(text, urlMatch.index, urlMatch[0].length);
+ const title = buildResourceTitle(rawUrl, contextWindow, hostname);
+
+ results.push({
+ category: 'resource',
+ severity: 'note',
+ title,
+ detail: sentence.replace(/https?:\/\/[^\s]+/g, '').trim() || `Resource linked on page ${chunk.page}`,
+ page: chunk.page,
+ url: rawUrl,
+ sourceQuote: sentence.length > 200 ? sentence.slice(0, 197) + '...' : sentence,
+ });
+ }
+ }
+
+ return results;
+}
+
+// ── extractActionItems ────────────────────────────────────────
+
// Numbered coursework artifacts ("Activity 3", "Lab 2", …). The homework and
// quiz variants use negative lookaheads so due/scheduling phrasing is left to
// extractDeadlines instead of being double-reported here.
const ASSIGNMENT_PATTERNS = [
  /\b(entrance\s+ticket\s*\d*)[^.!?\n]{0,80}/gi,
  /\b(activity\s+\d+)[^.!?\n]{0,80}/gi,
  /\b(exercise\s+\d+)[^.!?\n]{0,80}/gi,
  /\b(lab\s+\d+)[^.!?\n]{0,80}/gi,
  /\b(homework\s+\d+)(?!\s*(?:is\s+)?due)[^.!?\n]{0,80}/gi,
  /\b(quiz\s+\d+)(?!\s+on)[^.!?\n]{0,80}/gi,
  /\b(project\s+\d+)[^.!?\n]{0,80}/gi,
];

// "post/submit/… on <platform>" tasks (Canvas, Piazza, Gradescope, …).
const PLATFORM_TASK_RE = /\b(?:post|submit|upload|share|register|sign\s*up|enroll|respond|reply|introduce\s+yourself|self[- ]?intro(?:duction)?)\s+(?:on|to|via|at|in|through)\s+(canvas|courselore|piazza|blackboard|moodle|teams|slack|gradescope|sakai|brightspace|discord|github|google\s*(?:classroom|drive|docs|forms))[^.!?\n]{0,60}/gi;

// Line-leading imperatives ("Complete …", "Don't forget …"); multiline ^.
const IMPERATIVE_RE = /^(?:complete|prepare|bring|create|write|read|review|finish|attend|watch|sign\s*up\s+for|set\s*up|make\s*sure|don'?t\s+forget|remember\s+to)\s+[^.!?\n]{5,80}/gim;

// Time-pressure wording; its presence upgrades severity to 'warning'.
const URGENCY_RE = /\b(?:before\s+(?:next|class|lecture|lab|section)|by\s+(?:end\s+of|next|tomorrow|monday|tuesday|wednesday|thursday|friday)|asap|immediately|today|tonight)\b/i;

/**
 * Extracts 'action-item' insights from three pattern families: numbered
 * assignments, platform tasks, and imperative instructions. Severity is
 * 'warning' when the surrounding sentence contains urgency wording, else
 * 'note'. Dedupes on the first 60 chars of the normalized match text.
 */
export function extractActionItems(chunks: PdfChunk[]): DocumentInsight[] {
  const seen = new Set();
  const results: DocumentInsight[] = [];

  // Shared dedupe + insight construction for all three pattern families.
  function addResult(matchedText: string, text: string, matchIndex: number, matchLength: number, page: number) {
    const normalizedKey = matchedText.toLowerCase().replace(/\s+/g, ' ').slice(0, 60);
    if (seen.has(normalizedKey)) return;
    seen.add(normalizedKey);

    const sentence = extractSurroundingSentence(text, matchIndex, matchLength);
    const title = matchedText.length > 60
      ? matchedText.slice(0, 57) + '...'
      : matchedText;

    // Urgency is judged on the whole sentence, not just the match.
    const isUrgent = URGENCY_RE.test(sentence);

    results.push({
      category: 'action-item',
      severity: isUrgent ? 'warning' : 'note',
      title,
      detail: sentence,
      page,
      sourceQuote: matchedText,
    });
  }

  for (const chunk of chunks) {
    const text = chunk.content ?? '';
    if (!text) continue;

    for (const pattern of ASSIGNMENT_PATTERNS) {
      pattern.lastIndex = 0; // global regexes are stateful; rescan from start
      let match: RegExpExecArray | null;
      while ((match = pattern.exec(text)) !== null) {
        addResult(match[0].trim(), text, match.index, match[0].length, chunk.page);
      }
    }

    PLATFORM_TASK_RE.lastIndex = 0;
    let match: RegExpExecArray | null;
    while ((match = PLATFORM_TASK_RE.exec(text)) !== null) {
      addResult(match[0].trim(), text, match.index, match[0].length, chunk.page);
    }

    IMPERATIVE_RE.lastIndex = 0;
    while ((match = IMPERATIVE_RE.exec(text)) !== null) {
      addResult(match[0].trim(), text, match.index, match[0].length, chunk.page);
    }
  }

  return results;
}
+
+// ── extractCaveats ────────────────────────────────────────────
+
// Policy/restriction patterns, each with a fixed severity: hard consequences
// and prohibitions are 'warning'; requirements and AI-use mentions are 'note'.
const CAVEAT_PATTERNS: Array<{ pattern: RegExp; severity: 'warning' | 'note' }> = [
  { pattern: /\b(academic\s+integrity\s*(?:code|policy|violation)?)[^.!?\n]{0,100}/gi, severity: 'warning' },
  { pattern: /\b(plagiarism\s+(?:policy|will|is|results?)[^.!?\n]{0,80})/gi, severity: 'warning' },
  { pattern: /\b(honor\s+code)[^.!?\n]{0,80}/gi, severity: 'warning' },
  { pattern: /\b(zero\s+tolerance)[^.!?\n]{0,80}/gi, severity: 'warning' },
  { pattern: /\b(will\s+result\s+in\s+(?:a\s+)?(?:failing|zero|grade\s+of|expulsion|suspension|penalty))[^.!?\n]{0,60}/gi, severity: 'warning' },
  { pattern: /\b(generative\s+AI|ChatGPT|AI[- ]?(?:generated|policy|use|tools?))\b[^.!?\n]{0,100}/gi, severity: 'note' },
  { pattern: /\b((?:is|are)\s+(?:prohibited|not\s+allowed|strictly\s+forbidden|not\s+permitted))[^.!?\n]{0,80}/gi, severity: 'warning' },
  { pattern: /\b((?:required|mandatory|prerequisite|must\s+(?:complete|attend|submit|bring|have)))[^.!?\n]{0,80}/gi, severity: 'note' },
  { pattern: /\b(late\s+(?:penalty|submission|work|assignments?)\s*(?:policy|will|:)?)[^.!?\n]{0,80}/gi, severity: 'warning' },
  { pattern: /\b(attendance\s+(?:policy|is\s+(?:required|mandatory)))[^.!?\n]{0,80}/gi, severity: 'note' },
];

/**
 * Extracts 'caveat' insights (integrity policies, prohibitions, requirements).
 * Output is capped at one insight per (page, pattern) pair, deduped on the
 * first 60 chars of the normalized match text, and matches whose normalized
 * text is >40% similar (string-similarity score) to an insight already
 * emitted for the same page are dropped.
 */
export function extractCaveats(chunks: PdfChunk[]): DocumentInsight[] {
  const seen = new Set();
  // Count of insights already emitted per "page:patternIndex" key.
  const pagePatternCount = new Map();
  const results: DocumentInsight[] = [];

  for (const chunk of chunks) {
    const text = chunk.content ?? '';
    if (!text) continue;

    for (let pi = 0; pi < CAVEAT_PATTERNS.length; pi++) {
      const { pattern, severity } = CAVEAT_PATTERNS[pi]!;
      const pagePatternKey = `${chunk.page}:${pi}`;
      // Skip patterns that have already produced an insight on this page.
      const hitCount = pagePatternCount.get(pagePatternKey) ?? 0;
      if (hitCount >= 1) continue;

      pattern.lastIndex = 0; // global regexes are stateful; rescan from start
      let match: RegExpExecArray | null;
      while ((match = pattern.exec(text)) !== null) {
        // Re-check inside the loop: an earlier iteration may have hit the cap.
        if ((pagePatternCount.get(pagePatternKey) ?? 0) >= 1) break;

        const matchedText = match[0].trim();
        const normalizedKey = matchedText.toLowerCase().replace(/\s+/g, ' ').slice(0, 60);
        if (seen.has(normalizedKey)) continue;

        // Near-duplicate suppression against insights on the same page.
        let tooSimilar = false;
        for (const existing of results) {
          if (existing.page === chunk.page &&
              stringSimilarity(normalizedKey, existing.title.toLowerCase().slice(0, 60)) > 0.4) {
            tooSimilar = true;
            break;
          }
        }
        if (tooSimilar) continue;

        seen.add(normalizedKey);
        pagePatternCount.set(pagePatternKey, (pagePatternCount.get(pagePatternKey) ?? 0) + 1);

        const sentence = extractSurroundingSentence(text, match.index, match[0].length);
        const title = matchedText.length > 60
          ? matchedText.slice(0, 57) + '...'
          : matchedText;

        results.push({
          category: 'caveat',
          severity,
          title,
          detail: sentence,
          page: chunk.page,
          sourceQuote: matchedText,
        });
      }
    }
  }

  return results;
}
+
+// ── Combined deterministic extraction ─────────────────────────
+
+export function extractDeterministicInsights(chunks: PdfChunk[]): DocumentInsight[] {
+ const format = detectDocumentFormat(chunks);
+ return [
+ ...extractDeadlines(chunks),
+ ...extractResourceSuggestions(chunks),
+ ...extractActionItems(chunks),
+ ...extractCaveats(chunks),
+ ...extractRecurringReferences(chunks, format),
+ ];
+}
diff --git a/src/app/api/agents/predictive-document-analysis/utils/logging.ts b/src/app/api/agents/predictive-document-analysis/utils/logging.ts
new file mode 100644
index 00000000..46d34058
--- /dev/null
+++ b/src/app/api/agents/predictive-document-analysis/utils/logging.ts
@@ -0,0 +1,8 @@
+const VECTOR_PATTERN = /\[[-\d.,eE+\s]{200,}\]/g;
+
+export function sanitizeErrorMessage(error: unknown): string {
+ if (!(error instanceof Error)) return 'Unknown error';
+ const msg = error.message;
+ if (msg.length < 300) return msg;
+ return msg.replace(VECTOR_PATTERN, '[]').slice(0, 500);
+}
diff --git a/src/app/api/company/metadata/extract/route.ts b/src/app/api/company/metadata/extract/route.ts
new file mode 100644
index 00000000..f0823e99
--- /dev/null
+++ b/src/app/api/company/metadata/extract/route.ts
@@ -0,0 +1,169 @@
+/**
+ * Demo endpoint — Company Metadata Extraction
+ *
+ * Processes ALL documents for the logged-in user's company, extracts
+ * metadata from each, and merges them into a single canonical JSON.
+ * The merged result is upserted into company_metadata and returned.
+ *
+ * Usage:
+ * POST /api/company/metadata/extract
+ * (no body required — uses the authenticated user's company)
+ *
+ * POST /api/company/metadata/extract body: { "debug": true }
+ * (returns per-document diagnostics instead of running extraction)
+ *
+ * Returns the full CompanyMetadataJSON + aggregated diff.
+ */
+
+import { NextResponse } from "next/server";
+import { auth } from "@clerk/nextjs/server";
+import { eq, sql } from "drizzle-orm";
+
+import { db } from "~/server/db";
+import { users, document as documentTable, documentContextChunks } from "~/server/db/schema";
+import { companyMetadata } from "~/server/db/schema/company-metadata";
+import { extractCompanyFacts } from "~/lib/tools/company-metadata/extractor";
+import { mergeCompanyMetadata } from "~/lib/tools/company-metadata/merger";
+import { createEmptyMetadata } from "~/lib/tools/company-metadata/types";
+import type { CompanyMetadataJSON, MetadataDiff } from "~/lib/tools/company-metadata/types";
+
/**
 * POST /api/company/metadata/extract
 *
 * Runs metadata extraction over every document belonging to the caller's
 * company, merges the per-document facts into one canonical
 * CompanyMetadataJSON, upserts it into company_metadata, and returns the
 * result together with an aggregated diff.
 *
 * Optional body: { debug?: boolean } — when true, returns per-document chunk
 * diagnostics instead of running extraction.
 *
 * Responses: 401 unauthenticated, 400 unknown user, 500 unexpected error.
 */
export async function POST(request: Request) {
  try {
    // Auth — resolve the Clerk user; reject anonymous callers.
    const { userId } = await auth();
    if (!userId) {
      return NextResponse.json(
        { error: "Unauthorized" },
        { status: 401 },
      );
    }

    // Map the Clerk user to an application user row to find their company.
    const [userInfo] = await db
      .select({ companyId: users.companyId })
      .from(users)
      .where(eq(users.userId, userId));

    if (!userInfo) {
      return NextResponse.json(
        { error: "User not found" },
        { status: 400 },
      );
    }

    // The extraction/merge helpers take the company id as a string; the DB
    // queries below keep using the raw userInfo.companyId value.
    const companyId = String(userInfo.companyId);

    // Check for debug mode. The body is optional — a missing or invalid JSON
    // body simply means debug stays false.
    let debug = false;
    try {
      const body = (await request.json()) as { debug?: boolean };
      debug = body.debug === true;
    } catch {
      // No body or invalid JSON — that's fine
    }

    // Find all documents for this company
    const docs = await db
      .select({ id: documentTable.id, title: documentTable.title })
      .from(documentTable)
      .where(eq(documentTable.companyId, userInfo.companyId));

    if (docs.length === 0) {
      return NextResponse.json({
        message: "No documents found for this company",
        metadata: null,
        documentsProcessed: 0,
      });
    }

    // Debug mode: return per-document chunk counts without running extraction.
    // NOTE(review): this issues one count query per document (N+1) —
    // acceptable for a demo/debug path, but could be a single grouped query.
    if (debug) {
      const diagnostics = [];
      for (const doc of docs) {
        const [row] = await db
          .select({ count: sql`count(*)` })
          .from(documentContextChunks)
          .where(eq(documentContextChunks.documentId, BigInt(doc.id)));
        diagnostics.push({
          documentId: doc.id,
          title: doc.title,
          chunkCount: Number(row?.count ?? 0),
        });
      }
      return NextResponse.json({
        companyId,
        totalDocuments: docs.length,
        documents: diagnostics,
        documentsWithChunks: diagnostics.filter((d) => d.chunkCount > 0).length,
      });
    }

    // Process each document sequentially, merging into canonical metadata.
    // Sequential on purpose (no Promise.all): each merge must see the result
    // of the previous one.
    let metadata: CompanyMetadataJSON = createEmptyMetadata(companyId);
    const allDiffs: MetadataDiff = { added: [], updated: [], deprecated: [] };
    let documentsWithFacts = 0;

    for (const doc of docs) {
      const extracted = await extractCompanyFacts({
        documentId: doc.id,
        companyId,
      });

      // Documents yielding no extractable facts are skipped entirely.
      if (!extracted) continue;

      const { updatedMetadata, diff } = mergeCompanyMetadata(
        metadata,
        extracted,
      );

      metadata = updatedMetadata;
      allDiffs.added.push(...diff.added);
      allDiffs.updated.push(...diff.updated);
      allDiffs.deprecated.push(...diff.deprecated);
      documentsWithFacts++;
    }

    if (documentsWithFacts === 0) {
      return NextResponse.json({
        message: "No extractable company facts found in any document",
        metadata: null,
        documentsProcessed: docs.length,
      });
    }

    // Save to database: upsert keyed on companyId so re-running extraction
    // replaces the previous canonical metadata.
    // NOTE(review): the conflict update does not set updatedAt — confirm the
    // column updates itself (DB default/trigger) or add it to the `set` here.
    await db
      .insert(companyMetadata)
      .values({
        companyId: userInfo.companyId,
        metadata: metadata,
      })
      .onConflictDoUpdate({
        target: companyMetadata.companyId,
        set: {
          metadata: metadata,
        },
      });

    return NextResponse.json({
      metadata,
      documentsProcessed: docs.length,
      documentsWithFacts,
      diff: {
        added: allDiffs.added,
        updated: allDiffs.updated,
        deprecated: allDiffs.deprecated,
        summary: {
          added: allDiffs.added.length,
          updated: allDiffs.updated.length,
          deprecated: allDiffs.deprecated.length,
        },
      },
    });
  } catch (error) {
    console.error("[company-metadata] POST /extract error:", error);
    return NextResponse.json(
      { error: "Internal server error" },
      { status: 500 },
    );
  }
}
diff --git a/src/app/api/company/metadata/route.ts b/src/app/api/company/metadata/route.ts
new file mode 100644
index 00000000..e27fb685
--- /dev/null
+++ b/src/app/api/company/metadata/route.ts
@@ -0,0 +1,67 @@
+/**
+ * GET /api/company/metadata
+ *
+ * Returns the stored company metadata for the logged-in user's company.
+ */
+
+import { NextResponse } from "next/server";
+import { auth } from "@clerk/nextjs/server";
+import { eq } from "drizzle-orm";
+
+import { db } from "~/server/db";
+import { users } from "~/server/db/schema";
+import { companyMetadata } from "~/server/db/schema/company-metadata";
+
+export async function GET() {
+ try {
+ const { userId } = await auth();
+ if (!userId) {
+ return NextResponse.json(
+ { error: "Unauthorized" },
+ { status: 401 },
+ );
+ }
+
+ const [userInfo] = await db
+ .select({ companyId: users.companyId })
+ .from(users)
+ .where(eq(users.userId, userId));
+
+ if (!userInfo) {
+ return NextResponse.json(
+ { error: "User not found" },
+ { status: 400 },
+ );
+ }
+
+ const [result] = await db
+ .select({
+ metadata: companyMetadata.metadata,
+ schemaVersion: companyMetadata.schemaVersion,
+ createdAt: companyMetadata.createdAt,
+ updatedAt: companyMetadata.updatedAt,
+ })
+ .from(companyMetadata)
+ .where(eq(companyMetadata.companyId, userInfo.companyId));
+
+ if (!result) {
+ return NextResponse.json({
+ metadata: null,
+ message: "No metadata found. Upload documents and run extraction first.",
+ });
+ }
+
+ return NextResponse.json({
+ metadata: result.metadata,
+ schemaVersion: result.schemaVersion,
+ createdAt: result.createdAt,
+ updatedAt: result.updatedAt,
+ });
+ } catch (error) {
+ console.error("[company-metadata] GET error:", error);
+ return NextResponse.json(
+ { error: "Internal server error" },
+ { status: 500 },
+ );
+ }
+}
diff --git a/src/app/api/company/onboarding/route.ts b/src/app/api/company/onboarding/route.ts
new file mode 100644
index 00000000..b5edcf3e
--- /dev/null
+++ b/src/app/api/company/onboarding/route.ts
@@ -0,0 +1,92 @@
+import { NextResponse } from "next/server";
+import { auth } from "@clerk/nextjs/server";
+import { eq } from "drizzle-orm";
+
+import { db } from "~/server/db";
+import { users, company } from "~/server/db/schema";
+
+interface OnboardingBody {
+ description?: string;
+ industry?: string;
+}
+
+const AUTHORIZED_ROLES = new Set(["employer", "owner"]);
+
+export async function POST(request: Request) {
+ try {
+ const { userId } = await auth();
+ if (!userId) {
+ return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
+ }
+
+ const [userInfo] = await db
+ .select({ companyId: users.companyId, role: users.role })
+ .from(users)
+ .where(eq(users.userId, userId));
+
+ if (!userInfo) {
+ return NextResponse.json({ error: "User not found" }, { status: 400 });
+ }
+
+ if (!AUTHORIZED_ROLES.has(userInfo.role)) {
+ return NextResponse.json({ error: "Forbidden" }, { status: 403 });
+ }
+
+ const body = (await request.json()) as OnboardingBody;
+
+ await db
+ .update(company)
+ .set({
+ description: body.description?.trim() ?? null,
+ industry: body.industry?.trim() ?? null,
+ })
+ .where(eq(company.id, Number(userInfo.companyId)));
+
+ return NextResponse.json({ success: true });
+ } catch (error) {
+ console.error("[company/onboarding] POST error:", error);
+ return NextResponse.json(
+ { error: "Internal server error" },
+ { status: 500 },
+ );
+ }
+}
+
+export async function GET() {
+ try {
+ const { userId } = await auth();
+ if (!userId) {
+ return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
+ }
+
+ const [userInfo] = await db
+ .select({ companyId: users.companyId })
+ .from(users)
+ .where(eq(users.userId, userId));
+
+ if (!userInfo) {
+ return NextResponse.json({ error: "User not found" }, { status: 400 });
+ }
+
+ const [companyRow] = await db
+ .select({
+ name: company.name,
+ description: company.description,
+ industry: company.industry,
+ })
+ .from(company)
+ .where(eq(company.id, Number(userInfo.companyId)));
+
+ return NextResponse.json({
+ name: companyRow?.name ?? null,
+ description: companyRow?.description ?? null,
+ industry: companyRow?.industry ?? null,
+ });
+ } catch (error) {
+ console.error("[company/onboarding] GET error:", error);
+ return NextResponse.json(
+ { error: "Internal server error" },
+ { status: 500 },
+ );
+ }
+}
diff --git a/src/app/api/documents/[id]/content/route.ts b/src/app/api/documents/[id]/content/route.ts
new file mode 100644
index 00000000..506dcdbc
--- /dev/null
+++ b/src/app/api/documents/[id]/content/route.ts
@@ -0,0 +1,91 @@
+import { NextResponse } from "next/server";
+import { eq } from "drizzle-orm";
+import { auth } from "@clerk/nextjs/server";
+import { db } from "~/server/db";
+import { document } from "~/server/db/schema";
+import { isPrivateBlobUrl, fetchBlob } from "~/server/storage/vercel-blob";
+
+const EXTENSION_TO_MIME: Record = {
+ ".pdf": "application/pdf",
+ ".png": "image/png",
+ ".jpg": "image/jpeg",
+ ".jpeg": "image/jpeg",
+ ".gif": "image/gif",
+ ".webp": "image/webp",
+ ".tiff": "image/tiff",
+ ".tif": "image/tiff",
+ ".bmp": "image/bmp",
+ ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ ".txt": "text/plain",
+ ".csv": "text/csv",
+ ".html": "text/html",
+ ".md": "text/markdown",
+};
+
+function inferMime(name: string): string {
+ const match = /(\.[a-z0-9]+)(?:\?|#|$)/i.exec(name);
+ return (match?.[1] && EXTENSION_TO_MIME[match[1].toLowerCase()]) ?? "application/octet-stream";
+}
+
/** Next.js 15 route context: `params` is a Promise and must be awaited. */
interface RouteParams {
  params: Promise<{ id: string }>;
}

/**
 * GET /api/documents/[id]/content
 *
 * Serves a document's bytes. Private Vercel Blob URLs are proxied through
 * this route (streamed with an inline Content-Disposition); public URLs get
 * a 307 redirect so the client fetches storage directly.
 *
 * NOTE(review): only authentication is enforced here — any signed-in user
 * can fetch any document id. Consider also scoping the lookup by the
 * caller's companyId, as other routes in this codebase do.
 */
export async function GET(_request: Request, { params }: RouteParams) {
  try {
    const { userId } = await auth();
    if (!userId) {
      return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
    }

    const { id } = await params;
    const docId = parseInt(id, 10);
    if (isNaN(docId)) {
      return NextResponse.json({ error: "Invalid document ID" }, { status: 400 });
    }

    const [doc] = await db
      .select({ url: document.url, title: document.title })
      .from(document)
      .where(eq(document.id, docId));

    if (!doc) {
      return NextResponse.json({ error: "Document not found" }, { status: 404 });
    }

    // Public storage URL: no proxying needed. 307 keeps the request method.
    if (!isPrivateBlobUrl(doc.url)) {
      return NextResponse.redirect(doc.url, { status: 307 });
    }

    // Private blob: fetch server-side (with the blob token) and stream back.
    const blobRes = await fetchBlob(doc.url);
    if (!blobRes.ok) {
      return NextResponse.json(
        { error: "Failed to retrieve document from storage" },
        { status: 502 },
      );
    }

    // Prefer the storage-reported content type; otherwise infer from the
    // stored title's extension.
    const mimeType =
      blobRes.headers.get("content-type") ?? inferMime(doc.title);

    return new NextResponse(blobRes.body, {
      status: 200,
      headers: {
        "Content-Type": mimeType,
        // Forward Content-Length only when storage provided one (streamed
        // upstream responses may omit it).
        ...(blobRes.headers.get("content-length")
          ? { "Content-Length": blobRes.headers.get("content-length")! }
          : {}),
        // Emits both the plain and RFC 5987 filename parameters so old and
        // new browsers decode non-ASCII titles correctly.
        "Content-Disposition": `inline; filename="${encodeURIComponent(doc.title)}"; filename*=UTF-8''${encodeURIComponent(doc.title)}`,
        "Cache-Control": "private, max-age=3600",
      },
    });
  } catch (error) {
    console.error("Error serving document content:", error);
    return NextResponse.json(
      { error: "Failed to serve document", details: error instanceof Error ? error.message : "Unknown error" },
      { status: 500 },
    );
  }
}
diff --git a/src/app/api/fetchCompany/route.ts b/src/app/api/fetchCompany/route.ts
index 7f797129..73c9ce45 100644
--- a/src/app/api/fetchCompany/route.ts
+++ b/src/app/api/fetchCompany/route.ts
@@ -1,8 +1,7 @@
import { NextResponse } from "next/server";
import { db } from "../../../server/db/index";
import { company, users } from "../../../server/db/schema";
-import { eq, and } from "drizzle-orm";
-import * as console from "console";
+import { eq } from "drizzle-orm";
import { auth } from "@clerk/nextjs/server";
@@ -30,12 +29,19 @@ export async function GET() {
const companyId = userInfo.companyId;
- const companies = await db
+ const [companyRecord] = await db
.select()
.from(company)
- .where(and(eq(company.id, Number(companyId))));
+ .where(eq(company.id, Number(companyId)));
- return NextResponse.json(companies, { status: 200 });
+ if (!companyRecord) {
+ return NextResponse.json(
+ { error: "Company not found." },
+ { status: 404 }
+ );
+ }
+
+ return NextResponse.json(companyRecord, { status: 200 });
} catch (error: unknown) {
console.error("Error fetching documents:", error);
return NextResponse.json(
diff --git a/src/app/api/fetchDocument/route.ts b/src/app/api/fetchDocument/route.ts
index 84d0ee70..441a4bca 100644
--- a/src/app/api/fetchDocument/route.ts
+++ b/src/app/api/fetchDocument/route.ts
@@ -4,6 +4,7 @@ import { document, users, fileUploads } from "../../../server/db/schema/base";
import { eq, inArray } from "drizzle-orm";
import { validateRequestBody, UserIdSchema } from "~/lib/validation";
import { auth } from '@clerk/nextjs/server';
+import { isPrivateBlobUrl } from "~/server/storage/vercel-blob";
/** Extract file id from /api/files/{id} URL so we can look up mimeType from file_uploads */
const FILE_API_ID_REGEX = /\/api\/files\/(\d+)/;
@@ -104,8 +105,13 @@ export async function POST(request: Request) {
const mimeType = mimeFromFile
?? inferMimeFromName(doc.title)
?? inferMimeFromName(doc.url);
+ const url = isPrivateBlobUrl(doc.url)
+ ? `/api/documents/${Number(doc.id)}/content`
+ : doc.url;
+
return {
...doc,
+ url,
id: Number(doc.id),
companyId: Number(doc.companyId),
...(mimeType && { mimeType }),
diff --git a/src/app/api/files/[id]/route.ts b/src/app/api/files/[id]/route.ts
index a6f6cb3f..a8c98684 100644
--- a/src/app/api/files/[id]/route.ts
+++ b/src/app/api/files/[id]/route.ts
@@ -7,6 +7,7 @@ import { NextResponse } from "next/server";
import { eq } from "drizzle-orm";
import { db } from "~/server/db";
import { fileUploads } from "~/server/db/schema";
+import { isPrivateBlobUrl, fetchBlob } from "~/server/storage/vercel-blob";
const MIME_BY_EXTENSION: Record = {
pdf: "application/pdf",
@@ -71,6 +72,43 @@ export async function GET(
);
}
+ if (file.storageProvider === "vercel_blob" && file.storageUrl) {
+ if (isPrivateBlobUrl(file.storageUrl)) {
+ const blobRes = await fetchBlob(file.storageUrl);
+ if (!blobRes.ok) {
+ return NextResponse.json(
+ { error: "Failed to retrieve file from storage" },
+ { status: 502 }
+ );
+ }
+ const mimeType =
+ blobRes.headers.get("content-type") ??
+ file.mimeType?.trim() ??
+ inferMimeTypeFromFilename(file.filename);
+ return new NextResponse(blobRes.body, {
+ status: 200,
+ headers: {
+ "Content-Type": mimeType,
+ ...(blobRes.headers.get("content-length")
+ ? { "Content-Length": blobRes.headers.get("content-length")! }
+ : {}),
+ "Content-Disposition": `inline; filename="${encodeURIComponent(file.filename)}"; filename*=UTF-8''${encodeURIComponent(file.filename)}`,
+ "Cache-Control": "private, max-age=31536000",
+ },
+ });
+ }
+ return NextResponse.redirect(file.storageUrl, {
+ status: 307,
+ });
+ }
+
+ if (!file.fileData) {
+ return NextResponse.json(
+ { error: "File is not available in database storage" },
+ { status: 404 }
+ );
+ }
+
// Decode base64 data back to binary
const binaryData = Buffer.from(file.fileData, "base64");
const mimeType = file.mimeType?.trim() || inferMimeTypeFromFilename(file.filename);
diff --git a/src/app/api/inngest/route.ts b/src/app/api/inngest/route.ts
index 75602044..90e2a217 100644
--- a/src/app/api/inngest/route.ts
+++ b/src/app/api/inngest/route.ts
@@ -11,11 +11,13 @@ import { serve } from "inngest/next";
import { inngest } from "~/server/inngest/client";
import { uploadDocument } from "~/server/inngest/functions/processDocument";
import { trendSearchJob } from "~/server/inngest/functions/trendSearch";
+import { extractCompanyMetadataJob } from "~/server/inngest/functions/extractCompanyMetadata";
+import { predictiveAnalysisJob } from "~/server/inngest/functions/predictiveAnalysis";
// Register all Inngest functions
const handler = serve({
client: inngest,
- functions: [uploadDocument, trendSearchJob],
+ functions: [uploadDocument, trendSearchJob, extractCompanyMetadataJob, predictiveAnalysisJob],
});
export const GET = handler.GET;
diff --git a/src/app/api/marketing-pipeline/route.ts b/src/app/api/marketing-pipeline/route.ts
new file mode 100644
index 00000000..94a1cc0f
--- /dev/null
+++ b/src/app/api/marketing-pipeline/route.ts
@@ -0,0 +1,79 @@
+import { NextResponse } from "next/server";
+import { auth } from "@clerk/nextjs/server";
+import { eq } from "drizzle-orm";
+import { db } from "~/server/db";
+import { users } from "~/server/db/schema";
+import { MarketingPipelineInputSchema, runMarketingPipeline } from "~/lib/tools/marketing-pipeline";
+
+export const runtime = "nodejs";
+export const maxDuration = 60;
+
+export async function POST(request: Request) {
+ try {
+ const { userId } = await auth();
+ if (!userId) {
+ return NextResponse.json(
+ { success: false, message: "Unauthorized" },
+ { status: 401 },
+ );
+ }
+
+ const body = (await request.json()) as unknown;
+ const validation = MarketingPipelineInputSchema.safeParse(body);
+ if (!validation.success) {
+ return NextResponse.json(
+ {
+ success: false,
+ message: "Invalid input",
+ errors: validation.error.flatten(),
+ },
+ { status: 400 },
+ );
+ }
+
+ const [requestingUser] = await db
+ .select()
+ .from(users)
+ .where(eq(users.userId, userId))
+ .limit(1);
+
+ if (!requestingUser) {
+ return NextResponse.json(
+ { success: false, message: "User not found" },
+ { status: 404 },
+ );
+ }
+
+ const companyId = Number(requestingUser.companyId);
+ if (Number.isNaN(companyId)) {
+ return NextResponse.json(
+ { success: false, message: "Invalid company ID" },
+ { status: 400 },
+ );
+ }
+
+ const result = await runMarketingPipeline({
+ companyId,
+ input: validation.data,
+ });
+
+ return NextResponse.json(
+ {
+ success: true,
+ data: result,
+ },
+ { status: 200 },
+ );
+ } catch (error) {
+ console.error("[marketing-pipeline] POST error:", error);
+ return NextResponse.json(
+ {
+ success: false,
+ message: "Failed to run marketing pipeline",
+ error: error instanceof Error ? error.message : "Unknown error",
+ },
+ { status: 500 },
+ );
+ }
+}
+
diff --git a/src/app/api/signup/employerCompany/route.ts b/src/app/api/signup/employerCompany/route.ts
index 874c101c..0127b6a2 100644
--- a/src/app/api/signup/employerCompany/route.ts
+++ b/src/app/api/signup/employerCompany/route.ts
@@ -15,6 +15,13 @@ export async function POST(request: Request) {
try {
const {userId, name, email, companyName, numberOfEmployees} = (await request.json()) as PostBody;
+ // Validate required fields
+ if (!name?.trim()) {
+ return createValidationError(
+ "User name is required. Please ensure you are logged in with a complete profile."
+ );
+ }
+
// Check if company already exists
const [existingCompany] = await db
.select()
@@ -36,6 +43,7 @@ export async function POST(request: Request) {
.returning({ id: company.id });
if(!newCompany) {
+ console.error("Company creation returned no data. Database insert failed.");
return createValidationError(
"Could not create company. Please try again later."
);
@@ -59,6 +67,10 @@ export async function POST(request: Request) {
}
catch (error: unknown) {
console.error("Error during employer company signup:", error);
+ if (error instanceof Error) {
+ console.error("Error message:", error.message);
+ console.error("Error stack:", error.stack);
+ }
return handleApiError(error);
}
}
diff --git a/src/app/api/updateCompany/route.ts b/src/app/api/updateCompany/route.ts
index 2c9808ac..8925a0c0 100644
--- a/src/app/api/updateCompany/route.ts
+++ b/src/app/api/updateCompany/route.ts
@@ -26,7 +26,7 @@ export async function POST(request: Request) {
if (!validation.success) {
return validation.response;
}
- const { name, employerPasskey, employeePasskey, numberOfEmployees, useUploadThing } = validation.data;
+ const { name, description, industry, employerPasskey, employeePasskey, numberOfEmployees, useUploadThing } = validation.data;
const [userRecord] = await db
.select({
@@ -59,6 +59,8 @@ export async function POST(request: Request) {
const updateData: Partial<{
name: string;
+ description: string | null;
+ industry: string | null;
employerpasskey: string;
employeepasskey: string;
numberOfEmployees: string;
@@ -68,7 +70,13 @@ export async function POST(request: Request) {
numberOfEmployees,
};
- // Only include passkeys if they were explicitly provided
+ if (description !== undefined) {
+ updateData.description = description?.trim() ?? null;
+ }
+ if (industry !== undefined) {
+ updateData.industry = industry?.trim() ?? null;
+ }
+
if (employerPasskey !== undefined) {
updateData.employerpasskey = employerPasskey;
}
@@ -76,7 +84,6 @@ export async function POST(request: Request) {
updateData.employeepasskey = employeePasskey;
}
- // Only include useUploadThing if it was explicitly provided
if (useUploadThing !== undefined) {
updateData.useUploadThing = useUploadThing;
}
diff --git a/src/app/api/upload-local/route.ts b/src/app/api/upload-local/route.ts
index 3ff066e2..37fcf088 100644
--- a/src/app/api/upload-local/route.ts
+++ b/src/app/api/upload-local/route.ts
@@ -7,6 +7,7 @@ import { NextResponse } from "next/server";
import { auth } from "@clerk/nextjs/server";
import { db } from "~/server/db";
import { fileUploads } from "~/server/db/schema";
+import { putFile } from "~/server/storage/vercel-blob";
import { isUploadAccepted } from "~/lib/upload-accepted";
const MAX_FILE_SIZE = 16 * 1024 * 1024; // 16MB to match UploadThing config
@@ -39,10 +40,6 @@ export async function POST(request: Request) {
);
}
- console.log(
- `[UploadLocal] Received file: name=${file.name}, mime=${file.type}, size=${(file.size / 1024).toFixed(1)}KB, user=${userId}`
- );
-
if (!isUploadAccepted({ name: file.name, type: file.type })) {
console.warn(`[UploadLocal] Rejected: unsupported file type name=${file.name}, mime=${file.type}`);
return NextResponse.json(
@@ -51,33 +48,46 @@ export async function POST(request: Request) {
);
}
+ console.log(
+ `[UploadLocal] Uploading to Vercel Blob: name=${file.name}, mime=${file.type}, size=${(file.size / 1024).toFixed(1)}KB, user=${userId}`
+ );
+
if (file.size > MAX_FILE_SIZE) {
- console.warn(`[UploadLocal] Rejected: file too large size=${(file.size / 1024 / 1024).toFixed(1)}MB, max=${MAX_FILE_SIZE / 1024 / 1024}MB`);
+ console.warn(
+ `[UploadLocal] Rejected: file too large size=${(file.size / 1024 / 1024).toFixed(1)}MB, max=${MAX_FILE_SIZE / 1024 / 1024}MB`
+ );
return NextResponse.json(
{ error: `File too large. Maximum size is ${MAX_FILE_SIZE / 1024 / 1024}MB.` },
{ status: 400 }
);
}
- // Convert file to base64
- console.log(`[UploadLocal] Converting to base64...`);
- const arrayBuffer = await file.arrayBuffer();
- const base64Data = Buffer.from(arrayBuffer).toString("base64");
- console.log(`[UploadLocal] Base64 encoded: ${(base64Data.length / 1024).toFixed(1)}KB`);
-
- // Store in database
- console.log(`[UploadLocal] Storing in database...`);
- const [uploadedFile] = await db.insert(fileUploads).values({
- userId,
+ const blob = await putFile({
filename: file.name,
- mimeType: file.type,
- fileData: base64Data,
- fileSize: file.size,
- }).returning({
- id: fileUploads.id,
- filename: fileUploads.filename,
+ data: await file.arrayBuffer(),
+ contentType: file.type || undefined,
});
+ const [uploadedFile] = await db
+ .insert(fileUploads)
+ .values({
+ userId,
+ filename: file.name,
+ mimeType: file.type,
+ fileData: null,
+ fileSize: file.size,
+ storageProvider: "vercel_blob",
+ storageUrl: blob.url,
+ storagePathname: blob.pathname,
+ blobChecksum: blob.checksum ?? null,
+ })
+ .returning({
+ id: fileUploads.id,
+ filename: fileUploads.filename,
+ storageProvider: fileUploads.storageProvider,
+ storageUrl: fileUploads.storageUrl,
+ });
+
if (!uploadedFile) {
console.error("[UploadLocal] Database insert returned no result");
return NextResponse.json(
@@ -87,7 +97,7 @@ export async function POST(request: Request) {
}
// Return URL that can be used to fetch the file
- const fileUrl = `/api/files/${uploadedFile.id}`;
+ const fileUrl = blob.url;
const elapsed = Date.now() - uploadStart;
console.log(
@@ -99,6 +109,8 @@ export async function POST(request: Request) {
url: fileUrl,
name: uploadedFile.filename,
id: uploadedFile.id,
+ provider: uploadedFile.storageProvider,
+ pathname: blob.pathname,
});
} catch (error) {
const elapsed = Date.now() - uploadStart;
diff --git a/src/app/deployment/components/sections/DockerDeploymentPage.tsx b/src/app/deployment/components/sections/DockerDeploymentPage.tsx
index 961905b3..fefc0d6e 100644
--- a/src/app/deployment/components/sections/DockerDeploymentPage.tsx
+++ b/src/app/deployment/components/sections/DockerDeploymentPage.tsx
@@ -3,14 +3,11 @@
import React from 'react';
import { motion } from 'motion/react';
import {
- Container,
Server,
Database,
RefreshCw,
CheckCircle2,
ShieldAlert,
- ExternalLink,
- ArrowRight,
} from 'lucide-react';
import type { DeploymentProps } from '../../types';
import { Section, Step } from '../ui';
@@ -86,6 +83,8 @@ docker run --rm -p 3000:3000 \\
-e CLERK_SECRET_KEY="$CLERK_SECRET_KEY" \\
-e NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY="$NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY" \\
-e OPENAI_API_KEY="$OPENAI_API_KEY" \\
+ -e BLOB_READ_WRITE_TOKEN="$BLOB_READ_WRITE_TOKEN" \\
+ -e INNGEST_EVENT_KEY="$INNGEST_EVENT_KEY" \\
pdr-ai-app`;
return (
@@ -132,8 +131,14 @@ docker run --rm -p 3000:3000 \\
code={`DATABASE_URL="postgresql://postgres:password@db:5432/pdr_ai_v2"
NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_xxx
CLERK_SECRET_KEY=sk_live_xxx
-OPENAI_API_KEY=sk-proj-xxx`}
- onCopy={() => copyToClipboard(`DATABASE_URL="postgresql://postgres:password@db:5432/pdr_ai_v2"\nNEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_xxx\nCLERK_SECRET_KEY=sk_live_xxx\nOPENAI_API_KEY=sk-proj-xxx`, 'docker-1')}
+OPENAI_API_KEY=sk-proj-xxx
+
+# Vercel Blob — required for document uploads
+BLOB_READ_WRITE_TOKEN=vercel_blob_rw_xxxxxxxxxxxx
+
+# Inngest — use a placeholder for local dev
+INNGEST_EVENT_KEY=dev-placeholder`}
+ onCopy={() => copyToClipboard(`DATABASE_URL="postgresql://postgres:password@db:5432/pdr_ai_v2"\nNEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_xxx\nCLERK_SECRET_KEY=sk_live_xxx\nOPENAI_API_KEY=sk-proj-xxx\n\nBLOB_READ_WRITE_TOKEN=vercel_blob_rw_xxxxxxxxxxxx\n\nINNGEST_EVENT_KEY=dev-placeholder`, 'docker-1')}
copied={copiedCode === 'docker-1'}
darkMode={darkMode}
/>
diff --git a/src/app/deployment/components/sections/InngestPage.tsx b/src/app/deployment/components/sections/InngestPage.tsx
index 378130e4..380da0af 100644
--- a/src/app/deployment/components/sections/InngestPage.tsx
+++ b/src/app/deployment/components/sections/InngestPage.tsx
@@ -10,11 +10,9 @@ import {
Shield,
BarChart3,
CheckCircle2,
- AlertTriangle,
- Sparkles,
} from 'lucide-react';
import type { DeploymentProps } from '../../types';
-import { Section, Step, ApiKeyCard, InfoBox, WarningBox } from '../ui';
+import { Section, CodeBlock, Step, ApiKeyCard, InfoBox, WarningBox } from '../ui';
export const InngestPage: React.FC = ({
darkMode,
@@ -23,7 +21,6 @@ export const InngestPage: React.FC = ({
}) => {
return (
<>
- {/* Hero Section */}
= ({
} rounded-full font-medium mb-6 text-sm`}
>
- Optional Integration
+ Required Integration
@@ -49,21 +46,10 @@ export const InngestPage: React.FC = ({
Reliable background processing for document OCR pipelines with automatic retries,
observability, and step-based execution.
-
-
-
- INNGEST_EVENT_KEY is required for background document processing
-
- {/* Benefits Section */}
-
+ {/* Benefits */}
+
}
@@ -104,7 +90,7 @@ export const InngestPage: React.FC = ({
- {/* How It Works */}
+ {/* How the Pipeline Works */}
+ {/* Development Setup */}
+
}
+ title="Inngest starts automatically"
+ icon={ }
darkMode={darkMode}
>
-
- When Inngest is not configured, the same pipeline runs synchronously using{' '}
-
- processDocumentSync()
-
- . The request returns immediately (fire-and-forget) via setImmediate(), but there are no retries or observability.
+
+ Running pnpm dev automatically
+ starts both Next.js and the Inngest dev server using concurrently.
+ No additional setup is needed to get background jobs working locally.
+
+
+ The Inngest dev server dashboard is available at{' '}
+ http://localhost:8288{' '}
+ where you can monitor jobs, view step execution, and inspect logs.
-
- {/* Setup Steps */}
-
-
- copyToClipboard('npx inngest-cli@latest dev', 'inngest-1')}
- copied={copiedCode === 'inngest-1'}
- darkMode={darkMode}
- />
-
- copyToClipboard('pnpm dev', 'inngest-2')}
- copied={copiedCode === 'inngest-2'}
- darkMode={darkMode}
- />
-
- copyToClipboard('http://localhost:8288', 'inngest-3')}
- copied={copiedCode === 'inngest-3'}
- darkMode={darkMode}
- />
-
+
+
+
+ Start the dev server (Next.js + Inngest together)
+
+
copyToClipboard('pnpm dev', 'inngest-dev')}
+ copied={copiedCode === 'inngest-dev'}
+ darkMode={darkMode}
+ />
+
+ This runs the equivalent of:
+
+ copyToClipboard('concurrently "next dev --turbo" "pnpm dlx inngest-cli@latest dev -u http://localhost:3000/api/inngest"', 'inngest-dev-full')}
+ copied={copiedCode === 'inngest-dev-full'}
+ darkMode={darkMode}
+ />
+
-
-
+
+
+ INNGEST_EVENT_KEY in development
+
+
+ The Inngest dev server does not require a real event key. You can use any placeholder value
+ in your .env file:
+
+
copyToClipboard('INNGEST_EVENT_KEY=dev-placeholder', 'inngest-dev-key')}
+ copied={copiedCode === 'inngest-dev-key'}
+ darkMode={darkMode}
+ />
+
+
{/* Production Setup */}
+
+ In production, Inngest Cloud handles job scheduling, retries, and monitoring.
+ You need a real event key and signing key from your Inngest account.
+
+
= ({
steps={[
'Create account at inngest.com',
'Create a new app in the dashboard',
- 'Go to Settings → Signing Key',
- 'Copy INNGEST_SIGNING_KEY to your environment',
'Go to Settings → Event Keys → Create Key',
'Copy INNGEST_EVENT_KEY to your environment',
+ 'Go to Settings → Signing Key',
+ 'Copy INNGEST_SIGNING_KEY to your environment',
]}
darkMode={darkMode}
/>
-
+
- copyToClipboard(
- `# Inngest Background Jobs (Required)
-INNGEST_EVENT_KEY=your_event_key_here
-INNGEST_SIGNING_KEY=signkey-prod-xxxxx`,
- 'inngest-prod-1'
- )
- }
- copied={copiedCode === 'inngest-prod-1'}
- darkMode={darkMode}
- />
+ onCopy={() =>
+ copyToClipboard(
+ `INNGEST_EVENT_KEY=your_real_event_key\nINNGEST_SIGNING_KEY=signkey-prod-xxxxx`,
+ 'inngest-prod-env'
+ )
+ }
+ copied={copiedCode === 'inngest-prod-env'}
+ darkMode={darkMode}
+ />
+
- {/* Architecture */}
+ {/* Architecture Diagram */}
-
+
{`┌─────────────────────────────────────────────────────────────┐
│ Document Upload │
@@ -270,17 +243,20 @@ INNGEST_SIGNING_KEY=signkey-prod-xxxxx`,
▼
┌─────────────────────────────────────────────────────────────┐
│ triggerDocumentProcessing() │
-│ INNGEST_EVENT_KEY required → Send event to job runner │
-│ (Key validated at startup; no sync fallback) │
+│ Sends event to Inngest (requires INNGEST_EVENT_KEY) │
└────────────────────────────┬────────────────────────────────┘
│
+ ┌──────────────┴──────────────┐
+ │ Dev: local Inngest server │
+ │ Prod: Inngest Cloud │
+ └──────────────┬──────────────┘
+ │
▼
┌─────────────────────────────────────────────────────────────┐
-│ Inngest Cloud │
-│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌───────┐ │
-│ │ Step A │→│ Step B │→│ Step C │→│ Step D │→│ Step E│ │
-│ │ Router │ │Normalize│ │ Chunk │ │Vectorize│ │ Store │ │
-│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └───────┘ │
+│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌───────┐│
+│ │ Step A │→│ Step B │→│ Step C │→│ Step D │→│ Step E││
+│ │ Router │ │Normalize│ │ Chunk │ │Vectorize│ │ Store ││
+│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └───────┘│
│ │
│ ✓ Automatic retries ✓ Step isolation ✓ Observability │
└─────────────────────────────────────────────────────────────┘`}
@@ -288,25 +264,48 @@ INNGEST_SIGNING_KEY=signkey-prod-xxxxx`,
- {/* Verification */}
+ {/* Verify */}
+
+ {/* Troubleshooting */}
+
+
+
+
+ App crashes with "INNGEST_EVENT_KEY is required in production"
+
+
+ Set INNGEST_EVENT_KEY=dev_placeholder in
+ your .env file.
+ Any non-empty value works for local development.
+
+
+
+
+ Inngest dashboard shows no functions
+
+
+ Make sure Next.js is running and the Inngest dev server can reach http://localhost:3000/api/inngest.
+ If you started them separately, use pnpm dev instead so they start together.
+
+
+
+
+ Running Inngest dev server separately
+
+
+ If you prefer to run them in separate terminals, start Next.js
+ with pnpm dev:next and
+ Inngest with pnpm inngest:dev.
+
+
>
@@ -372,4 +371,3 @@ const VerificationStep: React.FC
= ({ text, darkMode }) =
{text}
);
-
diff --git a/src/app/deployment/components/sections/MainDeployment.tsx b/src/app/deployment/components/sections/MainDeployment.tsx
index 1dd8c03f..40a4f5be 100644
--- a/src/app/deployment/components/sections/MainDeployment.tsx
+++ b/src/app/deployment/components/sections/MainDeployment.tsx
@@ -213,6 +213,12 @@ export const MainDeployment: React.FC = ({
{' '}(recommended) and copy the connection string.
+
+ } title="Create a Vercel Blob store" darkMode={darkMode}>
+ In your Vercel project, go to Storage → Create Database → Blob and connect it to your project.
+ This provides the BLOB_READ_WRITE_TOKEN needed
+ for document uploads. See the Vercel Blob page in the sidebar for details.
+
@@ -250,8 +256,14 @@ export const MainDeployment: React.FC = ({
NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_your_key_here
CLERK_SECRET_KEY=sk_live_your_key_here
-OPENAI_API_KEY=sk-proj-your_key_here`}
- onCopy={() => copyToClipboard(`DATABASE_URL="postgresql://user:password@host:5432/database?sslmode=require"\n\nNEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_your_key_here\nCLERK_SECRET_KEY=sk_live_your_key_here\n\nOPENAI_API_KEY=sk-proj-your_key_here`, 'step-3')}
+OPENAI_API_KEY=sk-proj-your_key_here
+
+# Vercel Blob — required for document uploads
+BLOB_READ_WRITE_TOKEN=vercel_blob_rw_xxxxxxxxxxxx
+
+# Inngest — use a placeholder for local dev
+INNGEST_EVENT_KEY=dev_placeholder`}
+ onCopy={() => copyToClipboard(`DATABASE_URL="postgresql://user:password@host:5432/database?sslmode=require"\n\nNEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_your_key_here\nCLERK_SECRET_KEY=sk_live_your_key_here\n\nOPENAI_API_KEY=sk-proj-your_key_here\n\nBLOB_READ_WRITE_TOKEN=vercel_blob_rw_xxxxxxxxxxxx\n\nINNGEST_EVENT_KEY=dev_placeholder`, 'step-3')}
copied={copiedCode === 'step-3'}
darkMode={darkMode}
/>
@@ -302,13 +314,20 @@ OPENAI_API_KEY=sk-proj-your_key_here`}
- {/* ── More ways to integrate ── */}
+ {/* ── Required integrations ── */}
-
+
+ }
+ title="Vercel Blob"
+ description="Cloud file storage for document uploads. Required — there is no database fallback."
+ cta="Set up Vercel Blob"
+ darkMode={darkMode}
+ />
}
title="Inngest"
@@ -316,6 +335,16 @@ OPENAI_API_KEY=sk-proj-your_key_here`}
cta="Set up Inngest"
darkMode={darkMode}
/>
+
+
+
+ {/* ── Optional integrations ── */}
+
+
}
title="LangChain Tracing"
diff --git a/src/app/deployment/components/sections/UploadThingPage.tsx b/src/app/deployment/components/sections/UploadThingPage.tsx
index 3ebd03e1..28312e16 100644
--- a/src/app/deployment/components/sections/UploadThingPage.tsx
+++ b/src/app/deployment/components/sections/UploadThingPage.tsx
@@ -60,31 +60,31 @@ export const UploadThingPage: React.FC
= ({
-
+
- PDR AI supports two storage methods for uploaded documents:
+ PDR AI supports two cloud storage backends for uploaded documents. Vercel Blob is the default and required; UploadThing is an optional alternative.
-
-
- Cloud Storage (UploadThing)
+
+
+ Vercel Blob (Default)
- • Better performance for large files
- • CDN-backed delivery
- • Requires UploadThing account
- • Recommended for production
+ • Required — used by default for all uploads
+ • Native Vercel integration
+ • Edge-optimized delivery
+ • Public & private store support
-
-
- Database Storage
+
+
+ UploadThing (Optional)
- • No external service required
- • Files stored in PostgreSQL
- • Good for development/testing
- • Works offline
+ • Optional alternative upload path
+ • CDN-backed delivery
+ • Type-safe file routing
+ • Requires UploadThing account
@@ -126,17 +126,17 @@ export const UploadThingPage: React.FC = ({
- Step 4: Enable Cloud Storage
+ Step 4: Select UploadThing in the Upload Page
- Once configured, you can toggle between Cloud and Database storage on the upload page.
+ Once configured, you can toggle between Vercel Blob and UploadThing in the storage method selector on the upload page.
The preference is saved per-company and persists across sessions.
diff --git a/src/app/deployment/components/sections/VercelBlobPage.tsx b/src/app/deployment/components/sections/VercelBlobPage.tsx
new file mode 100644
index 00000000..d1053b22
--- /dev/null
+++ b/src/app/deployment/components/sections/VercelBlobPage.tsx
@@ -0,0 +1,224 @@
+'use client';
+
+import React from 'react';
+import { motion } from 'motion/react';
+import { Database, Check, ExternalLink } from 'lucide-react';
+import type { DeploymentProps } from '../../types';
+import { Section, CodeBlock, WarningBox, InfoBox } from '../ui';
+
+export const VercelBlobPage: React.FC
= ({
+ darkMode,
+ copyToClipboard,
+ copiedCode,
+}) => {
+ return (
+ <>
+
+
+
+ Required Integration
+
+
+
+ Vercel Blob Storage
+
+
+ Cloud file storage for document uploads, powered by Vercel's edge-optimized blob store.
+
+
+
+
+
+ Vercel Blob is a serverless file storage service that integrates natively with Vercel deployments. PDR AI uses it to store uploaded documents with:
+
+
+
+
+ Edge-optimized file delivery with global CDN
+
+
+
+ Automatic public or private access mode detection
+
+
+
+ Bearer-token authentication for private blobs
+
+
+
+ No additional infrastructure — works out of the box on Vercel
+
+
+
+
+
+
+
+
+
+ Vercel Blob (Required)
+
+
+ • Used for all document uploads
+ • Edge-optimized delivery
+ • Public & private store support
+ • Works on Vercel and non-Vercel hosts
+
+
+
+
+ UploadThing (Optional alternative)
+
+
+ • Optional cloud upload path
+ • CDN-backed delivery
+ • Vercel Blob is still needed for retrieval
+ • See the UploadThing page for setup
+
+
+
+
+
+
+
+
+
+ Step 1: Create a Blob Store in Vercel
+
+
+ Open your project in the{' '}
+
+ Vercel Dashboard
+
+ , then navigate to Storage → Create Database → Blob.
+
+
+ Choose a name for your store (e.g. pdr-ai-documents)
+ and select a region close to your deployment.
+
+
+
+
+
+ Step 2: Connect the Store to Your Project
+
+
+ In the blob store settings, click Connect Project and select your PDR AI project.
+ Vercel will automatically inject the BLOB_READ_WRITE_TOKEN environment variable into your deployment.
+
+
+
+
+
+ Step 3: Add to Local Environment (for development)
+
+
+ Copy the token from your blob store's settings page and add it to your local .env file:
+
+
copyToClipboard('BLOB_READ_WRITE_TOKEN=vercel_blob_rw_xxxxxxxxxxxx', 'blob-env')}
+ copied={copiedCode === 'blob-env'}
+ darkMode={darkMode}
+ />
+
+ You can also pull your Vercel env variables locally with:
+
+ copyToClipboard('vercel env pull .env.local', 'blob-env-pull')}
+ copied={copiedCode === 'blob-env-pull'}
+ darkMode={darkMode}
+ />
+
+
+
+
+ Step 4: Deploy
+
+
+ Once connected, push your code or trigger a redeploy. PDR AI will automatically detect the token
+ and use Vercel Blob for document storage. No code changes are needed.
+
+
+
+
+
+
+
+
+ } darkMode={darkMode}>
+
+ PDR AI automatically detects whether your blob store is configured as public or private.
+ It first attempts a public upload — if your store only allows private access, it retries with private mode
+ and caches the result for subsequent uploads.
+
+
+ Public stores — files are served directly via a CDN URL. Simpler, faster delivery.
+ Private stores — files require a Bearer token to access. PDR AI handles this automatically when fetching documents.
+
+
+
+
+
+
+ When a document is uploaded, PDR AI:
+
+
+ {[
+ { step: '1', text: 'Sanitizes the filename and generates a unique storage key' },
+ { step: '2', text: 'Uploads the file buffer to Vercel Blob with the detected access mode' },
+ { step: '3', text: 'Stores the blob URL and metadata in the database for retrieval' },
+ { step: '4', text: 'For private blobs, injects the Bearer token when fetching the document later' },
+ ].map((s) => (
+
+
+ {s.step}
+
+
{s.text}
+
+ ))}
+
+
+
+
+
+ Useful Vercel CLI commands for managing your blob store:
+
+
+
+
Pull env variables locally
+
copyToClipboard('vercel env pull .env.local', 'cli-pull')}
+ copied={copiedCode === 'cli-pull'}
+ darkMode={darkMode}
+ />
+
+
+
List linked storage
+
copyToClipboard('vercel storage ls', 'cli-ls')}
+ copied={copiedCode === 'cli-ls'}
+ darkMode={darkMode}
+ />
+
+
+
+ >
+ );
+};
diff --git a/src/app/deployment/components/sections/VercelDeploymentPage.tsx b/src/app/deployment/components/sections/VercelDeploymentPage.tsx
index e2e9dae3..66f30441 100644
--- a/src/app/deployment/components/sections/VercelDeploymentPage.tsx
+++ b/src/app/deployment/components/sections/VercelDeploymentPage.tsx
@@ -7,11 +7,9 @@ import {
Database,
Settings,
Globe,
- CheckCircle2,
ShieldAlert,
ExternalLink,
Play,
- Video,
} from 'lucide-react';
import type { DeploymentProps } from '../../types';
import { Section, Step } from '../ui';
@@ -169,19 +167,39 @@ export const VercelDeploymentPage: React.FC = ({
NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_xxx
CLERK_SECRET_KEY=sk_live_xxx
OPENAI_API_KEY=sk-proj-xxx
-INNGEST_EVENT_KEY=evt_xxx`}
- onCopy={() => copyToClipboard(`DATABASE_URL=postgresql://\nNEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_xxx\nCLERK_SECRET_KEY=sk_live_xxx\nOPENAI_API_KEY=sk-proj-xxx\nINNGEST_EVENT_KEY=evt_xxx`, 'v-2')}
+INNGEST_EVENT_KEY=evt_xxx
+BLOB_READ_WRITE_TOKEN=vercel_blob_rw_xxx`}
+ onCopy={() => copyToClipboard(`DATABASE_URL=postgresql://\nNEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_xxx\nCLERK_SECRET_KEY=sk_live_xxx\nOPENAI_API_KEY=sk-proj-xxx\nINNGEST_EVENT_KEY=evt_xxx\nBLOB_READ_WRITE_TOKEN=vercel_blob_rw_xxx`, 'v-2')}
copied={copiedCode === 'v-2'}
darkMode={darkMode}
/>
copyToClipboard('https://vercel.com/dashboard', 'v-blob')}
+ copied={copiedCode === 'v-blob'}
+ darkMode={darkMode}
+ >
+
+
+ Vercel Dashboard → Storage
+
+
+
+ copyToClipboard('https://vercel.com/dashboard', 'v-3')}
@@ -200,7 +218,7 @@ INNGEST_EVENT_KEY=evt_xxx`}
copyToClipboard('https://your-app.vercel.app\nhttps://your-app.vercel.app/sign-in\nhttps://your-app.vercel.app/dashboard', 'v-4')}
@@ -231,9 +249,10 @@ INNGEST_EVENT_KEY=evt_xxx`}
{[
['Auth works', 'Sign in and sign out on production domain'],
- ['Database connected', 'Upload a document — no connection errors in Vercel logs'],
+ ['Database connected', 'Check Vercel logs for successful DB queries, no ETIMEDOUT errors'],
+ ['Blob storage', 'Upload a document — check it stores successfully (no MissingBlobTokenError)'],
['Document Q&A', 'Ask a question against an uploaded document'],
- ['Background jobs', 'If INNGEST_EVENT_KEY is set, trigger a processing pipeline'],
+ ['Background jobs', 'Upload a document and verify the Inngest pipeline runs in the Inngest dashboard'],
].map(([check, how]) => (
{check}
diff --git a/src/app/deployment/components/sections/index.ts b/src/app/deployment/components/sections/index.ts
index 7de8b572..d6787efa 100644
--- a/src/app/deployment/components/sections/index.ts
+++ b/src/app/deployment/components/sections/index.ts
@@ -6,6 +6,7 @@ export { InngestPage } from './InngestPage';
export { LangChainPage } from './LangChainPage';
export { TavilyPage } from './TavilyPage';
export { UploadThingPage } from './UploadThingPage';
+export { VercelBlobPage } from './VercelBlobPage';
export { OCRAzurePage } from './OCRAzurePage';
export { OCRLandingPage } from './OCRLandingPage';
export { OCRDatalabPage } from './OCRDatalabPage';
diff --git a/src/app/deployment/page.tsx b/src/app/deployment/page.tsx
index cf2c08ca..f8541d24 100644
--- a/src/app/deployment/page.tsx
+++ b/src/app/deployment/page.tsx
@@ -1,7 +1,8 @@
'use client';
-import React, { useEffect, useState } from 'react';
+import React, { Suspense, useCallback, useEffect, useState } from 'react';
import { useTheme } from 'next-themes';
+import { useSearchParams } from 'next/navigation';
import { DeploymentNavbar } from './components/DeploymentNavbar';
import { DeploymentSidebar } from './components/DeploymentSidebar';
import {
@@ -13,6 +14,7 @@ import {
LangChainPage,
TavilyPage,
UploadThingPage,
+ VercelBlobPage,
OCRAzurePage,
OCRLandingPage,
OCRDatalabPage,
@@ -21,6 +23,21 @@ import {
import type { DeploymentSection } from './types';
import { SECTIONS } from './types';
+const VALID_SECTIONS = new Set(
+ SECTIONS.flatMap(s => [s.id, ...(s.children?.map(c => c.id) ?? [])])
+);
+
+function SectionFromParams({ onSection }: { onSection: (s: DeploymentSection) => void }) {
+ const searchParams = useSearchParams();
+ useEffect(() => {
+ const section = searchParams.get('section');
+ if (section && VALID_SECTIONS.has(section)) {
+ onSection(section as DeploymentSection);
+ }
+ }, [searchParams, onSection]);
+ return null;
+}
+
const DeploymentPage = () => {
const [mounted, setMounted] = useState(false);
const [copiedCode, setCopiedCode] = useState(null);
@@ -37,6 +54,13 @@ const DeploymentPage = () => {
setMounted(true);
}, []);
+ const handleSectionFromParams = useCallback((section: DeploymentSection) => {
+ setActiveSection(section);
+ if (section.startsWith('ocr-')) {
+ setExpandedSections(prev => prev.includes('ocr') ? prev : [...prev, 'ocr']);
+ }
+ }, []);
+
if (!mounted) {
return
;
}
@@ -97,6 +121,8 @@ const DeploymentPage = () => {
return ;
case 'uploadthing':
return ;
+ case 'vercel-blob':
+ return ;
case 'ocr':
case 'ocr-azure':
return ;
@@ -113,6 +139,9 @@ const DeploymentPage = () => {
return (
+
+
+
{/* Top Navigation */}
;
+ return (
+
+
+
+ );
}
diff --git a/src/app/employer/documents/components/AgentChatInterface.tsx b/src/app/employer/documents/components/AgentChatInterface.tsx
index f389c376..3e3790a5 100644
--- a/src/app/employer/documents/components/AgentChatInterface.tsx
+++ b/src/app/employer/documents/components/AgentChatInterface.tsx
@@ -141,9 +141,9 @@ export const AgentChatInterface: React.FC
= ({
useEffect(() => {
if (textareaRef.current) {
- textareaRef.current.style.height = '52px';
+ textareaRef.current.style.height = '48px';
const scrollHeight = textareaRef.current.scrollHeight;
- textareaRef.current.style.height = `${Math.min(Math.max(scrollHeight, 52), 180)}px`;
+ textareaRef.current.style.height = `${Math.min(Math.max(scrollHeight, 48), 180)}px`;
}
}, [input]);
@@ -291,32 +291,28 @@ export const AgentChatInterface: React.FC = ({
};
return (
-
+
{/* Messages */}
{messages.length === 0 ? (
-
-
-
+
+
+
-
-
- Start a conversation
-
-
+
Start a conversation
+
Ask me anything about {searchScope === 'document' ? (selectedDocTitle ?? 'your document') : 'all your company documents'}. I'm here to help!
-
- {/* Quick prompts */}
-
+
{[
"Summarize the key points",
"What are the main takeaways?",
@@ -325,7 +321,7 @@ export const AgentChatInterface: React.FC
= ({
setInput(prompt)}
- className="px-3 py-1.5 text-xs font-medium text-slate-600 dark:text-slate-400 bg-slate-100 dark:bg-slate-800 rounded-full hover:bg-violet-100 dark:hover:bg-violet-900/30 hover:text-violet-600 dark:hover:text-violet-400 transition-all"
+ className="px-3 py-1.5 text-xs font-medium text-muted-foreground bg-muted rounded-full hover:bg-purple-50 dark:hover:bg-purple-900/20 hover:text-purple-600 dark:hover:text-purple-400 transition-all"
>
{prompt}
@@ -345,42 +341,38 @@ export const AgentChatInterface: React.FC = ({
{/* AI Avatar */}
{msg.role === 'assistant' && (
-
-
+
+
)}
-
+
{msg.role === 'assistant' ? (
) : (
-
- {displayText}
-
+
{displayText}
)}
-
+
{msg.role === 'assistant' && typeof msg.content === 'object' && msg.content !== null && (
<>
{/* Source References */}
{'references' in msg.content && Array.isArray(msg.content.references) && msg.content.references.length > 0 && (
-
-
+
+
Page References
@@ -393,7 +385,7 @@ export const AgentChatInterface: React.FC
= ({
onPageClick?.(reference.page);
}
}}
- className="inline-flex items-center bg-violet-100 dark:bg-violet-900/40 text-violet-700 dark:text-violet-300 px-2.5 py-1 rounded-lg text-xs font-semibold hover:bg-violet-200 dark:hover:bg-violet-800/50 transition-all"
+ className="inline-flex items-center bg-purple-100 dark:bg-purple-900/40 text-purple-700 dark:text-purple-300 px-2.5 py-1 rounded-md text-xs font-semibold hover:bg-purple-200 dark:hover:bg-purple-800/50 transition-all"
>
{reference.page ? `Page ${reference.page}` : "Highlight Source"}
@@ -405,8 +397,8 @@ export const AgentChatInterface: React.FC = ({
{/* Legacy page references fallback */}
{(!('references' in msg.content) || !Array.isArray(msg.content.references) || msg.content.references.length === 0) &&
'pages' in msg.content && Array.isArray(msg.content.pages) && msg.content.pages.length > 0 && (
-
-
+
+
Referenced Pages
@@ -414,7 +406,7 @@ export const AgentChatInterface: React.FC
= ({
onPageClick?.(page)}
- className="inline-flex items-center bg-violet-100 dark:bg-violet-900/40 text-violet-700 dark:text-violet-300 px-2.5 py-1 rounded-lg text-xs font-semibold hover:bg-violet-200 dark:hover:bg-violet-800/50 transition-all"
+ className="inline-flex items-center bg-purple-100 dark:bg-purple-900/40 text-purple-700 dark:text-purple-300 px-2.5 py-1 rounded-md text-xs font-semibold hover:bg-purple-200 dark:hover:bg-purple-800/50 transition-all"
>
Page {page}
@@ -425,26 +417,26 @@ export const AgentChatInterface: React.FC = ({
{/* Web Sources */}
{'webSources' in msg.content && Array.isArray(msg.content.webSources) && msg.content.webSources.length > 0 && (
-