From fd25ffb432c521a83a20d8bcce69dfbcada9ead9 Mon Sep 17 00:00:00 2001 From: Arber Xhindoli <14798762+arberx@users.noreply.github.com> Date: Wed, 27 May 2026 12:39:12 -0400 Subject: [PATCH] feat(traffic): scaffold Cloudflare Worker traffic source (foundation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First slice of the Cloudflare adapter — contracts, schema, and the push-receive integration package. The HTTP route + CLI + doctor checks land in a follow-up PR. Why a Worker instead of GraphQL Analytics or Logpush: Cloudflare's GraphQL API is aggregate-only and Logpush is Business+ only, so the Worker is the universal raw-row access path. Also unblocks future "Cloudflare-as-proxy" support for hosts with no native logs. This is the first push-receive traffic source — every existing adapter pulls. Safe because canonry is single-tenant per deployment; the Worker only ever talks to the operator's own canonry instance. Includes: - Zod schemas for the source config, connect request/response, ingest payload, and the per-event shape - integration-cloudflare-worker package with HMAC-SHA256 signature verifier, event → NormalizedTrafficRequest normalizer, and Worker script generator (broad edge-side bot/referer filter; strict classification stays server-side in integration-traffic) - traffic_sources columns ingest_token_hash + last_worker_version (migration v67) - plans/cloudflare-worker-traffic-source.md design doc - Tests written first (TDD): 43 tests in the new package, 26 contract schema cases, 3 DB column round-trip tests Full workspace test passes (3381/3381). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api-routes/test/db-dto-coverage.test.ts | 2 + packages/contracts/src/traffic.ts | 85 +++ packages/contracts/test/traffic.test.ts | 252 +++++++++ packages/db/src/migrate.ts | 18 + packages/db/src/schema.ts | 10 + packages/db/test/traffic-tables.test.ts | 83 +++ .../integration-cloudflare-worker/AGENTS.md | 102 ++++ .../integration-cloudflare-worker/CLAUDE.md | 1 + .../package.json | 22 + .../src/index.ts | 4 + .../src/normalize.ts | 90 +++ .../src/script.ts | 250 +++++++++ .../src/types.ts | 29 + .../src/verify.ts | 65 +++ .../test/normalize.test.ts | 127 +++++ .../test/script.test.ts | 121 ++++ .../test/verify.test.ts | 163 ++++++ .../tsconfig.json | 7 + plans/cloudflare-worker-traffic-source.md | 519 ++++++++++++++++++ pnpm-lock.yaml | 6 + vitest.config.ts | 1 + 21 files changed, 1957 insertions(+) create mode 100644 packages/integration-cloudflare-worker/AGENTS.md create mode 100644 packages/integration-cloudflare-worker/CLAUDE.md create mode 100644 packages/integration-cloudflare-worker/package.json create mode 100644 packages/integration-cloudflare-worker/src/index.ts create mode 100644 packages/integration-cloudflare-worker/src/normalize.ts create mode 100644 packages/integration-cloudflare-worker/src/script.ts create mode 100644 packages/integration-cloudflare-worker/src/types.ts create mode 100644 packages/integration-cloudflare-worker/src/verify.ts create mode 100644 packages/integration-cloudflare-worker/test/normalize.test.ts create mode 100644 packages/integration-cloudflare-worker/test/script.test.ts create mode 100644 packages/integration-cloudflare-worker/test/verify.test.ts create mode 100644 packages/integration-cloudflare-worker/tsconfig.json create mode 100644 plans/cloudflare-worker-traffic-source.md diff --git a/packages/api-routes/test/db-dto-coverage.test.ts b/packages/api-routes/test/db-dto-coverage.test.ts index f3089f60..aefe5afe 100644 --- a/packages/api-routes/test/db-dto-coverage.test.ts +++ b/packages/api-routes/test/db-dto-coverage.test.ts @@ -332,6 +332,8 @@ const COVERAGE: Record = { internal: { lastEventIds: 'Bounded ring buffer of recent event IDs; internal dedup state, not part of the source DTO.', configJson: 'Exposed on the DTO as `config`; the DB column keeps the `Json` suffix for grep-ability.', + ingestTokenHash: 'sha256 of the per-source bearer token issued to push-receive adapters (Cloudflare Worker). Auth-only — never returned via the DTO.', + lastWorkerVersion: 'Semver reported by the most recent forwarded event from a push-receive Worker. Surfaced through the doctor check, not the source DTO.', }, }, diff --git a/packages/contracts/src/traffic.ts b/packages/contracts/src/traffic.ts index 71bbec4e..eb21ed39 100644 --- a/packages/contracts/src/traffic.ts +++ b/packages/contracts/src/traffic.ts @@ -169,6 +169,91 @@ export const trafficConnectVercelRequestSchema = z.object({ }) export type TrafficConnectVercelRequest = z.infer +/** + * Persisted in `traffic_sources.configJson` for `sourceType = 'cloudflare'` + * when the source is a Worker push (the only Cloudflare delivery shape this + * release supports). The per-source bearer token + HMAC secret never live + * here — they go to `~/.canonry/config.yaml` under + * `cloudflareTraffic.connections.`. The DB only carries the + * sha256 hash of the bearer for verification. + */ +export const cloudflareWorkerSourceConfigSchema = z.object({ + schemaVersion: z.literal(1), + /** Semver of the Worker script bundle that was generated at connect/rotate time. */ + workerVersion: z.string().min(1), + /** Identifier of the bot/referer keyword set baked into the deployed Worker. */ + expectedBotListVersion: z.string().min(1), + /** Operator-supplied Cloudflare zone id for the deployed Worker. Optional in Phase 1. */ + zoneId: z.string().nullable(), + /** Operator-supplied Cloudflare account id. Optional in Phase 1; required for Phase 2 auto-deploy. */ + accountId: z.string().nullable(), +}) +export type CloudflareWorkerSourceConfig = z.infer + +export const trafficConnectCloudflareRequestSchema = z.object({ + displayName: z.string().min(1).optional(), + /** Cloudflare zone id of the deployed Worker (informational; not validated against Cloudflare). */ + zoneId: z.string().min(1).optional(), + /** Cloudflare account id (informational; required when Phase 2 auto-deploy lands). */ + accountId: z.string().min(1).optional(), +}) +export type TrafficConnectCloudflareRequest = z.infer + +/** + * Returned by `POST /traffic/connect/cloudflare`. The operator deploys the + * generated Worker script to their Cloudflare zone; the embedded bearer + + * HMAC secret authenticate every subsequent ingest request. + */ +export const trafficConnectCloudflareResponseSchema = z.object({ + sourceId: z.string().min(1), + workerScript: z.string().min(1), + wranglerToml: z.string().min(1), + workerVersion: z.string().min(1), + instructions: z.string().min(1), +}) +export type TrafficConnectCloudflareResponse = z.infer + +/** + * One event row inside a `cloudflareWorkerIngestRequest`. Field shape mirrors + * what a Cloudflare Worker can pull off a `Request` (`request.url`, + * `request.headers`, `request.cf`). Every non-mandatory field is nullable — + * `cf.*` properties depend on the customer's plan tier and are absent on + * free/Pro plans without Bot Management. + */ +export const cloudflareWorkerEventSchema = z.object({ + /** Cloudflare `cf-ray` request id — globally unique per request. */ + eventId: z.string().min(1), + observedAt: z.string().min(1), + method: z.string().nullable(), + host: z.string().nullable(), + path: z.string().min(1), + queryString: z.string().nullable(), + status: z.number().int().nullable(), + userAgent: z.string().nullable(), + remoteIp: z.string().nullable(), + referer: z.string().nullable(), + cf: z.object({ + verifiedBot: z.boolean().nullable(), + botScore: z.number().int().nullable(), + country: z.string().nullable(), + asn: z.number().int().nullable(), + asOrganization: z.string().nullable(), + }).nullable(), +}) +export type CloudflareWorkerEvent = z.infer + +/** + * Body of `POST /api/v1/projects/:name/traffic/cloudflare/ingest`. The + * Worker forwards one event per request in this release; the array shape + * keeps the door open for a future Logpush sibling adapter that batches. + */ +export const cloudflareWorkerIngestRequestSchema = z.object({ + schemaVersion: z.literal(1), + workerVersion: z.string().min(1), + events: z.array(cloudflareWorkerEventSchema).min(1).max(100), +}) +export type CloudflareWorkerIngestRequest = z.infer + export const trafficSyncResponseSchema = z.object({ sourceId: z.string(), runId: z.string(), diff --git a/packages/contracts/test/traffic.test.ts b/packages/contracts/test/traffic.test.ts index 839e61ca..1b8b69dc 100644 --- a/packages/contracts/test/traffic.test.ts +++ b/packages/contracts/test/traffic.test.ts @@ -3,7 +3,12 @@ import { TrafficEventConfidences, TrafficEvidenceKinds, TrafficSourceTypes, + cloudflareWorkerEventSchema, + cloudflareWorkerIngestRequestSchema, + cloudflareWorkerSourceConfigSchema, normalizedTrafficRequestSchema, + trafficConnectCloudflareRequestSchema, + trafficConnectCloudflareResponseSchema, trafficConnectVercelRequestSchema, trafficConnectWordpressRequestSchema, vercelTrafficSourceConfigSchema, @@ -169,3 +174,250 @@ describe('trafficConnectVercelRequestSchema', () => { })).toThrow() }) }) + +describe('cloudflareWorkerSourceConfigSchema', () => { + it('accepts a valid Cloudflare Worker source config', () => { + const parsed = cloudflareWorkerSourceConfigSchema.parse({ + schemaVersion: 1, + workerVersion: '1.0.0', + expectedBotListVersion: '2026-05-27', + zoneId: 'zone_abc123', + accountId: 'acct_xyz789', + }) + expect(parsed.workerVersion).toBe('1.0.0') + expect(parsed.zoneId).toBe('zone_abc123') + }) + + it('allows zoneId and accountId to be null', () => { + const parsed = cloudflareWorkerSourceConfigSchema.parse({ + schemaVersion: 1, + workerVersion: '1.0.0', + expectedBotListVersion: '2026-05-27', + zoneId: null, + accountId: null, + }) + expect(parsed.zoneId).toBeNull() + expect(parsed.accountId).toBeNull() + }) + + it('rejects a schemaVersion other than 1', () => { + expect(() => cloudflareWorkerSourceConfigSchema.parse({ + schemaVersion: 2, + workerVersion: '1.0.0', + expectedBotListVersion: '2026-05-27', + zoneId: null, + accountId: null, + })).toThrow() + }) + + it('rejects an empty workerVersion or expectedBotListVersion', () => { + expect(() => cloudflareWorkerSourceConfigSchema.parse({ + schemaVersion: 1, + workerVersion: '', + expectedBotListVersion: '2026-05-27', + zoneId: null, + accountId: null, + })).toThrow() + expect(() => cloudflareWorkerSourceConfigSchema.parse({ + schemaVersion: 1, + workerVersion: '1.0.0', + expectedBotListVersion: '', + zoneId: null, + accountId: null, + })).toThrow() + }) +}) + +describe('trafficConnectCloudflareRequestSchema', () => { + it('accepts an empty body (all fields optional)', () => { + const parsed = trafficConnectCloudflareRequestSchema.parse({}) + expect(parsed.displayName).toBeUndefined() + expect(parsed.zoneId).toBeUndefined() + expect(parsed.accountId).toBeUndefined() + }) + + it('accepts a connect request with every optional field', () => { + const parsed = trafficConnectCloudflareRequestSchema.parse({ + displayName: 'Example zone', + zoneId: 'zone_abc123', + accountId: 'acct_xyz789', + }) + expect(parsed.displayName).toBe('Example zone') + expect(parsed.zoneId).toBe('zone_abc123') + expect(parsed.accountId).toBe('acct_xyz789') + }) + + it('rejects an empty string for any provided field', () => { + expect(() => trafficConnectCloudflareRequestSchema.parse({ displayName: '' })).toThrow() + expect(() => trafficConnectCloudflareRequestSchema.parse({ zoneId: '' })).toThrow() + expect(() => trafficConnectCloudflareRequestSchema.parse({ accountId: '' })).toThrow() + }) +}) + +describe('trafficConnectCloudflareResponseSchema', () => { + it('accepts a populated response', () => { + const parsed = trafficConnectCloudflareResponseSchema.parse({ + sourceId: 'src_abc123', + workerScript: 'addEventListener("fetch", () => {})', + wranglerToml: 'name = "canonry-worker"', + workerVersion: '1.0.0', + instructions: 'Deploy to your zone', + }) + expect(parsed.sourceId).toBe('src_abc123') + expect(parsed.workerScript).toContain('fetch') + }) + + it('rejects empty string for any required field', () => { + expect(() => trafficConnectCloudflareResponseSchema.parse({ + sourceId: '', + workerScript: 'x', + wranglerToml: 'x', + workerVersion: 'x', + instructions: 'x', + })).toThrow() + }) +}) + +describe('cloudflareWorkerEventSchema', () => { + it('accepts a full event with cf properties populated', () => { + const parsed = cloudflareWorkerEventSchema.parse({ + eventId: '8a3d2b0c-cf-ray', + observedAt: '2026-05-27T15:30:00.123Z', + method: 'GET', + host: 'example.com', + path: '/blog/post', + queryString: 'utm_source=chatgpt', + status: 200, + userAgent: 'GPTBot/1.2', + remoteIp: '20.171.207.34', + referer: 'https://chat.openai.com/', + cf: { + verifiedBot: true, + botScore: 30, + country: 'US', + asn: 8075, + asOrganization: 'Microsoft Corporation', + }, + }) + expect(parsed.eventId).toBe('8a3d2b0c-cf-ray') + expect(parsed.cf?.verifiedBot).toBe(true) + }) + + it('accepts a minimal event with cf=null and most fields null', () => { + const parsed = cloudflareWorkerEventSchema.parse({ + eventId: 'ray-id', + observedAt: '2026-05-27T15:30:00.123Z', + method: null, + host: null, + path: '/', + queryString: null, + status: null, + userAgent: null, + remoteIp: null, + referer: null, + cf: null, + }) + expect(parsed.cf).toBeNull() + expect(parsed.path).toBe('/') + }) + + it('rejects an empty path', () => { + expect(() => cloudflareWorkerEventSchema.parse({ + eventId: 'ray-id', + observedAt: '2026-05-27T15:30:00.123Z', + method: null, + host: null, + path: '', + queryString: null, + status: null, + userAgent: null, + remoteIp: null, + referer: null, + cf: null, + })).toThrow() + }) + + it('rejects an empty eventId', () => { + expect(() => cloudflareWorkerEventSchema.parse({ + eventId: '', + observedAt: '2026-05-27T15:30:00.123Z', + method: null, + host: null, + path: '/', + queryString: null, + status: null, + userAgent: null, + remoteIp: null, + referer: null, + cf: null, + })).toThrow() + }) +}) + +describe('cloudflareWorkerIngestRequestSchema', () => { + const validEvent = { + eventId: 'ray-id', + observedAt: '2026-05-27T15:30:00.123Z', + method: 'GET', + host: 'example.com', + path: '/', + queryString: null, + status: 200, + userAgent: 'GPTBot/1.2', + remoteIp: '20.171.207.34', + referer: null, + cf: null, + } + + it('accepts a single-event ingest request', () => { + const parsed = cloudflareWorkerIngestRequestSchema.parse({ + schemaVersion: 1, + workerVersion: '1.0.0', + events: [validEvent], + }) + expect(parsed.events).toHaveLength(1) + }) + + it('accepts an array of up to 100 events', () => { + const events = Array.from({ length: 100 }, (_, i) => ({ ...validEvent, eventId: `ray-${i}` })) + const parsed = cloudflareWorkerIngestRequestSchema.parse({ + schemaVersion: 1, + workerVersion: '1.0.0', + events, + }) + expect(parsed.events).toHaveLength(100) + }) + + it('rejects an empty events array', () => { + expect(() => cloudflareWorkerIngestRequestSchema.parse({ + schemaVersion: 1, + workerVersion: '1.0.0', + events: [], + })).toThrow() + }) + + it('rejects more than 100 events', () => { + const events = Array.from({ length: 101 }, (_, i) => ({ ...validEvent, eventId: `ray-${i}` })) + expect(() => cloudflareWorkerIngestRequestSchema.parse({ + schemaVersion: 1, + workerVersion: '1.0.0', + events, + })).toThrow() + }) + + it('rejects a non-1 schemaVersion', () => { + expect(() => cloudflareWorkerIngestRequestSchema.parse({ + schemaVersion: 2, + workerVersion: '1.0.0', + events: [validEvent], + })).toThrow() + }) + + it('rejects an empty workerVersion', () => { + expect(() => cloudflareWorkerIngestRequestSchema.parse({ + schemaVersion: 1, + workerVersion: '', + events: [validEvent], + })).toThrow() + }) +}) diff --git a/packages/db/src/migrate.ts b/packages/db/src/migrate.ts index 06e0fd9c..d612da68 100644 --- a/packages/db/src/migrate.ts +++ b/packages/db/src/migrate.ts @@ -1431,6 +1431,24 @@ export const MIGRATION_VERSIONS: ReadonlyArray = [ } }, }, + { + version: 67, + name: 'traffic-sources-cloudflare-worker-columns', + // Push-receive traffic sources (currently only `cloudflare`) need a + // per-source bearer for the Worker to authenticate against canonry's + // ingest endpoint, plus a place to remember the deployed Worker + // version so `cloudflare.worker.version-stale` can compare. The + // cleartext bearer + HMAC secret stay in ~/.canonry/config.yaml under + // `cloudflareTraffic.connections.`; only the sha256 of the + // bearer is stored here. + // + // Idempotent: `ALTER TABLE ADD COLUMN` errors with "duplicate column + // name" on retry, which the runner already swallows. + statements: [ + `ALTER TABLE traffic_sources ADD COLUMN ingest_token_hash TEXT`, + `ALTER TABLE traffic_sources ADD COLUMN last_worker_version TEXT`, + ], + }, ] /** diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts index cc1d5715..a09dae4a 100644 --- a/packages/db/src/schema.ts +++ b/packages/db/src/schema.ts @@ -625,6 +625,16 @@ export const trafficSources = sqliteTable('traffic_sources', { lastEventIds: text('last_event_ids', { mode: 'json' }).$type(), archivedAt: text('archived_at'), configJson: text('config_json', { mode: 'json' }).$type>().notNull().default({}), + // sha256 hex of the per-source bearer token issued at connect time. Only + // populated for push-receive source types (currently `cloudflare`); pull + // adapters leave this NULL. The cleartext bearer + HMAC secret never live + // in the DB — they go to `~/.canonry/config.yaml` under the per-type + // connections block. + ingestTokenHash: text('ingest_token_hash'), + // Semver reported by the most recent forwarded event. Drives the + // `cloudflare.worker.version-stale` doctor check. NULL until the first + // event arrives or for source types that don't forward versioned events. + lastWorkerVersion: text('last_worker_version'), createdAt: text('created_at').notNull(), updatedAt: text('updated_at').notNull(), }, (table) => [ diff --git a/packages/db/test/traffic-tables.test.ts b/packages/db/test/traffic-tables.test.ts index 7c8c825f..ffac7d03 100644 --- a/packages/db/test/traffic-tables.test.ts +++ b/packages/db/test/traffic-tables.test.ts @@ -493,3 +493,86 @@ test('traffic_sources cascade deletes all dependent rows when project is removed expect(db.select().from(trafficSources).all().length).toBe(0) expect(db.select().from(crawlerEventsHourly).all().length).toBe(0) }) + +test('traffic_sources persists ingest_token_hash and last_worker_version for cloudflare sources', () => { + const { db, tmpDir } = createTempDb() + onTestFinished(() => cleanup(tmpDir)) + + seedProject(db) + + const now = new Date().toISOString() + db.insert(trafficSources).values({ + id: 'src_cf', + projectId: 'proj_1', + sourceType: 'cloudflare', + displayName: 'Cloudflare · example.com', + status: 'connected', + configJson: { + schemaVersion: 1, + workerVersion: '1.0.0', + expectedBotListVersion: '2026-05-27', + zoneId: null, + accountId: null, + }, + ingestTokenHash: 'a'.repeat(64), + lastWorkerVersion: '1.0.0', + createdAt: now, + updatedAt: now, + }).run() + + const [row] = db.select().from(trafficSources).where(eq(trafficSources.id, 'src_cf')).all() + expect(row.sourceType).toBe('cloudflare') + expect(row.ingestTokenHash).toBe('a'.repeat(64)) + expect(row.lastWorkerVersion).toBe('1.0.0') +}) + +test('traffic_sources leaves ingest_token_hash and last_worker_version NULL for pull adapters', () => { + const { db, tmpDir } = createTempDb() + onTestFinished(() => cleanup(tmpDir)) + + seedProject(db) + + const now = new Date().toISOString() + db.insert(trafficSources).values({ + id: 'src_vercel', + projectId: 'proj_1', + sourceType: 'vercel', + displayName: 'Vercel · example.com', + status: 'connected', + configJson: { projectId: 'prj_1', teamId: 'team_1', environment: 'production' }, + createdAt: now, + updatedAt: now, + }).run() + + const [row] = db.select().from(trafficSources).where(eq(trafficSources.id, 'src_vercel')).all() + expect(row.sourceType).toBe('vercel') + expect(row.ingestTokenHash).toBeNull() + expect(row.lastWorkerVersion).toBeNull() +}) + +test('migration 67 adds ingest_token_hash + last_worker_version to a v66 traffic_sources row without losing data', () => { + const { db, tmpDir } = createTempDb() + onTestFinished(() => cleanup(tmpDir)) + + // Confirm v67 is the latest applied migration after createTempDb() + const versions = MIGRATION_VERSIONS.map((v) => v.version) + expect(Math.max(...versions)).toBe(67) + + seedProject(db) + + const now = new Date().toISOString() + db.insert(trafficSources).values({ + id: 'src_legacy', + projectId: 'proj_1', + sourceType: 'cloud-run', + displayName: 'Legacy row written before v67', + status: 'connected', + configJson: { gcpProjectId: 'p', authMode: 'service-account' }, + createdAt: now, + updatedAt: now, + }).run() + + const [row] = db.select().from(trafficSources).where(eq(trafficSources.id, 'src_legacy')).all() + expect(row.ingestTokenHash).toBeNull() + expect(row.lastWorkerVersion).toBeNull() +}) diff --git a/packages/integration-cloudflare-worker/AGENTS.md b/packages/integration-cloudflare-worker/AGENTS.md new file mode 100644 index 00000000..d952fbe7 --- /dev/null +++ b/packages/integration-cloudflare-worker/AGENTS.md @@ -0,0 +1,102 @@ +# integration-cloudflare-worker + +## Purpose + +Cloudflare traffic integration. Generates the JavaScript Worker that the +operator deploys onto their Cloudflare zone; verifies the HMAC-signed +inbound ingest requests it produces; normalizes Worker events into the +provider-neutral `NormalizedTrafficRequest` shape consumed by +`packages/integration-traffic`. + +Unlike the pull adapters (`integration-cloud-run`, `integration-vercel`, +`integration-wordpress-traffic`), this package targets a **push-receive** +delivery model — the customer's Worker `fetch()`-es each filtered request +to a canonry ingest endpoint. The choice is justified by Cloudflare's +data surface: GraphQL Analytics is aggregate-only, Logpush requires +Business+ plans, so Worker push is the only universal access path. + +The push direction is safe because canonry is single-tenant per +deployment — the Worker only ever talks to the operator's own canonry +instance, never to a canonry-hosted SaaS relay. + +## Key Files + +| File | Role | +|------|------| +| `src/script.ts` | `generateWorkerScript` — produces the JS string with embedded source-id, bearer, HMAC secret, version, bot keyword constants. `generateWranglerToml` companion. `DEFAULT_BOT_LIST` is the canonical edge-side keyword set. | +| `src/normalize.ts` | `normalizeCloudflareWorkerEvent` — one ingest event → `NormalizedTrafficRequest`. Returns `null` when path/observedAt/eventId are missing. | +| `src/verify.ts` | `verifyRequestSignature` — timestamp window + HMAC-SHA256 check. Constant-time once inputs are well-formed. | +| `src/types.ts` | `CloudflareWorkerBotList`, `GenerateWorkerScriptOptions` | +| `src/index.ts` | Re-exports public API | + +## Patterns + +- **Edge filter is generic; canonry classifier is strict.** The Worker + forwards on a broad UA keyword match, a broad referer host pattern, or + Cloudflare bot signals (`cf.botManagement.verifiedBot` / + `cf.botManagement.score`). The authoritative bot-id / operator + decisions happen in `packages/integration-traffic` once the event lands + server-side. Updating the strict list does not require a Worker + redeploy; updating the generic list does. +- **Versioned bot list.** `CloudflareWorkerBotList.version` is baked into + the generated script and stored on the source row as + `configJson.expectedBotListVersion`. The Worker reports its + `workerVersion` on every ingest call; the receiver records it on + `traffic_sources.lastWorkerVersion` so the `cloudflare.worker.version-stale` + doctor check can flag drift. +- **HMAC-SHA256 with timestamp binding.** The Worker signs + `timestamp + "." + body` with the per-source HMAC secret and sends + `X-Canonry-Timestamp` + `X-Canonry-Signature`. The receiver verifies a + ±300s window then runs constant-time equality. Failure reasons are + intentionally specific (`timestamp_invalid` / `timestamp_expired` / + `signature_invalid` / `signature_mismatch`) for receiver-side logging, + but **never echoed back to the Worker** — an attacker who knows which + leg failed can enumerate the rest. +- **Bearer + HMAC secrets live in `~/.canonry/config.yaml`.** The DB + stores only the sha256 of the bearer (`traffic_sources.ingestTokenHash`). + The HMAC secret never goes to the DB in any form. Both are inlined into + the Worker script at generation time. +- **`waitUntil` for forwards.** The generated Worker uses + `event.waitUntil(fetch(...))` so the forward never blocks the customer + response. Errors are swallowed — AI traffic is statistical, not + transactional; dropped events are acceptable, and surfacing the failure + would mask the customer response. +- **`cf-ray` as event id.** Cloudflare assigns a unique `cf-ray` per + request. The normalizer namespaces it as `cloudflare-worker:` so + it cannot collide with another adapter's event id. +- **`cf-connecting-ip` enables IP verification.** Cloudflare exposes the + real client IP on every plan via this header, so unlike the Vercel + adapter, Cloudflare-Worker sources can promote `claimed_unverified` → + `verified` via `packages/integration-traffic/src/ip-verify.ts`. +- **No classification, no DB, no I/O.** This package only generates, + normalizes, and verifies. The HTTP route + DB writes live in + `packages/api-routes/src/traffic.ts`. The classifier + rollup live in + `packages/integration-traffic`. + +## Common Mistakes + +- **Echoing the verifier's failure reason in the HTTP response.** Use a + single 401 envelope; do not let the Worker (or anything else) learn + which leg of the auth failed. +- **Putting the HMAC secret in `traffic_sources.configJson`.** Both + shared secrets belong in `~/.canonry/config.yaml`; only the bearer hash + goes to the DB. +- **Adding bot-id or operator classification in this package.** The + classifier lives in `packages/integration-traffic` for one-place rule + evolution across every adapter. +- **Storing or reading the Worker bot list anywhere but `DEFAULT_BOT_LIST`.** + The Worker is regenerated from this constant; updates must rev the + `version` field so the staleness check picks up the drift. + +## See Also + +- `plans/cloudflare-worker-traffic-source.md` — design plan +- `packages/contracts/src/traffic.ts` — `cloudflareWorkerEventSchema`, + `cloudflareWorkerIngestRequestSchema`, + `cloudflareWorkerSourceConfigSchema`, + `trafficConnectCloudflareRequestSchema`, + `trafficConnectCloudflareResponseSchema` +- `packages/integration-traffic/AGENTS.md` — classifier + rollup that the + ingest route hands off to +- `packages/integration-vercel/AGENTS.md` — sibling adapter (pull, not + push) — mirror file layout, different delivery shape diff --git a/packages/integration-cloudflare-worker/CLAUDE.md b/packages/integration-cloudflare-worker/CLAUDE.md new file mode 100644 index 00000000..43c994c2 --- /dev/null +++ b/packages/integration-cloudflare-worker/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/packages/integration-cloudflare-worker/package.json b/packages/integration-cloudflare-worker/package.json new file mode 100644 index 00000000..031f16b6 --- /dev/null +++ b/packages/integration-cloudflare-worker/package.json @@ -0,0 +1,22 @@ +{ + "name": "@ainyc/canonry-integration-cloudflare-worker", + "version": "0.0.0", + "private": true, + "type": "module", + "license": "FSL-1.1-ALv2", + "exports": { + ".": { + "types": "./src/index.ts", + "default": "./src/index.ts" + } + }, + "types": "./src/index.ts", + "scripts": { + "typecheck": "tsc --noEmit -p tsconfig.json", + "test": "vitest run", + "lint": "eslint src/ test/" + }, + "dependencies": { + "@ainyc/canonry-contracts": "workspace:*" + } +} diff --git a/packages/integration-cloudflare-worker/src/index.ts b/packages/integration-cloudflare-worker/src/index.ts new file mode 100644 index 00000000..6c2b8840 --- /dev/null +++ b/packages/integration-cloudflare-worker/src/index.ts @@ -0,0 +1,4 @@ +export * from './normalize.js' +export * from './script.js' +export * from './verify.js' +export type * from './types.js' diff --git a/packages/integration-cloudflare-worker/src/normalize.ts b/packages/integration-cloudflare-worker/src/normalize.ts new file mode 100644 index 00000000..281aeb3a --- /dev/null +++ b/packages/integration-cloudflare-worker/src/normalize.ts @@ -0,0 +1,90 @@ +import { + TrafficEventConfidences, + TrafficEvidenceKinds, + TrafficSourceTypes, + type CloudflareWorkerEvent, + type NormalizedTrafficRequest, +} from '@ainyc/canonry-contracts' + +function emptyToNull(value: string | null): string | null { + if (value === null || value.trim() === '') return null + return value +} + +function maybeLabel(value: string | number | boolean | null): string | undefined { + if (value === null) return undefined + if (typeof value === 'string' && value === '') return undefined + return String(value) +} + +function buildProviderLabels(cf: CloudflareWorkerEvent['cf']): Record { + if (!cf) return {} + const candidates: Record = { + verifiedBot: maybeLabel(cf.verifiedBot), + botScore: maybeLabel(cf.botScore), + country: maybeLabel(cf.country), + asn: maybeLabel(cf.asn), + asOrganization: maybeLabel(cf.asOrganization), + } + return Object.fromEntries( + Object.entries(candidates).filter( + (entry): entry is [string, string] => typeof entry[1] === 'string', + ), + ) +} + +/** + * Convert a Worker-forwarded event into the provider-neutral + * `NormalizedTrafficRequest` consumed by `integration-traffic`. + * + * Returns `null` when the event is missing one of the three minimum fields + * (`path`, `observedAt`, `eventId`) — the inbound Zod schema rejects these + * upstream, but this defensive check lets the normalizer be safely reused + * outside the HTTP path (e.g. against replayed log lines for tests). + * + * Cloudflare-specific signals like `verifiedBot`, `botScore`, `country`, + * and `asn` ride through on `providerLabels` so the classifier can pick + * them up — `integration-traffic/ip-verify` still has the authoritative + * say on the `verified` vs `claimed_unverified` tier, but `verifiedBot` + * is the most reliable signal Cloudflare exposes on the verified bot + * side and downstream code may consume it directly. + */ +export function normalizeCloudflareWorkerEvent( + event: CloudflareWorkerEvent, +): NormalizedTrafficRequest | null { + if (!event.path) return null + if (!event.observedAt) return null + if (!event.eventId) return null + + const host = emptyToNull(event.host) + const path = event.path + const queryString = emptyToNull(event.queryString) + const requestUrl = host + ? `https://${host}${path}${queryString ? `?${queryString}` : ''}` + : null + + return { + sourceType: TrafficSourceTypes.cloudflare, + evidenceKind: TrafficEvidenceKinds['raw-request'], + confidence: TrafficEventConfidences.observed, + eventId: `cloudflare-worker:${event.eventId}`, + observedAt: event.observedAt, + method: emptyToNull(event.method), + requestUrl, + host, + path, + queryString, + status: event.status, + userAgent: emptyToNull(event.userAgent), + remoteIp: emptyToNull(event.remoteIp), + referer: emptyToNull(event.referer), + latencyMs: null, + requestSizeBytes: null, + responseSizeBytes: null, + providerResource: { + type: 'cloudflare_zone', + labels: {}, + }, + providerLabels: buildProviderLabels(event.cf), + } +} diff --git a/packages/integration-cloudflare-worker/src/script.ts b/packages/integration-cloudflare-worker/src/script.ts new file mode 100644 index 00000000..f4769eac --- /dev/null +++ b/packages/integration-cloudflare-worker/src/script.ts @@ -0,0 +1,250 @@ +import type { CloudflareWorkerBotList, GenerateWorkerScriptOptions } from './types.js' + +/** + * Generic edge-side filter list. Intentionally broad — the strict + * bot/referer classification happens server-side in + * `packages/integration-traffic`. Bump `version` whenever this set + * structurally changes so the `cloudflare.worker.version-stale` + * doctor check can flag stale deployments. + */ +export const DEFAULT_BOT_LIST: CloudflareWorkerBotList = { + version: '2026-05-27', + uaKeywords: [ + 'bot', + 'crawler', + 'spider', + 'agent', + 'gpt', + 'claude', + 'ai', + 'perplexity', + 'chatgpt', + 'openai', + 'anthropic', + ], + refererHostSuffixes: [ + '.openai.com', + '.anthropic.com', + '.perplexity.ai', + '.you.com', + '.phind.com', + ], + refererHostKeywords: ['gpt', 'claude', 'chat', 'perplexity', 'copilot'], +} + +const DEFAULT_BOT_SCORE_MAX_FORWARD = 30 +const WORKER_COMPATIBILITY_DATE = '2026-05-01' + +function jsString(value: string): string { + return JSON.stringify(value) +} + +function jsArray(values: readonly string[]): string { + return `[${values.map((v) => jsString(v)).join(', ')}]` +} + +/** + * Render the JavaScript source the operator drops into a Cloudflare zone. + * The script runs on every request, applies a broad edge-side filter, and + * `fetch()`-es each match to the configured canonry ingest URL via + * `event.waitUntil` so the forward never blocks the response. + * + * Auth: each forward carries a bearer token plus an HMAC-SHA256 signature + * over `timestamp + "." + body`. Both secrets are embedded at generation + * time — the operator never sees them in cleartext after the connect + * response is consumed. + */ +export function generateWorkerScript(opts: GenerateWorkerScriptOptions): string { + const botScoreMax = opts.botScoreMaxForward ?? DEFAULT_BOT_SCORE_MAX_FORWARD + + return `// canonry traffic Worker — generated; do not edit by hand +// source: ${opts.sourceId} +// worker version: ${opts.workerVersion} +// bot-list version: ${opts.botList.version} + +const CANONRY_SOURCE_ID = ${jsString(opts.sourceId)} +const CANONRY_INGEST_URL = ${jsString(opts.ingestUrl)} +const CANONRY_BEARER_TOKEN = ${jsString(opts.bearerToken)} +const CANONRY_HMAC_SECRET = ${jsString(opts.hmacSecret)} +const CANONRY_WORKER_VERSION = ${jsString(opts.workerVersion)} +const CANONRY_BOT_LIST_VERSION = ${jsString(opts.botList.version)} +const UA_KEYWORDS = ${jsArray(opts.botList.uaKeywords)} +const REFERER_HOST_SUFFIXES = ${jsArray(opts.botList.refererHostSuffixes)} +const REFERER_HOST_KEYWORDS = ${jsArray(opts.botList.refererHostKeywords)} +const BOT_SCORE_MAX_FORWARD = ${String(botScoreMax)} + +function lower(value) { + return typeof value === 'string' ? value.toLowerCase() : '' +} + +function uaMatches(ua) { + const lc = lower(ua) + if (!lc) return false + for (const kw of UA_KEYWORDS) { + if (lc.indexOf(kw) !== -1) return true + } + return false +} + +function refererMatches(referer) { + if (!referer) return false + let host = '' + try { + host = new URL(referer).hostname.toLowerCase() + } catch (_) { + return false + } + for (const suffix of REFERER_HOST_SUFFIXES) { + if (host.endsWith(suffix)) return true + } + for (const kw of REFERER_HOST_KEYWORDS) { + if (host.indexOf(kw) !== -1) return true + } + return false +} + +function botSignals(cf) { + if (!cf) return false + const bm = cf.botManagement + if (bm) { + if (bm.verifiedBot === true) return true + if (typeof bm.score === 'number' && bm.score < BOT_SCORE_MAX_FORWARD) return true + } + if (typeof cf.botScore === 'number' && cf.botScore < BOT_SCORE_MAX_FORWARD) return true + return false +} + +function shouldForward(request) { + const ua = request.headers.get('user-agent') || '' + if (uaMatches(ua)) return true + const referer = request.headers.get('referer') || '' + if (refererMatches(referer)) return true + return botSignals(request.cf) +} + +function toHex(buffer) { + const bytes = new Uint8Array(buffer) + let out = '' + for (let i = 0; i < bytes.length; i++) { + out += bytes[i].toString(16).padStart(2, '0') + } + return out +} + +async function signBody(timestamp, body) { + const key = await crypto.subtle.importKey( + 'raw', + new TextEncoder().encode(CANONRY_HMAC_SECRET), + { name: 'HMAC', hash: 'SHA-256' }, + false, + ['sign'], + ) + const sig = await crypto.subtle.sign( + 'HMAC', + key, + new TextEncoder().encode(timestamp + '.' + body), + ) + return toHex(sig) +} + +function pickCf(cf) { + if (!cf) return null + const bm = cf.botManagement || {} + return { + verifiedBot: typeof bm.verifiedBot === 'boolean' ? bm.verifiedBot : null, + botScore: typeof bm.score === 'number' ? bm.score : (typeof cf.botScore === 'number' ? cf.botScore : null), + country: typeof cf.country === 'string' ? cf.country : null, + asn: typeof cf.asn === 'number' ? cf.asn : null, + asOrganization: typeof cf.asOrganization === 'string' ? cf.asOrganization : null, + } +} + +function buildEvent(request) { + const url = new URL(request.url) + return { + eventId: request.headers.get('cf-ray') || crypto.randomUUID(), + observedAt: new Date().toISOString(), + method: request.method || null, + host: url.hostname || null, + path: url.pathname || '/', + queryString: url.search ? url.search.slice(1) : null, + status: null, + userAgent: request.headers.get('user-agent') || null, + remoteIp: request.headers.get('cf-connecting-ip') || null, + referer: request.headers.get('referer') || null, + cf: pickCf(request.cf), + } +} + +async function forward(event, request, status) { + try { + const payload = buildEvent(request) + payload.status = typeof status === 'number' ? status : payload.status + const body = JSON.stringify({ + schemaVersion: 1, + workerVersion: CANONRY_WORKER_VERSION, + events: [payload], + }) + const timestamp = String(Math.floor(Date.now() / 1000)) + const signature = await signBody(timestamp, body) + await fetch(CANONRY_INGEST_URL, { + method: 'POST', + headers: { + 'content-type': 'application/json', + 'Authorization': 'Bearer ' + CANONRY_BEARER_TOKEN, + 'X-Canonry-Timestamp': timestamp, + 'X-Canonry-Signature': signature, + 'X-Canonry-Worker-Version': CANONRY_WORKER_VERSION, + 'X-Canonry-Source-Id': CANONRY_SOURCE_ID, + }, + body, + }) + } catch (_err) { + // Swallow — AI traffic is statistical, not transactional. Dropped + // events are acceptable; surfacing the error would mask the customer + // response. Cloudflare's own Worker logs capture the failure. + } +} + +addEventListener('fetch', (event) => { + const request = event.request + const shouldLog = shouldForward(request) + const responsePromise = fetch(request) + event.respondWith( + responsePromise.then((response) => { + if (shouldLog) { + event.waitUntil(forward(event, request, response.status)) + } + return response + }).catch((err) => { + if (shouldLog) { + event.waitUntil(forward(event, request, null)) + } + throw err + }), + ) +}) +` +} + +export interface GenerateWranglerTomlOptions { + sourceId: string +} + +/** + * Companion `wrangler.toml` for operators who prefer `wrangler deploy` + * over pasting into the Cloudflare dashboard. + */ +export function generateWranglerToml(opts: GenerateWranglerTomlOptions): string { + return `name = "canonry-traffic-${opts.sourceId}" +main = "worker.js" +compatibility_date = "${WORKER_COMPATIBILITY_DATE}" + +# Edit and deploy this Worker via: +# wrangler deploy +# After deploy, route it at: +# *your-domain.com/* +# in the Cloudflare dashboard or via: +# wrangler route add "*your-domain.com/*" +` +} diff --git a/packages/integration-cloudflare-worker/src/types.ts b/packages/integration-cloudflare-worker/src/types.ts new file mode 100644 index 00000000..f8176a8c --- /dev/null +++ b/packages/integration-cloudflare-worker/src/types.ts @@ -0,0 +1,29 @@ +/** + * Bot list manifest baked into the generated Worker script. Bumping + * `version` means the deployed Worker is out of date — the + * `cloudflare.worker.version-stale` doctor check reads this field on the + * source row and compares it against the current package constant. + * + * Edge-side classification is intentionally broad — the server-side + * classifier in `packages/integration-traffic` does the real + * bot-id/operator decisions. Keep this list large enough to catch any + * AI-related signal even when canonry doesn't yet have a specific rule + * for it. + */ +export interface CloudflareWorkerBotList { + version: string + uaKeywords: readonly string[] + refererHostSuffixes: readonly string[] + refererHostKeywords: readonly string[] +} + +export interface GenerateWorkerScriptOptions { + sourceId: string + ingestUrl: string + bearerToken: string + hmacSecret: string + workerVersion: string + botList: CloudflareWorkerBotList + /** Optional `cf.botManagement.score` threshold below which to forward. */ + botScoreMaxForward?: number +} diff --git a/packages/integration-cloudflare-worker/src/verify.ts b/packages/integration-cloudflare-worker/src/verify.ts new file mode 100644 index 00000000..fe0f79bd --- /dev/null +++ b/packages/integration-cloudflare-worker/src/verify.ts @@ -0,0 +1,65 @@ +import { createHmac, timingSafeEqual } from 'node:crypto' + +const DEFAULT_MAX_AGE_SECONDS = 300 + +export type VerifyRequestSignatureFailureReason = + | 'timestamp_invalid' + | 'timestamp_expired' + | 'signature_invalid' + | 'signature_mismatch' + +export type VerifyRequestSignatureResult = + | { ok: true } + | { ok: false; reason: VerifyRequestSignatureFailureReason } + +export interface VerifyRequestSignatureOptions { + timestamp: string + signature: string + body: string + secret: string + /** Override for tests; defaults to `Date.now() / 1000`. */ + nowSeconds?: number + /** Acceptable clock skew on either side of `nowSeconds`. */ + maxAgeSeconds?: number +} + +/** + * Verify the HMAC-SHA256 signature on a Cloudflare Worker → canonry ingest + * request. The signing convention is `hex(hmac_sha256(secret, timestamp + "." + body))` + * over the request's `X-Canonry-Timestamp` (Unix seconds) and the raw body + * string. Verification is constant-time once the inputs are well-formed. + * + * Failure reasons are intentionally specific (`timestamp_invalid` vs + * `timestamp_expired` vs `signature_invalid` vs `signature_mismatch`) so + * the receiver can log and rate-limit appropriately, but the caller MUST + * NOT echo the reason back to the Worker — exposing whether the failure + * was bearer/HMAC/timestamp lets an attacker enumerate which leg of the + * auth they're missing. + */ +export function verifyRequestSignature(opts: VerifyRequestSignatureOptions): VerifyRequestSignatureResult { + const { timestamp, signature, body, secret } = opts + if (timestamp === '' || !/^-?\d+$/.test(timestamp)) { + return { ok: false, reason: 'timestamp_invalid' } + } + const ts = Number(timestamp) + if (!Number.isFinite(ts)) return { ok: false, reason: 'timestamp_invalid' } + + const now = opts.nowSeconds ?? Math.floor(Date.now() / 1000) + const maxAge = opts.maxAgeSeconds ?? DEFAULT_MAX_AGE_SECONDS + if (Math.abs(now - ts) > maxAge) return { ok: false, reason: 'timestamp_expired' } + + if (signature === '' || !/^[0-9a-f]+$/i.test(signature)) { + return { ok: false, reason: 'signature_invalid' } + } + let provided: Buffer + try { + provided = Buffer.from(signature, 'hex') + } catch { + return { ok: false, reason: 'signature_invalid' } + } + + const expected = createHmac('sha256', secret).update(`${timestamp}.${body}`).digest() + if (provided.length !== expected.length) return { ok: false, reason: 'signature_invalid' } + if (!timingSafeEqual(provided, expected)) return { ok: false, reason: 'signature_mismatch' } + return { ok: true } +} diff --git a/packages/integration-cloudflare-worker/test/normalize.test.ts b/packages/integration-cloudflare-worker/test/normalize.test.ts new file mode 100644 index 00000000..d5a5a7f7 --- /dev/null +++ b/packages/integration-cloudflare-worker/test/normalize.test.ts @@ -0,0 +1,127 @@ +import { describe, expect, it } from 'vitest' +import { + TrafficEventConfidences, + TrafficEvidenceKinds, + TrafficSourceTypes, + type CloudflareWorkerEvent, +} from '@ainyc/canonry-contracts' +import { normalizeCloudflareWorkerEvent } from '../src/normalize.js' + +const FULL_EVENT: CloudflareWorkerEvent = { + eventId: '8a3d2b0c-cf-ray', + observedAt: '2026-05-27T15:30:00.123Z', + method: 'GET', + host: 'example.com', + path: '/blog/post', + queryString: 'utm_source=chatgpt', + status: 200, + userAgent: 'GPTBot/1.2', + remoteIp: '20.171.207.34', + referer: 'https://chat.openai.com/', + cf: { + verifiedBot: true, + botScore: 30, + country: 'US', + asn: 8075, + asOrganization: 'Microsoft Corporation', + }, +} + +describe('normalizeCloudflareWorkerEvent', () => { + it('produces a NormalizedTrafficRequest tagged with the cloudflare source type', () => { + const result = normalizeCloudflareWorkerEvent(FULL_EVENT) + expect(result).not.toBeNull() + expect(result?.sourceType).toBe(TrafficSourceTypes.cloudflare) + expect(result?.evidenceKind).toBe(TrafficEvidenceKinds['raw-request']) + expect(result?.confidence).toBe(TrafficEventConfidences.observed) + }) + + it('reconstructs the full request URL when host and queryString are present', () => { + const result = normalizeCloudflareWorkerEvent(FULL_EVENT) + expect(result?.requestUrl).toBe('https://example.com/blog/post?utm_source=chatgpt') + }) + + it('returns a null requestUrl when host is missing', () => { + const result = normalizeCloudflareWorkerEvent({ ...FULL_EVENT, host: null }) + expect(result?.requestUrl).toBeNull() + expect(result?.host).toBeNull() + expect(result?.path).toBe('/blog/post') + }) + + it('omits the query string from requestUrl when queryString is null', () => { + const result = normalizeCloudflareWorkerEvent({ ...FULL_EVENT, queryString: null }) + expect(result?.requestUrl).toBe('https://example.com/blog/post') + }) + + it('preserves remoteIp so IP-range verification can promote claimed → verified', () => { + const result = normalizeCloudflareWorkerEvent(FULL_EVENT) + expect(result?.remoteIp).toBe('20.171.207.34') + }) + + it('passes verifiedBot through provider labels for downstream classification', () => { + const result = normalizeCloudflareWorkerEvent(FULL_EVENT) + expect(result?.providerLabels.verifiedBot).toBe('true') + expect(result?.providerLabels.botScore).toBe('30') + expect(result?.providerLabels.country).toBe('US') + expect(result?.providerLabels.asn).toBe('8075') + expect(result?.providerLabels.asOrganization).toBe('Microsoft Corporation') + }) + + it('handles cf=null without dropping the event', () => { + const result = normalizeCloudflareWorkerEvent({ ...FULL_EVENT, cf: null }) + expect(result).not.toBeNull() + expect(result?.providerLabels).toEqual({}) + }) + + it('omits null cf.* properties from provider labels', () => { + const result = normalizeCloudflareWorkerEvent({ + ...FULL_EVENT, + cf: { verifiedBot: null, botScore: 30, country: null, asn: null, asOrganization: null }, + }) + expect(result?.providerLabels).toEqual({ botScore: '30' }) + }) + + it('returns null when path is missing (defensive — schema already enforces)', () => { + const result = normalizeCloudflareWorkerEvent({ ...FULL_EVENT, path: '' }) + expect(result).toBeNull() + }) + + it('returns null when observedAt is missing', () => { + const result = normalizeCloudflareWorkerEvent({ ...FULL_EVENT, observedAt: '' }) + expect(result).toBeNull() + }) + + it('returns null when eventId is missing', () => { + const result = normalizeCloudflareWorkerEvent({ ...FULL_EVENT, eventId: '' }) + expect(result).toBeNull() + }) + + it('namespaces the eventId so it cannot collide with other adapters', () => { + const result = normalizeCloudflareWorkerEvent(FULL_EVENT) + expect(result?.eventId).toBe('cloudflare-worker:8a3d2b0c-cf-ray') + }) + + it('sets the providerResource type to cloudflare_zone', () => { + const result = normalizeCloudflareWorkerEvent(FULL_EVENT) + expect(result?.providerResource.type).toBe('cloudflare_zone') + }) + + it('threads requestSizeBytes/responseSizeBytes/latencyMs as null (Worker has no native source)', () => { + const result = normalizeCloudflareWorkerEvent(FULL_EVENT) + expect(result?.requestSizeBytes).toBeNull() + expect(result?.responseSizeBytes).toBeNull() + expect(result?.latencyMs).toBeNull() + }) + + it('empties strings (host, queryString) are treated as absent in providerLabels', () => { + const result = normalizeCloudflareWorkerEvent({ + ...FULL_EVENT, + cf: { verifiedBot: false, botScore: 99, country: '', asn: 0, asOrganization: '' }, + }) + expect(result?.providerLabels.country).toBeUndefined() + expect(result?.providerLabels.asOrganization).toBeUndefined() + expect(result?.providerLabels.verifiedBot).toBe('false') + expect(result?.providerLabels.botScore).toBe('99') + expect(result?.providerLabels.asn).toBe('0') + }) +}) diff --git a/packages/integration-cloudflare-worker/test/script.test.ts b/packages/integration-cloudflare-worker/test/script.test.ts new file mode 100644 index 00000000..65216867 --- /dev/null +++ b/packages/integration-cloudflare-worker/test/script.test.ts @@ -0,0 +1,121 @@ +import { describe, expect, it } from 'vitest' +import { + DEFAULT_BOT_LIST, + generateWorkerScript, + generateWranglerToml, +} from '../src/script.js' + +const BASE_OPTS = { + sourceId: 'src_abc123', + ingestUrl: 'https://canonry.example.com/api/v1/projects/foo/traffic/cloudflare/ingest', + bearerToken: 'tok_secret_value', + hmacSecret: 'hmac_secret_value', + workerVersion: '1.0.0', + botList: DEFAULT_BOT_LIST, +} + +describe('generateWorkerScript', () => { + it('produces a non-empty JS string', () => { + const script = generateWorkerScript(BASE_OPTS) + expect(script).toMatch(/addEventListener\s*\(\s*['"]fetch['"]/) + expect(script.length).toBeGreaterThan(500) + }) + + it('embeds every required constant', () => { + const script = generateWorkerScript(BASE_OPTS) + expect(script).toContain('src_abc123') + expect(script).toContain('https://canonry.example.com/api/v1/projects/foo/traffic/cloudflare/ingest') + expect(script).toContain('tok_secret_value') + expect(script).toContain('hmac_secret_value') + expect(script).toContain('1.0.0') + }) + + it('bakes in the bot UA keywords from the supplied bot list', () => { + const script = generateWorkerScript(BASE_OPTS) + for (const keyword of DEFAULT_BOT_LIST.uaKeywords) { + expect(script).toContain(keyword) + } + }) + + it('bakes in the referer host suffixes from the supplied bot list', () => { + const script = generateWorkerScript(BASE_OPTS) + for (const suffix of DEFAULT_BOT_LIST.refererHostSuffixes) { + expect(script).toContain(suffix) + } + }) + + it('records the bot list version somewhere the operator (or doctor) can read it', () => { + const script = generateWorkerScript({ + ...BASE_OPTS, + botList: { ...DEFAULT_BOT_LIST, version: '2099-12-31' }, + }) + expect(script).toContain('2099-12-31') + }) + + it('uses event.waitUntil so the forward fetch never blocks the response', () => { + const script = generateWorkerScript(BASE_OPTS) + expect(script).toMatch(/event\.waitUntil/) + }) + + it('forwards with the documented headers (Authorization, Timestamp, Signature, Version)', () => { + const script = generateWorkerScript(BASE_OPTS) + expect(script).toContain('Authorization') + expect(script).toContain('Bearer') + expect(script).toContain('X-Canonry-Timestamp') + expect(script).toContain('X-Canonry-Signature') + expect(script).toContain('X-Canonry-Worker-Version') + }) + + it('signs with HMAC-SHA256 via Web Crypto SubtleCrypto', () => { + const script = generateWorkerScript(BASE_OPTS) + expect(script).toContain('HMAC') + expect(script).toContain('SHA-256') + expect(script).toMatch(/crypto\.subtle/) + }) + + it('uses POST as the forward method', () => { + const script = generateWorkerScript(BASE_OPTS) + expect(script).toMatch(/method\s*:\s*['"]POST['"]/) + }) + + it('parses as JavaScript (smoke test)', () => { + const script = generateWorkerScript(BASE_OPTS) + expect(() => new Function(script)).not.toThrow() + }) + + it('treats a custom botScoreMaxForward as the score threshold', () => { + const script = generateWorkerScript({ ...BASE_OPTS, botScoreMaxForward: 42 }) + expect(script).toContain('42') + }) +}) + +describe('DEFAULT_BOT_LIST', () => { + it('includes the canonical AI UA tokens', () => { + expect(DEFAULT_BOT_LIST.uaKeywords).toEqual( + expect.arrayContaining(['bot', 'crawler', 'gpt', 'claude', 'perplexity', 'openai', 'anthropic']), + ) + }) + + it('includes the canonical AI referer hosts', () => { + expect(DEFAULT_BOT_LIST.refererHostSuffixes).toEqual( + expect.arrayContaining(['.openai.com', '.anthropic.com', '.perplexity.ai']), + ) + }) + + it('has a non-empty, dated version string so the staleness check can compare', () => { + expect(DEFAULT_BOT_LIST.version).toMatch(/\d{4}-\d{2}-\d{2}/) + }) +}) + +describe('generateWranglerToml', () => { + it('emits a name and a main field for wrangler deploy', () => { + const toml = generateWranglerToml({ sourceId: 'src_abc123' }) + expect(toml).toMatch(/^name\s*=\s*"canonry-traffic-src_abc123"/m) + expect(toml).toMatch(/^main\s*=/m) + }) + + it('sets compatibility_date to a recent ISO date', () => { + const toml = generateWranglerToml({ sourceId: 'src_abc123' }) + expect(toml).toMatch(/compatibility_date\s*=\s*"\d{4}-\d{2}-\d{2}"/) + }) +}) diff --git a/packages/integration-cloudflare-worker/test/verify.test.ts b/packages/integration-cloudflare-worker/test/verify.test.ts new file mode 100644 index 00000000..7b591843 --- /dev/null +++ b/packages/integration-cloudflare-worker/test/verify.test.ts @@ -0,0 +1,163 @@ +import { createHmac } from 'node:crypto' +import { describe, expect, it } from 'vitest' +import { verifyRequestSignature } from '../src/verify.js' + +const SECRET = 'shared-hmac-secret' + +function sign(timestamp: number | string, body: string, secret = SECRET): string { + return createHmac('sha256', secret).update(`${timestamp}.${body}`).digest('hex') +} + +describe('verifyRequestSignature', () => { + const now = Math.floor(Date.parse('2026-05-27T15:30:00Z') / 1000) + const body = '{"events":[{"eventId":"r1"}]}' + + it('accepts a correct signature within the timestamp window', () => { + const ts = String(now) + const signature = sign(ts, body) + expect(verifyRequestSignature({ + timestamp: ts, + signature, + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: true }) + }) + + it('rejects a mutated body', () => { + const ts = String(now) + const signature = sign(ts, body) + expect(verifyRequestSignature({ + timestamp: ts, + signature, + body: body + 'x', + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'signature_mismatch' }) + }) + + it('rejects a mutated timestamp', () => { + const ts = String(now) + const signature = sign(ts, body) + expect(verifyRequestSignature({ + timestamp: String(now + 1), + signature, + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'signature_mismatch' }) + }) + + it('rejects the wrong secret', () => { + const ts = String(now) + const signature = sign(ts, body, 'wrong-secret') + expect(verifyRequestSignature({ + timestamp: ts, + signature, + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'signature_mismatch' }) + }) + + it('rejects an expired timestamp (older than maxAgeSeconds)', () => { + const ts = String(now - 301) + const signature = sign(ts, body) + expect(verifyRequestSignature({ + timestamp: ts, + signature, + body, + secret: SECRET, + nowSeconds: now, + maxAgeSeconds: 300, + })).toEqual({ ok: false, reason: 'timestamp_expired' }) + }) + + it('rejects a timestamp too far in the future', () => { + const ts = String(now + 301) + const signature = sign(ts, body) + expect(verifyRequestSignature({ + timestamp: ts, + signature, + body, + secret: SECRET, + nowSeconds: now, + maxAgeSeconds: 300, + })).toEqual({ ok: false, reason: 'timestamp_expired' }) + }) + + it('rejects a non-numeric timestamp', () => { + const signature = sign('not-a-number', body) + expect(verifyRequestSignature({ + timestamp: 'not-a-number', + signature, + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'timestamp_invalid' }) + }) + + it('rejects an empty timestamp', () => { + expect(verifyRequestSignature({ + timestamp: '', + signature: sign('', body), + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'timestamp_invalid' }) + }) + + it('rejects a malformed hex signature', () => { + expect(verifyRequestSignature({ + timestamp: String(now), + signature: 'not-hex!!', + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'signature_invalid' }) + }) + + it('rejects a signature of the wrong byte length', () => { + expect(verifyRequestSignature({ + timestamp: String(now), + signature: 'aabb', + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'signature_invalid' }) + }) + + it('verifies correctly when body is empty', () => { + const ts = String(now) + const signature = sign(ts, '') + expect(verifyRequestSignature({ + timestamp: ts, + signature, + body: '', + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: true }) + }) + + it('uses 300s default window when maxAgeSeconds is omitted', () => { + const ts = String(now - 299) + const signature = sign(ts, body) + expect(verifyRequestSignature({ + timestamp: ts, + signature, + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: true }) + + const expiredTs = String(now - 301) + const expiredSig = sign(expiredTs, body) + expect(verifyRequestSignature({ + timestamp: expiredTs, + signature: expiredSig, + body, + secret: SECRET, + nowSeconds: now, + })).toEqual({ ok: false, reason: 'timestamp_expired' }) + }) +}) diff --git a/packages/integration-cloudflare-worker/tsconfig.json b/packages/integration-cloudflare-worker/tsconfig.json new file mode 100644 index 00000000..65aaab0c --- /dev/null +++ b/packages/integration-cloudflare-worker/tsconfig.json @@ -0,0 +1,7 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "./src" + }, + "include": ["src/**/*.ts"] +} diff --git a/plans/cloudflare-worker-traffic-source.md b/plans/cloudflare-worker-traffic-source.md new file mode 100644 index 00000000..4b3ee34d --- /dev/null +++ b/plans/cloudflare-worker-traffic-source.md @@ -0,0 +1,519 @@ +# Cloudflare Worker Traffic Source Plan + +Status: design plan for implementation +Last updated: 2026-05-27 + +## Context + +Canonry already supports three server-side traffic sources — `cloud-run`, +`wordpress`, `vercel` — all pull-shaped: canonry calls an upstream HTTP API +on a cursor schedule, normalizes rows, classifies via +`packages/integration-traffic`, and rolls into hourly buckets. + +Cloudflare is the next adapter, and it's structurally different. Cloudflare +exposes three data surfaces: + +1. **GraphQL Analytics API** — aggregate-only, even on the AI Crawl Control + dataset. No raw request rows. +2. **Logpush** — raw rows, but Business plan or higher only. +3. **Workers** — universal raw-row access on any plan, including free, via + a customer-deployed Worker that forwards filtered requests to a canonry + ingestion endpoint. + +Free/Pro plans cover most prospective canonry customers, so building only +Logpush would leave the long tail out. The Worker path covers every plan +and also unlocks the long-considered "Cloudflare-as-proxy" story for hosts +that expose no logs at all (Shopify, Webflow, Wix, Ghost) — once a customer +puts Canonry's Worker on their zone, that zone is a fully ingestible +traffic source regardless of where the site is actually hosted. + +This plan covers the Worker adapter. A parallel `cloudflare-logpush` +adapter is reserved for later — same ingestion endpoint shape, different +delivery mechanism (Logpush HTTPS destination batches rows instead of the +Worker fetching them one at a time). + +See [`server-side-ai-traffic-ingestion.md`](./server-side-ai-traffic-ingestion.md) +for the overall traffic plan and +[`cloud-run-traffic-source-model-review.md`](./cloud-run-traffic-source-model-review.md) +for the raw-event vs aggregate-bucket rationale. + +## Goals + +1. Capture AI crawler hits + AI-referral hits on any Cloudflare-fronted site, + regardless of plan tier. +2. Reuse the existing classifier + rollup pipeline — the Worker only + forwards events, it does not classify. +3. Provide first-class IP verification: unlike Vercel (no client IP), + Cloudflare Workers expose `cf-connecting-ip` on every request, so + `claimed_unverified` → `verified` promotion via + `integration-traffic/ip-verify.ts` works out of the box. +4. Operator-deployable in Phase 1: canonry generates the Worker script, + operator pastes it into Cloudflare's dashboard or `wrangler deploy`s it. +5. Architecturally compatible with Phase 2 auto-deploy via Cloudflare API. + +## Non-Goals + +- **No Logpush implementation in this plan.** Same target endpoint, but the + delivery shape (batched gzipped JSONL POSTed by Cloudflare directly) + warrants its own adapter and is gated on Business+ plan availability. +- **No GraphQL Analytics API ingestion.** Aggregate-only; useful as an + aux signal on the dashboard, but not a primary ingestion path. +- **No Worker auto-deploy in Phase 1.** Operator-pasted/wrangler'd only. + Phase 2 adds a `--auto-deploy` flow that takes a Cloudflare API token + and provisions the Worker via Cloudflare's Workers API. +- **No Cloudflare-as-proxy zone provisioning.** That's a separate + feature — putting a non-Cloudflare-hosted site behind a canonry-created + Cloudflare zone — built on top of this adapter once it ships. +- **No edge-side classification.** Worker uses a broad, stable filter to + decide what to forward; the strict bot/referer list stays in + `packages/integration-traffic`. + +## Architecture + +``` ++-----------------------------+ +| Customer's Cloudflare zone | +| | +| Worker (canonry-issued) | +| - generic AI-signal filter | +| - event.waitUntil(fetch()) | ++--------------+--------------+ + | + | POST /api/v1/projects/:name/traffic/cloudflare/ingest + | Authorization: Bearer + | X-Canonry-Signature: + v ++--------------+--------------+ +| canonry serve / apps/api | +| | +| packages/api-routes/ | +| traffic.ts (new route) | +| | +| -> verify token + HMAC | +| -> normalize event | +| -> classify (existing) | +| -> upsert hourly rollups | ++-----------------------------+ +``` + +The ingestion path is the only structural divergence from existing +adapters: it is **push-receive** rather than pull. This is operationally +fine because canonry is single-tenant per deployment — the operator runs +their own canonry instance, the operator owns the Cloudflare zone, and +the Worker only ever talks to the operator's own canonry URL. No canonry +SaaS relay is introduced. The Vercel adapter explicitly deferred this +push shape; the Cloudflare Worker is where we accept it, scoped narrowly. + +## Worker Script Design + +### Filter (edge-side, broad, stable) + +The Worker forwards a request iff **any** of: + +- UA matches the generic keyword set: + `bot | crawler | spider | agent | gpt | claude | ai | perplexity | chatgpt | openai | anthropic` + (case-insensitive substring match) +- Referer host matches the generic suffix list: + `.openai.com | .anthropic.com | .perplexity.ai | chat. | search.` or the + host contains `gpt | claude | ai | chat` as a token +- Cloudflare bot signals: `request.cf.botManagement?.verifiedBot === true` + OR `request.cf.botManagement?.score < 30` (best-effort — only populated + on plans with Bot Management; safe to be absent) + +The filter is intentionally broader than canonry's classifier so the +Worker doesn't need a redeploy every time a new bot UA is identified — +the strict classifier on the server side does the real work. + +### Payload + +One request, one `fetch()`. Body is a single-element array (the array +shape lets the same endpoint accept Logpush batches later without an +endpoint version bump). + +```json +{ + "schemaVersion": 1, + "workerVersion": "1.0.0", + "events": [ + { + "eventId": "8a3d2b0c-cf-ray", + "observedAt": "2026-05-27T15:30:00.123Z", + "method": "GET", + "host": "example.com", + "path": "/blog/post", + "queryString": "utm_source=chatgpt", + "status": 200, + "userAgent": "GPTBot/1.2", + "remoteIp": "20.171.207.34", + "referer": "https://chat.openai.com/", + "cf": { + "verifiedBot": true, + "botScore": 30, + "country": "US", + "asn": 8075, + "asOrganization": "Microsoft Corporation" + } + } + ] +} +``` + +`eventId` is Cloudflare's `cf-ray` header — globally unique per request, +durable across retries. + +### Delivery semantics + +```typescript +addEventListener('fetch', (event) => { + event.respondWith(handle(event.request)) + if (shouldForward(event.request)) { + event.waitUntil(forward(event.request)) + } +}) +``` + +`waitUntil` keeps the forward `fetch()` alive after the response has been +sent, so it never adds to user-perceived latency. If the canonry endpoint +is unreachable, the event is dropped — no retry, no buffering. That's +acceptable: the AI traffic signal is statistical, not transactional, and +adding a Durable Object for retry queueing isn't worth the complexity in +Phase 1. + +### Auth + +Every forward carries: + +- `Authorization: Bearer ` — per-source token, generated at + `canonry traffic connect cloudflare` time, embedded in the Worker script + at generation, stored hashed in `traffic_sources.ingestTokenHash`. +- `X-Canonry-Timestamp: ` — request timestamp. +- `X-Canonry-Signature: hex(hmac_sha256(secret, timestamp + "." + body))` + — HMAC over `timestamp + "." + body`, using a per-source HMAC secret + stored in `~/.canonry/config.yaml` under `cloudflareTraffic.connections`. +- `X-Canonry-Worker-Version: ` — for the staleness doctor check. + +Receiver verifies in this order: timestamp within ±5 minutes → token hash +match → HMAC match. Any failure responds 401 with a structured error +envelope; no detail about which check failed (avoid token-vs-secret +disambiguation). + +### Versioning + +The Worker script carries a `WORKER_VERSION` constant. On each forwarded +event, the receiver records the version against the source. A new doctor +check flags drift: + +- `cloudflare.worker.version-stale`: + - `warn` when deployed version is > 2 releases behind canonry's current + - `fail` when > 5 releases behind + +The bot keyword set is generic enough that updates are quarterly-ish, not +per-bot. When the constants do change, canonry tags a new Worker release +and the operator regenerates+redeploys. + +## Server Side + +### New adapter package: `packages/integration-cloudflare-worker/` + +Following the file layout of `packages/integration-vercel/`: + +| File | Role | +|------|------| +| `src/script.ts` | `generateWorkerScript(opts)` — produces the JS string with embedded source-id, bearer, HMAC secret, version, bot-keyword constants | +| `src/normalize.ts` | `normalizeCloudflareWorkerEvent(payload)` — converts one inbound event to `NormalizedTrafficRequest` | +| `src/verify.ts` | `verifyRequestSignature(headers, body, secret)` — timestamp + HMAC check | +| `src/types.ts` | Inbound payload schema (Zod) + worker version manifest | +| `src/index.ts` | Re-exports | + +No client/drain module — there's nothing for canonry to pull. The Worker +pushes; the API route handles ingestion. + +### New API routes (in `packages/api-routes/src/traffic.ts`) + +- `POST /api/v1/projects/:name/traffic/connect/cloudflare` + - Body: `{ displayName?, zoneId?, accountId? }` + - Generates per-source bearer token + HMAC secret + - Stores hashed token + zone metadata in `traffic_sources` + - Stores cleartext bearer + HMAC secret in `~/.canonry/config.yaml` + - Returns the generated Worker script as a string field on the response + (with embedded constants) and a `wranglerToml` field so the operator + can `wrangler deploy` directly + - Idempotent on `(project, sourceType, zoneId?)` — rerunning rotates + the secrets and emits a new script + +- `POST /api/v1/projects/:name/traffic/cloudflare/ingest` + - The Worker target + - Bearer + HMAC auth (separate from canonry's `cnry_...` API keys) + - Body: see "Payload" above + - Verifies → normalizes → classifies (existing + `integration-traffic`) → upserts rollups via the existing hourly-rollup + pipeline shared with the pull adapters + - Returns `{ acceptedEvents, droppedEvents, workerVersionAck }` + - Rate-limited per source (e.g., 100 req/sec, 5000 req/min) to defend + against a runaway Worker; over-limit returns 429 with `Retry-After` + +- `POST /api/v1/projects/:name/traffic/cloudflare/rotate/:sourceId` + - Generates new bearer + HMAC secret + - Emits a new Worker script + - Old secrets stay valid for 5 minutes so the operator has time to + redeploy without a gap + +### Schema changes (`packages/db/src/schema.ts` + `migrate.ts`) + +`traffic_sources` already exists. Add two nullable columns: + +```typescript +ingestTokenHash: text('ingest_token_hash'), // sha256 of bearer +lastWorkerVersion: text('last_worker_version'), // most recent forwarded value +``` + +`configJson` for a `cloudflare` source: + +```typescript +{ + schemaVersion: 1, + workerVersion: '1.0.0', // version embedded at last generate + expectedBotListVersion: '2026-05-27', + zoneId: string | null, + accountId: string | null, +} +``` + +Credentials in `~/.canonry/config.yaml`: + +```yaml +cloudflareTraffic: + connections: + : + bearerToken: + hmacSecret: +``` + +Both stored only here; the DB has only the hashed bearer for verification +and never the HMAC secret in cleartext. + +### Migration version + +Append to `MIGRATION_VERSIONS` in `packages/db/src/migrate.ts`: + +```typescript +{ + version: , + name: 'cloudflare-worker-traffic-source', + statements: [ + `ALTER TABLE traffic_sources ADD COLUMN ingest_token_hash TEXT`, + `ALTER TABLE traffic_sources ADD COLUMN last_worker_version TEXT`, + ], +}, +``` + +## Contracts (`packages/contracts/src/traffic.ts`) + +Already has `'cloudflare'` in `trafficSourceTypeSchema` — no enum change. +Add: + +```typescript +export const cloudflareWorkerSourceConfigSchema = z.object({ + schemaVersion: z.literal(1), + workerVersion: z.string(), + expectedBotListVersion: z.string(), + zoneId: z.string().nullable(), + accountId: z.string().nullable(), +}) + +export const trafficConnectCloudflareRequestSchema = z.object({ + displayName: z.string().min(1).optional(), + zoneId: z.string().optional(), + accountId: z.string().optional(), +}) + +export const trafficConnectCloudflareResponseSchema = z.object({ + sourceId: z.string(), + workerScript: z.string(), + wranglerToml: z.string(), + workerVersion: z.string(), + instructions: z.string(), +}) + +export const cloudflareWorkerEventSchema = z.object({ + eventId: z.string().min(1), + observedAt: z.string().min(1), + method: z.string().nullable(), + host: z.string().nullable(), + path: z.string().min(1), + queryString: z.string().nullable(), + status: z.number().int().nullable(), + userAgent: z.string().nullable(), + remoteIp: z.string().nullable(), + referer: z.string().nullable(), + cf: z.object({ + verifiedBot: z.boolean().nullable(), + botScore: z.number().int().nullable(), + country: z.string().nullable(), + asn: z.number().int().nullable(), + asOrganization: z.string().nullable(), + }).nullable(), +}) + +export const cloudflareWorkerIngestRequestSchema = z.object({ + schemaVersion: z.literal(1), + workerVersion: z.string().min(1), + events: z.array(cloudflareWorkerEventSchema).min(1).max(100), +}) +``` + +## Doctor checks + +`packages/api-routes/src/doctor/checks/cloudflare-worker.ts` registers: + +- `cloudflare.worker.last-seen`: + - `ok` if a forwarded event arrived in last 24h + - `warn` if last in 24h–7d + - `fail` if no event ever / last > 7d +- `cloudflare.worker.version-stale`: + - `ok` if deployed version matches current + - `warn` if 1–2 releases behind + - `fail` if > 2 releases behind OR deployed version unknown +- `cloudflare.worker.signature-failures`: + - `ok` if no signature-failure events in last 24h + - `warn` if > 0 — likely a stale token after rotation + - Read from a new lightweight counter; sized to drop after 7 days + +The generic `traffic.source.recent-data` and `traffic.source.connected` +checks already cover the Cloudflare source type with no per-source code. + +The new check IDs go into the doctor table in `AGENTS.md`. + +## CLI surface + +```bash +canonry traffic connect cloudflare [--display-name ] \ + [--zone-id ] [--account-id ] [--out ] +# Generates a new source. Writes worker.js + wrangler.toml to --out (default: +# ./canonry-cloudflare-worker/) and prints next-step instructions. Secrets +# are inlined into worker.js and saved to ~/.canonry/config.yaml. + +canonry traffic rotate cloudflare --source [--out ] +# Rotates the bearer + HMAC secret. Old secrets remain valid 5 minutes. + +canonry traffic verify cloudflare --source +# Polls until at least one event has been forwarded, or 60s timeout. Useful +# for the post-deploy "did it work?" loop. +``` + +Each command supports `--format json` per the agent-first principle. + +## MCP / Aero + +The MCP tool registry picks up `traffic.connect.cloudflare`, +`traffic.rotate.cloudflare`, and `traffic.verify.cloudflare` automatically +once the API routes exist. Tag them under the `setup` toolkit (same as the +other `traffic.connect.*` tools). Aero inherits them through the +MCP-to-agent adapter; no separate Aero registration unless we want to +exclude the rotate tool from autonomous use. + +## Dashboard + +The Cloudflare source surfaces in the existing AI Traffic section with no +special-case UI — it shows up in the source list, the connect modal gets +a new "Cloudflare Worker" option that calls the connect endpoint and +renders the generated script + wrangler.toml for the operator to copy, +and the version-stale doctor check renders as a banner on the source's +detail card. + +## Testing strategy + +### Unit (`packages/integration-cloudflare-worker/test/`) + +- `script.test.ts` — generated script compiles (`new Function(...)` smoke + test); embedded constants land at expected placeholders; bot keyword set + matches manifest snapshot. +- `normalize.test.ts` — happy path, every nullable field individually + missing, malformed timestamp, malformed eventId. +- `verify.test.ts` — valid signature passes, mutated body fails, expired + timestamp fails, wrong secret fails, replay (same timestamp) fails on + second submission. + +### API (`packages/api-routes/test/`) + +- `traffic-cloudflare-connect.test.ts` — connect endpoint creates source, + writes secrets to config, returns script, idempotent on rerun, rotation + invalidates old token after grace. +- `traffic-cloudflare-ingest.test.ts` — auth happy path, signature replay + rejected, rate-limit triggers 429, events land in rollups identical to + what the equivalent pull adapter would produce, version is recorded. +- `probe-exclusion.test.ts` — add a case confirming probe runs don't + contaminate Cloudflare-source rollups (the source has no run-trigger + concept — but the rollup table is shared, so still worth asserting). +- `doctor-cloudflare-worker.test.ts` — every status × every code path. + +### Classifier (`packages/integration-traffic/test/`) + +- Existing tests already cover UA + referer rules. Add a case that uses a + Cloudflare-shaped `NormalizedTrafficRequest` (with `remoteIp` populated) + to confirm IP-range verification promotes to `verified` for the major + AI bots — this is the first adapter where IP verification actually + fires, so it's worth one explicit assertion. + +## Rollout + +1. Contracts + schema + migration. +2. `integration-cloudflare-worker/` package (script generator, normalizer, + verifier). +3. API routes: connect, ingest, rotate. Reuse existing classifier + rollup. +4. CLI commands. +5. Doctor checks. +6. MCP toolkit registration (automatic via tool-registry). +7. Dashboard connect-modal entry. +8. Docs: `packages/integration-cloudflare-worker/AGENTS.md`, update root + `AGENTS.md` doctor table, update `docs/data-model.md` if new columns + are visible to the schema diagram. + +## Phase 2 (deferred) + +- **Auto-deploy** — operator hands canonry a Cloudflare API token; canonry + uses the Workers API to create/update the Worker on the operator's + behalf. Reuses the same script generator. Adds a new credential to + `~/.canonry/config.yaml` and a `cloudflare.api.token` doctor check. +- **Cloudflare-as-proxy** — for hosts with no native logs, canonry walks + the operator through pointing nameservers at Cloudflare, creates the + zone, installs the Worker. This is where the auto-deploy story actually + pays off. +- **Logpush sibling adapter** — Business+ plan customers point Logpush at + the same ingest endpoint. Endpoint shape already accepts arrays, so this + is mostly a parser change + a separate connect flow + a separate doctor + category. + +## Open questions + +- **Drop-on-failure vs in-Worker buffering.** Phase 1 drops events the + canonry endpoint refuses (network error, 5xx). Acceptable for AI + traffic since the signal is statistical, but worth revisiting if early + customers want stronger guarantees. Durable Objects + a small queue + would add hard delivery semantics at a cost. +- **Rate-limit shape.** Static per-source limits (100/s, 5k/min) are a + reasonable starting point. A busy site under heavy AI-crawler load + might exceed it; sample-down at the Worker (1-in-N) might be worth + exposing as a config value if it becomes a problem. +- **Worker keyword list distribution.** Inlined constants vs fetched-from- + KV. Inlined is simpler (no KV reads on every request, no Cloudflare API + dependency); the cost is a redeploy when keywords change. Sticking with + inlined for Phase 1. +- **Signature-failure counter storage.** Want a lightweight per-source + counter without growing the schema much. Options: a new + `traffic_source_metrics` table with a small fixed set of counters, or + reuse `usage_counters`. Lean toward `usage_counters` if the scope/period + shape fits. + +## See Also + +- `plans/server-side-ai-traffic-ingestion.md` — overall traffic plan +- `plans/cloud-run-traffic-source-model-review.md` — raw-event model + rationale +- `packages/integration-vercel/AGENTS.md` — closest sibling adapter (pull, + not push, but same normalizer shape) +- `packages/integration-traffic/AGENTS.md` — classifier + rollup that the + ingest endpoint hands off to +- Cloudflare Workers Bot Management docs — `cf.botManagement` property + availability per plan tier (sanity-check before relying on `verifiedBot` + as a verification signal) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8a80f51c..fdb12d3d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -397,6 +397,12 @@ importers: specifier: workspace:* version: link:../contracts + packages/integration-cloudflare-worker: + dependencies: + '@ainyc/canonry-contracts': + specifier: workspace:* + version: link:../contracts + packages/integration-commoncrawl: dependencies: '@ainyc/canonry-contracts': diff --git a/vitest.config.ts b/vitest.config.ts index 073ff271..0a402cd6 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -33,6 +33,7 @@ const NODE_PACKAGES = [ 'db', 'integration-bing', 'integration-cloud-run', + 'integration-cloudflare-worker', 'integration-commoncrawl', 'integration-google', 'integration-google-analytics',