diff --git a/.gitignore b/.gitignore
index 538660ab..00a0fee3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ node_modules/
 .DS_Store
 coverage/
 dist/
+.tmp/
 # Web SPA build output (built by build-web.ts)
 packages/canonry/assets/web/
 # Build-time copies of skills from repo root (copied by copy-agent-assets.ts)
diff --git a/package.json b/package.json
index 110fcc8e..99cb0c5a 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "canonry",
   "private": true,
-  "version": "4.10.1",
+  "version": "4.11.0",
   "type": "module",
   "packageManager": "pnpm@10.28.2",
   "scripts": {
diff --git a/packages/canonry/package.json b/packages/canonry/package.json
index 672266b7..739cd9cf 100644
--- a/packages/canonry/package.json
+++ b/packages/canonry/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@ainyc/canonry",
-  "version": "4.10.1",
+  "version": "4.11.0",
   "type": "module",
   "description": "Agent-first open-source AEO operating platform - track how answer engines cite your domain",
   "license": "FSL-1.1-ALv2",
diff --git a/packages/contracts/src/index.ts b/packages/contracts/src/index.ts
index 076646af..7f2360d6 100644
--- a/packages/contracts/src/index.ts
+++ b/packages/contracts/src/index.ts
@@ -26,3 +26,4 @@ export * from './citations.js'
 export * from './report.js'
 export * from './report-dedup.js'
 export * from './skills.js'
+export * from './traffic.js'
diff --git a/packages/contracts/src/traffic.ts b/packages/contracts/src/traffic.ts
new file mode 100644
index 00000000..210abc45
--- /dev/null
+++ b/packages/contracts/src/traffic.ts
@@ -0,0 +1,70 @@
+import { z } from 'zod'
+
+export const trafficSourceTypeSchema = z.enum([
+  'cloud-run',
+  'wordpress',
+  'cloudflare',
+  'vercel',
+  'generic-log',
+])
+export type TrafficSourceType = z.infer<typeof trafficSourceTypeSchema>
+export const TrafficSourceTypes = trafficSourceTypeSchema.enum
+
+export const trafficAdapterCapabilitySchema = z.enum([
+  'raw-request-events',
+  'aggregate-request-metrics',
+  'request-url',
+  'status-code',
+  'user-agent',
+  'remote-ip',
+  'referer',
+  'cursor-pull',
+])
+export type TrafficAdapterCapability = z.infer<typeof trafficAdapterCapabilitySchema>
+export const TrafficAdapterCapabilities = trafficAdapterCapabilitySchema.enum
+
+export const trafficEvidenceKindSchema = z.enum(['raw-request', 'aggregate-bucket'])
+export type TrafficEvidenceKind = z.infer<typeof trafficEvidenceKindSchema>
+export const TrafficEvidenceKinds = trafficEvidenceKindSchema.enum
+
+export const trafficEventConfidenceSchema = z.enum(['observed', 'provider-aggregated', 'inferred'])
+export type TrafficEventConfidence = z.infer<typeof trafficEventConfidenceSchema>
+export const TrafficEventConfidences = trafficEventConfidenceSchema.enum
+
+export const trafficProviderResourceSchema = z.object({
+  type: z.string().nullable(),
+  labels: z.record(z.string(), z.string()),
+})
+export type TrafficProviderResource = z.infer<typeof trafficProviderResourceSchema>
+
+export const normalizedTrafficRequestSchema = z.object({
+  sourceType: trafficSourceTypeSchema,
+  evidenceKind: z.literal(TrafficEvidenceKinds['raw-request']),
+  confidence: z.literal(TrafficEventConfidences.observed),
+  eventId: z.string().min(1),
+  observedAt: z.string().min(1),
+  method: z.string().nullable(),
+  requestUrl: z.string().nullable(),
+  host: z.string().nullable(),
+  path: z.string().min(1),
+  queryString: z.string().nullable(),
+  status: z.number().int().nullable(),
+  userAgent: z.string().nullable(),
+  remoteIp: z.string().nullable(),
+  referer: z.string().nullable(),
+  latencyMs: z.number().nullable(),
+  requestSizeBytes: z.number().int().nullable(),
+  responseSizeBytes: z.number().int().nullable(),
+  providerResource: trafficProviderResourceSchema,
+  providerLabels: z.record(z.string(), z.string()),
+})
+export type NormalizedTrafficRequest = z.infer<typeof normalizedTrafficRequestSchema>
+
+export const normalizedTrafficPullPageSchema = z.object({
+  events: z.array(normalizedTrafficRequestSchema),
+  rawEntryCount: z.number().int().nonnegative(),
+  skippedEntryCount: z.number().int().nonnegative(),
+  nextPageToken: z.string().optional(),
+  filter: z.string(),
+})
+export type NormalizedTrafficPullPage = z.infer<typeof normalizedTrafficPullPageSchema>
diff --git a/packages/contracts/test/traffic.test.ts b/packages/contracts/test/traffic.test.ts
new file mode 100644
index 00000000..b37b3cc9
--- /dev/null
+++ b/packages/contracts/test/traffic.test.ts
@@ -0,0 +1,45 @@
+import { describe, expect, it } from 'vitest'
+import {
+  TrafficEventConfidences,
+  TrafficEvidenceKinds,
+  TrafficSourceTypes,
+  normalizedTrafficRequestSchema,
+} from '../src/traffic.js'
+
+describe('traffic contracts', () => {
+  it('accepts a raw request event from any server-side adapter', () => {
+    const parsed = normalizedTrafficRequestSchema.parse({
+      sourceType: TrafficSourceTypes['cloud-run'],
+      evidenceKind: TrafficEvidenceKinds['raw-request'],
+      confidence: TrafficEventConfidences.observed,
+      eventId: 'cloud-run:2026-04-30T12:00:00.000Z:abc123',
+      observedAt: '2026-04-30T12:00:00.000Z',
+      method: 'GET',
+      requestUrl: 'https://example.com/blog/post?utm_source=chatgpt.com',
+      host: 'example.com',
+      path: '/blog/post',
+      queryString: 'utm_source=chatgpt.com',
+      status: 200,
+      userAgent: 'GPTBot/1.2',
+      remoteIp: '203.0.113.10',
+      referer: 'https://chatgpt.com/',
+      latencyMs: 123.4,
+      requestSizeBytes: 456,
+      responseSizeBytes: 789,
+      providerResource: {
+        type: 'cloud_run_revision',
+        labels: {
+          project_id: 'sample-project',
+          service_name: 'web',
+          location: 'us-central1',
+        },
+      },
+      providerLabels: {},
+    })
+
+    expect(parsed.sourceType).toBe(TrafficSourceTypes['cloud-run'])
+    expect(parsed.evidenceKind).toBe(TrafficEvidenceKinds['raw-request'])
+    expect(parsed.confidence).toBe(TrafficEventConfidences.observed)
+    expect(parsed.path).toBe('/blog/post')
+  })
+})
diff --git a/packages/integration-cloud-run/AGENTS.md b/packages/integration-cloud-run/AGENTS.md
new file mode 100644
index 00000000..0d75c090
--- /dev/null
+++ b/packages/integration-cloud-run/AGENTS.md
@@ -0,0 +1,36 @@
+# integration-cloud-run
+
+## Purpose
+
+Cloud Run / Cloud Logging integration — pulls request logs for `cloud_run_revision` resources via the Cloud Logging `entries.list` API and normalizes them into provider-neutral `NormalizedTrafficRequest` events for the traffic ingestion pipeline.
+
+## Key Files
+
+| File | Role |
+|------|------|
+| `src/client.ts` | `listCloudRunTrafficEvents` — paginated `entries.list` pull, page-token cursoring, `CloudRunLoggingApiError` |
+| `src/filter.ts` | `buildCloudRunLogFilter` — composes the Cloud Logging query string from service/location/timestamp/url/UA narrowing options |
+| `src/normalize.ts` | `normalizeCloudRunLogEntry` — converts a Cloud Logging `LogEntry.httpRequest` into a `NormalizedTrafficRequest` |
+| `src/types.ts` | Adapter option/response shapes (`ListCloudRunTrafficEventsOptions`, `CloudRunTrafficEventsPage`, raw `LogEntry` types) |
+| `src/index.ts` | Re-exports public API |
+
+## Patterns
+
+- **Bearer-token auth.** The caller supplies an OAuth access token (`logging.logEntries.list`-class scope). This package does not own the token — credentials live in `~/.canonry/config.yaml` and are exchanged by the consumer (CLI/script/server).
+- **Pull-only, cursor-paginated.** `listCloudRunTrafficEvents` accepts `pageToken` / `pageSize` / `maxPages` so callers can do incremental syncs. No push, no SaaS relay.
+- **Provider-neutral output.** Every adapter in the traffic stack normalizes to the same `NormalizedTrafficRequest` shape from `@ainyc/canonry-contracts`. Do not leak Cloud Logging types past the package boundary.
+- **Narrow filters when possible.** `buildCloudRunLogFilter` composes filters incrementally (service, location, time window, request URL substring, user-agent substrings). Narrower filters lower Cloud Logging cost; the `--narrow-bots` mode in the probe script intentionally trades human-AI-referral coverage for crawler-only coverage.
+
+## Common Mistakes
+
+- **Calling `entries.list` without `resourceNames`.** Cloud Logging requires it; the client always passes `projects/<projectId>`.
+- **Storing access tokens in this package.** Tokens are short-lived and supplied per call.
+- **Using this client for non-`cloud_run_revision` resources.** The filter and normalizer are scoped to Cloud Run request logs. Other resource types need a separate adapter.
+
+## See Also
+
+- `packages/contracts/src/traffic.ts` — the `NormalizedTrafficRequest` contract this package emits
+- `packages/integration-traffic/` — provider-neutral classifier + rollup over normalized events
+- `plans/cloud-run-traffic-source-model-review.md` — design rationale for the raw-event vs aggregate-bucket split
+- `plans/server-side-ai-traffic-ingestion.md` — overall traffic ingestion plan
+- `scripts/test-cloud-run-traffic-pull.ts` — local probe that exercises pull → normalize → analyze
diff --git a/packages/integration-cloud-run/CLAUDE.md b/packages/integration-cloud-run/CLAUDE.md
new file mode 100644
index 00000000..43c994c2
--- /dev/null
+++ b/packages/integration-cloud-run/CLAUDE.md
@@ -0,0 +1 @@
+@AGENTS.md
diff --git a/packages/integration-cloud-run/package.json b/packages/integration-cloud-run/package.json
new file mode 100644
index 00000000..821a9c46
--- /dev/null
+++ b/packages/integration-cloud-run/package.json
@@ -0,0 +1,22 @@
+{
+  "name": "@ainyc/canonry-integration-cloud-run",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "license": "FSL-1.1-ALv2",
+  "exports": {
+    ".": {
+      "types": "./src/index.ts",
+      "default": "./src/index.ts"
+    }
+  },
+  "types": "./src/index.ts",
+  "scripts": {
+    "typecheck": "tsc --noEmit -p tsconfig.json",
+    "test": "vitest run",
+    "lint": "eslint src/ test/"
+  },
+  "dependencies": {
+    "@ainyc/canonry-contracts": "workspace:*"
+  }
+}
diff --git a/packages/integration-cloud-run/src/client.ts b/packages/integration-cloud-run/src/client.ts
new file mode 100644
index 00000000..1a7d2c45
--- /dev/null
+++ b/packages/integration-cloud-run/src/client.ts
@@ -0,0 +1,130 @@
+import { buildCloudRunLogFilter } from './filter.js'
+import { normalizeCloudRunLogEntry } from './normalize.js'
+import type {
+  CloudRunListLogEntriesResponse,
+  CloudRunTrafficEventsPage,
+  ListCloudRunTrafficEventsOptions,
+} from './types.js'
+
+const CLOUD_LOGGING_ENTRIES_LIST_URL = 'https://logging.googleapis.com/v2/entries:list'
+const DEFAULT_PAGE_SIZE = 1000
+const DEFAULT_MAX_PAGES = 1
+const DEFAULT_TIMEOUT_MS = 30_000
+
+export class CloudRunLoggingApiError extends Error {
+  constructor(
+    message: string,
+    public readonly status: number,
+    public readonly body?: string,
+  ) {
+    super(message)
+    this.name = 'CloudRunLoggingApiError'
+  }
+}
+
+function validateAccessToken(accessToken: string): void {
+  if (!accessToken.trim()) {
+    throw new CloudRunLoggingApiError('Cloud Logging access token is required', 400)
+  }
+}
+
+function validateProjectId(gcpProjectId: string): void {
+  if (!gcpProjectId.trim()) {
+    throw new CloudRunLoggingApiError('GCP project ID is required', 400)
+  }
+}
+
+function normalizePageSize(pageSize: number | undefined): number {
+  if (pageSize === undefined) return DEFAULT_PAGE_SIZE
+  if (!Number.isInteger(pageSize) || pageSize < 1) {
+    throw new CloudRunLoggingApiError('pageSize must be a positive integer', 400)
+  }
+  return pageSize
+}
+
+function normalizeMaxPages(maxPages: number | undefined): number {
+  if (maxPages === undefined) return DEFAULT_MAX_PAGES
+  if (!Number.isInteger(maxPages) || maxPages < 1) {
+    throw new CloudRunLoggingApiError('maxPages must be a positive integer', 400)
+  }
+  return maxPages
+}
+
+async function readErrorBody(response: Response): Promise<string | undefined> {
+  const text = await response.text().catch(() => '')
+  if (!text) return undefined
+  return text.length <= 500 ? text : `${text.slice(0, 500)}... [truncated]`
+}
+
+export async function listCloudRunTrafficEvents(
+  accessToken: string,
+  options: ListCloudRunTrafficEventsOptions,
+): Promise<CloudRunTrafficEventsPage> {
+  validateAccessToken(accessToken)
+  validateProjectId(options.gcpProjectId)
+
+  const filter = buildCloudRunLogFilter(options)
+  const pageSize = normalizePageSize(options.pageSize)
+  const maxPages = normalizeMaxPages(options.maxPages)
+  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS
+
+  let pageToken = options.pageToken
+  let rawEntryCount = 0
+  let skippedEntryCount = 0
+  const events: CloudRunTrafficEventsPage['events'] = []
+
+  for (let page = 0; page < maxPages; page += 1) {
+    const requestBody: Record<string, unknown> = {
+      resourceNames: [`projects/${options.gcpProjectId}`],
+      filter,
+      orderBy: options.orderBy ?? 'timestamp asc',
+      pageSize,
+    }
+    if (pageToken) {
+      requestBody.pageToken = pageToken
+    }
+
+    const response = await fetch(CLOUD_LOGGING_ENTRIES_LIST_URL, {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${accessToken}`,
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify(requestBody),
+      signal: AbortSignal.timeout(timeoutMs),
+    })
+
+    if (!response.ok) {
+      const body = await readErrorBody(response)
+      throw new CloudRunLoggingApiError(
+        `Cloud Logging entries.list failed with HTTP ${response.status}`,
+        response.status,
+        body,
+      )
+    }
+
+    const body = (await response.json()) as CloudRunListLogEntriesResponse
+    const entries = body.entries ?? []
+    rawEntryCount += entries.length
+
+    for (const entry of entries) {
+      const event = normalizeCloudRunLogEntry(entry)
+      if (event) {
+        events.push(event)
+      } else {
+        skippedEntryCount += 1
+      }
+    }
+
+    pageToken = body.nextPageToken
+    if (!pageToken) break
+  }
+
+  return {
+    events,
+    rawEntryCount,
+    skippedEntryCount,
+    nextPageToken: pageToken,
+    filter,
+  }
+}
diff --git a/packages/integration-cloud-run/src/filter.ts b/packages/integration-cloud-run/src/filter.ts
new file mode 100644
index 00000000..cdbd4ec7
--- /dev/null
+++ b/packages/integration-cloud-run/src/filter.ts
@@ -0,0 +1,65 @@
+import type { CloudRunLogFilterOptions } from './types.js'
+
+function assertNonEmpty(name: string, value: string): void {
+  if (!value.trim()) {
+    throw new Error(`${name} must be a non-empty string`)
+  }
+}
+
+function quoteLogFilterValue(value: string): string {
+  return JSON.stringify(value)
+}
+
+function normalizeTimestamp(value: string | Date): string {
+  const date = value instanceof Date ? value : new Date(value)
+  if (Number.isNaN(date.getTime())) {
+    throw new Error(`Invalid timestamp: ${String(value)}`)
+  }
+  return date.toISOString()
+}
+
+export function buildCloudRunLogFilter(options: CloudRunLogFilterOptions = {}): string {
+  const clauses = ['resource.type="cloud_run_revision"']
+
+  if (options.serviceName !== undefined) {
+    assertNonEmpty('serviceName', options.serviceName)
+    clauses.push(`resource.labels.service_name=${quoteLogFilterValue(options.serviceName)}`)
+  }
+
+  if (options.location !== undefined) {
+    assertNonEmpty('location', options.location)
+    clauses.push(`resource.labels.location=${quoteLogFilterValue(options.location)}`)
+  }
+
+  if (options.startTime !== undefined) {
+    clauses.push(`timestamp >= ${quoteLogFilterValue(normalizeTimestamp(options.startTime))}`)
+  }
+
+  if (options.endTime !== undefined) {
+    clauses.push(`timestamp < ${quoteLogFilterValue(normalizeTimestamp(options.endTime))}`)
+  }
+
+  const userAgentSubstrings = (options.userAgentSubstrings ?? [])
+    .map((pattern) => pattern.trim())
+    .filter(Boolean)
+
+  if (userAgentSubstrings.length > 0) {
+    const uaClauses = userAgentSubstrings.map((pattern) => (
+      `httpRequest.userAgent:${quoteLogFilterValue(pattern)}`
+    ))
+    clauses.push(`(${uaClauses.join(' OR ')})`)
+  }
+
+  const requestUrlSubstrings = (options.requestUrlSubstrings ?? [])
+    .map((pattern) => pattern.trim())
+    .filter(Boolean)
+
+  if (requestUrlSubstrings.length > 0) {
+    const urlClauses = requestUrlSubstrings.map((pattern) => (
+      `httpRequest.requestUrl:${quoteLogFilterValue(pattern)}`
+    ))
+    clauses.push(`(${urlClauses.join(' OR ')})`)
+  }
+
+  return clauses.join(' AND ')
+}
diff --git a/packages/integration-cloud-run/src/index.ts b/packages/integration-cloud-run/src/index.ts
new file mode 100644
index 00000000..9d17d252
--- /dev/null
+++ b/packages/integration-cloud-run/src/index.ts
@@ -0,0 +1,4 @@
+export * from './client.js'
+export * from './filter.js'
+export * from './normalize.js'
+export type * from './types.js'
diff --git a/packages/integration-cloud-run/src/normalize.ts b/packages/integration-cloud-run/src/normalize.ts
new file mode 100644
index 00000000..e2dda508
--- /dev/null
+++ b/packages/integration-cloud-run/src/normalize.ts
@@ -0,0 +1,89 @@
+import {
+  TrafficEventConfidences,
+  TrafficEvidenceKinds,
+  TrafficSourceTypes,
+  type NormalizedTrafficRequest,
+} from '@ainyc/canonry-contracts'
+import type { CloudRunLogEntry } from './types.js'
+
+function numberOrNull(value: string | number | undefined): number | null {
+  if (value === undefined || value === null) return null
+  const parsed = typeof value === 'number' ? value : Number(value)
+  return Number.isFinite(parsed) ? parsed : null
+}
+
+function latencyToMs(value: string | undefined): number | null {
+  if (!value) return null
+  const secondsMatch = /^([0-9]+(?:\.[0-9]+)?)s$/.exec(value.trim())
+  if (!secondsMatch) return null
+  const seconds = Number(secondsMatch[1])
+  return Number.isFinite(seconds) ? Math.round(seconds * 1_000_000) / 1000 : null
+}
+
+function normalizeLabels(labels: Record<string, string> | undefined): Record<string, string> {
+  if (!labels) return {}
+  return Object.fromEntries(
+    Object.entries(labels)
+      .filter((entry): entry is [string, string] => (
+        typeof entry[0] === 'string' && typeof entry[1] === 'string'
+      )),
+  )
+}
+
+function parseRequestUrl(requestUrl: string): { host: string | null; path: string; queryString: string | null } | null {
+  try {
+    const url = requestUrl.startsWith('/') ? new URL(requestUrl, 'https://canonry.local') : new URL(requestUrl)
+    return {
+      host: url.hostname === 'canonry.local' ? null : url.hostname,
+      path: url.pathname || '/',
+      queryString: url.search ? url.search.slice(1) : null,
+    }
+  } catch {
+    return null
+  }
+}
+
+function buildEventId(entry: CloudRunLogEntry, observedAt: string, requestUrl: string): string {
+  if (entry.insertId?.trim()) {
+    return `cloud-run:${observedAt}:${entry.insertId}`
+  }
+  return `cloud-run:${observedAt}:${requestUrl}`
+}
+
+export function normalizeCloudRunLogEntry(entry: CloudRunLogEntry): NormalizedTrafficRequest | null {
+  const request = entry.httpRequest
+  if (!request?.requestUrl) return null
+
+  const observedAt = entry.timestamp ?? entry.receiveTimestamp
+  if (!observedAt) return null
+
+  const urlParts = parseRequestUrl(request.requestUrl)
+  if (!urlParts) return null
+
+  return {
+    sourceType: TrafficSourceTypes['cloud-run'],
+    evidenceKind: TrafficEvidenceKinds['raw-request'],
+    confidence: TrafficEventConfidences.observed,
+    eventId: buildEventId(entry, observedAt, request.requestUrl),
+    observedAt,
+    method: request.requestMethod ?? null,
+    requestUrl: request.requestUrl,
+    host: urlParts.host,
+    path: urlParts.path,
+    queryString: urlParts.queryString,
+    status: numberOrNull(request.status),
+    userAgent: request.userAgent ?? null,
+    remoteIp: request.remoteIp ?? null,
+    referer: request.referer ?? null,
+    latencyMs: latencyToMs(request.latency),
+    requestSizeBytes: numberOrNull(request.requestSize),
+    responseSizeBytes: numberOrNull(request.responseSize),
+    providerResource: {
+      type: entry.resource?.type ?? null,
+      labels: normalizeLabels(entry.resource?.labels),
+    },
+    providerLabels: normalizeLabels(entry.labels),
+  }
+}
diff --git a/packages/integration-cloud-run/src/types.ts b/packages/integration-cloud-run/src/types.ts
new file mode 100644
index 00000000..140702d6
--- /dev/null
+++ b/packages/integration-cloud-run/src/types.ts
@@ -0,0 +1,60 @@
+import type { NormalizedTrafficRequest } from '@ainyc/canonry-contracts'
+
+export interface CloudRunLogFilterOptions {
+  serviceName?: string
+  location?: string
+  startTime?: string | Date
+  endTime?: string | Date
+  userAgentSubstrings?: string[]
+  requestUrlSubstrings?: string[]
+}
+
+export interface CloudRunHttpRequest {
+  requestMethod?: string
+  requestUrl?: string
+  requestSize?: string | number
+  status?: number
+  responseSize?: string | number
+  userAgent?: string
+  remoteIp?: string
+  serverIp?: string
+  referer?: string
+  latency?: string
+  protocol?: string
+}
+
+export interface CloudRunLogEntry {
+  insertId?: string
+  timestamp?: string
+  receiveTimestamp?: string
+  logName?: string
+  textPayload?: string
+  resource?: {
+    type?: string
+    labels?: Record<string, string>
+  }
+  labels?: Record<string, string>
+  httpRequest?: CloudRunHttpRequest
+}
+
+export interface ListCloudRunTrafficEventsOptions extends CloudRunLogFilterOptions {
+  gcpProjectId: string
+  pageSize?: number
+  pageToken?: string
+  orderBy?: 'timestamp asc' | 'timestamp desc'
+  maxPages?: number
+  timeoutMs?: number
+}
+
+export interface CloudRunTrafficEventsPage {
+  events: NormalizedTrafficRequest[]
+  rawEntryCount: number
+  skippedEntryCount: number
+  nextPageToken?: string
+  filter: string
+}
+
+export interface CloudRunListLogEntriesResponse {
+  entries?: CloudRunLogEntry[]
+  nextPageToken?: string
+}
diff --git a/packages/integration-cloud-run/test/cloud-run-client.test.ts b/packages/integration-cloud-run/test/cloud-run-client.test.ts
new file mode 100644
index 00000000..99ffa8bc
--- /dev/null
+++ b/packages/integration-cloud-run/test/cloud-run-client.test.ts
@@ -0,0 +1,168 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+import {
+  buildCloudRunLogFilter,
+  listCloudRunTrafficEvents,
+  normalizeCloudRunLogEntry,
+} from '../src/index.js'
+
+describe('buildCloudRunLogFilter', () => {
+  it('builds a selective Cloud Logging filter for one Cloud Run service', () => {
+    const filter = buildCloudRunLogFilter({
+      serviceName: 'canonry-web',
+      location: 'us-central1',
+      startTime: '2026-04-30T10:00:00.000Z',
+      endTime: '2026-04-30T11:00:00.000Z',
+    })
+
+    expect(filter).toContain('resource.type="cloud_run_revision"')
+    expect(filter).toContain('resource.labels.service_name="canonry-web"')
+    expect(filter).toContain('resource.labels.location="us-central1"')
+    expect(filter).toContain('timestamp >= "2026-04-30T10:00:00.000Z"')
+    expect(filter).toContain('timestamp < "2026-04-30T11:00:00.000Z"')
+  })
+
+  it('escapes filter values and can add user-agent narrowing clauses', () => {
+    const filter = buildCloudRunLogFilter({
+      serviceName: 'web"quoted',
+      userAgentSubstrings: ['GPTBot/', 'ClaudeBot/'],
+      requestUrlSubstrings: ['ainyc.ai'],
+    })
+
+    expect(filter).toContain('resource.labels.service_name="web\\"quoted"')
+    expect(filter).toContain('(httpRequest.userAgent:"GPTBot/" OR httpRequest.userAgent:"ClaudeBot/")')
+    expect(filter).toContain('(httpRequest.requestUrl:"ainyc.ai")')
+  })
+})
+
+describe('normalizeCloudRunLogEntry', () => {
+  it('normalizes a Cloud Logging LogEntry httpRequest into Canonry request evidence', () => {
+    const event = normalizeCloudRunLogEntry({
+      insertId: 'abc123',
+      timestamp: '2026-04-30T12:00:00.123Z',
+      resource: {
+        type: 'cloud_run_revision',
+        labels: {
+          project_id: 'sample-project',
+          service_name: 'canonry-web',
+          location: 'us-central1',
+        },
+      },
+      httpRequest: {
+        requestMethod: 'GET',
+        requestUrl: 'https://example.com/blog/post?utm_source=chatgpt.com',
+        status: 200,
+        userAgent: 'GPTBot/1.2',
+        remoteIp: '203.0.113.10',
+        referer: 'https://chatgpt.com/',
+        latency: '0.123400s',
+        requestSize: '456',
+        responseSize: '789',
+      },
+      labels: {
+        'run.googleapis.com/base_image_versions': 'ignored-but-preserved',
+      },
+    })
+
+    expect(event).toMatchObject({
+      eventId: 'cloud-run:2026-04-30T12:00:00.123Z:abc123',
+      observedAt: '2026-04-30T12:00:00.123Z',
+      method: 'GET',
+      requestUrl: 'https://example.com/blog/post?utm_source=chatgpt.com',
+      host: 'example.com',
+      path: '/blog/post',
+      queryString: 'utm_source=chatgpt.com',
+      status: 200,
+      userAgent: 'GPTBot/1.2',
+      remoteIp: '203.0.113.10',
+      referer: 'https://chatgpt.com/',
+      latencyMs: 123.4,
+      requestSizeBytes: 456,
+      responseSizeBytes: 789,
+      providerResource: {
+        type: 'cloud_run_revision',
+        labels: {
+          project_id: 'sample-project',
+          service_name: 'canonry-web',
+          location: 'us-central1',
+        },
+      },
+      providerLabels: {
+        'run.googleapis.com/base_image_versions': 'ignored-but-preserved',
+      },
+    })
+  })
+
+  it('drops non-request log entries that have no httpRequest/requestUrl evidence', () => {
+    expect(normalizeCloudRunLogEntry({
+      insertId: 'log-line',
+      timestamp: '2026-04-30T12:00:00.123Z',
+      resource: { type: 'cloud_run_revision', labels: {} },
+      textPayload: 'application log',
+    })).toBeNull()
+  })
+})
+
+describe('listCloudRunTrafficEvents', () => {
+  let fetchSpy: ReturnType<typeof vi.spyOn>
+
+  beforeEach(() => {
+    fetchSpy = vi.spyOn(globalThis, 'fetch')
+  })
+
+  afterEach(() => {
+    fetchSpy.mockRestore()
+  })
+
+  it('calls entries.list with the Cloud Run filter and paginates
normalized request events', async () => { + const bodies: unknown[] = [] + fetchSpy.mockImplementation(async (_input: string | URL | Request, init?: RequestInit) => { + bodies.push(JSON.parse(init?.body as string)) + const body = bodies[bodies.length - 1] as { pageToken?: string } + return new Response(JSON.stringify({ + entries: [ + { + insertId: body.pageToken ? 'page-2' : 'page-1', + timestamp: body.pageToken ? '2026-04-30T12:01:00.000Z' : '2026-04-30T12:00:00.000Z', + resource: { type: 'cloud_run_revision', labels: { service_name: 'web' } }, + httpRequest: { + requestMethod: 'GET', + requestUrl: body.pageToken ? 'https://example.com/two' : 'https://example.com/one', + status: 200, + userAgent: 'GPTBot/1.2', + }, + }, + ], + nextPageToken: body.pageToken ? undefined : 'next-page', + }), { status: 200, headers: { 'Content-Type': 'application/json' } }) + }) + + const result = await listCloudRunTrafficEvents('token-123', { + gcpProjectId: 'sample-project', + serviceName: 'web', + startTime: '2026-04-30T12:00:00.000Z', + maxPages: 2, + pageSize: 100, + }) + + expect(fetchSpy).toHaveBeenCalledTimes(2) + expect(String(fetchSpy.mock.calls[0]![0])).toBe('https://logging.googleapis.com/v2/entries:list') + expect(fetchSpy.mock.calls[0]![1]).toMatchObject({ + method: 'POST', + headers: { + Authorization: 'Bearer token-123', + 'Content-Type': 'application/json', + }, + }) + expect(bodies[0]).toMatchObject({ + resourceNames: ['projects/sample-project'], + orderBy: 'timestamp asc', + pageSize: 100, + }) + expect((bodies[0] as { filter: string }).filter).toContain('resource.type="cloud_run_revision"') + expect(bodies[1]).toMatchObject({ pageToken: 'next-page' }) + expect(result.events.map((event) => event.path)).toEqual(['/one', '/two']) + expect(result.rawEntryCount).toBe(2) + expect(result.skippedEntryCount).toBe(0) + expect(result.nextPageToken).toBeUndefined() + }) +}) diff --git a/packages/integration-cloud-run/tsconfig.json b/packages/integration-cloud-run/tsconfig.json new file mode 100644 index 00000000..65aaab0c --- /dev/null +++ b/packages/integration-cloud-run/tsconfig.json @@ -0,0 +1,7 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "./src" + }, + "include": ["src/**/*.ts"] +} diff --git a/packages/integration-cloud-run/vitest.config.ts b/packages/integration-cloud-run/vitest.config.ts new file mode 100644 index 00000000..d3311973 --- /dev/null +++ b/packages/integration-cloud-run/vitest.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'vitest/config' + +export default defineConfig({ + test: { + include: ['test/**/*.test.ts', 'test/**/*.test.tsx'], + }, +}) diff --git a/packages/integration-traffic/AGENTS.md b/packages/integration-traffic/AGENTS.md new file mode 100644 index 00000000..f711437c --- /dev/null +++ b/packages/integration-traffic/AGENTS.md @@ -0,0 +1,36 @@ +# integration-traffic + +## Purpose + +Provider-neutral traffic classifier and rollup. Takes `NormalizedTrafficRequest` events (from any traffic adapter — Cloud Run today, WordPress/Cloudflare/Vercel later) and produces hourly crawler / AI-referral buckets plus a debugging probe report. Lives between the raw adapters and the future DB persistence layer. 
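+
+A minimal sketch of where this package sits, assuming a caller already holds
+normalized adapter output (the single event literal below is illustrative):
+
+```ts
+import {
+  TrafficEventConfidences,
+  TrafficEvidenceKinds,
+  TrafficSourceTypes,
+  type NormalizedTrafficRequest,
+} from '@ainyc/canonry-contracts'
+import { buildTrafficProbeReport } from '@ainyc/canonry-integration-traffic'
+
+// One illustrative GPTBot hit; real callers pass adapter output instead.
+const events: NormalizedTrafficRequest[] = [{
+  sourceType: TrafficSourceTypes['cloud-run'],
+  evidenceKind: TrafficEvidenceKinds['raw-request'],
+  confidence: TrafficEventConfidences.observed,
+  eventId: 'cloud-run:2026-05-01T12:10:00.000Z:abc',
+  observedAt: '2026-05-01T12:10:00.000Z',
+  method: 'GET',
+  requestUrl: 'https://example.com/blog/post',
+  host: 'example.com',
+  path: '/blog/post',
+  queryString: null,
+  status: 200,
+  userAgent: 'GPTBot/1.2',
+  remoteIp: '203.0.113.10',
+  referer: null,
+  latencyMs: null,
+  requestSizeBytes: null,
+  responseSizeBytes: null,
+  providerResource: { type: 'cloud_run_revision', labels: {} },
+  providerLabels: {},
+}]
+
+// totals.crawlerHits === 1; crawlerEventsHourly holds one claimed_unverified bucket.
+const report = buildTrafficProbeReport(events)
+console.log(report.totals)
+```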
+
+## Key Files
+
+| File | Role |
+|------|------|
+| `src/rules.ts` | Bundled AI crawler UA patterns and known AI-referrer host rules |
+| `src/classifier.ts` | `classifyCrawler` / `classifyAiReferral` — UA/referrer matching → `ClassifiedCrawler` / `ClassifiedAiReferral` (heuristic; verification status is `claimed_unverified` until IP/rDNS is wired) |
+| `src/rollup.ts` | `buildTrafficProbeReport` — aggregates classified events into hourly buckets + top-N summaries |
+| `src/types.ts` | Bucket shapes (`CrawlerEventHourlyBucket`, `AiReferralEventHourlyBucket`), classifier output types, probe report shape |
+| `src/index.ts` | Re-exports public API |
+
+## Patterns
+
+- **Pure functions, no I/O.** Adapters fetch and normalize; this package only classifies and rolls up. Unit-testable end-to-end with fixture events.
+- **Two evidence channels.** Crawlers via UA pattern match; human AI referrals via referer host (and UTM later). Never collapse the two — each maps to its own hourly bucket table per `plans/server-side-ai-traffic-ingestion.md`.
+- **Verification status tiers.** UA-only matches stay `claimed_unverified`. Promotion to `verified` requires IP/rDNS verification, which is not yet implemented. The `unknown_ai_like` bucket is reserved for behavioral heuristics.
+- **Path normalization.** Rollups key on the normalized path so query-string variants don't fragment the bucket counts.
+- **Bounded sample tail.** `buildTrafficProbeReport` keeps a small sample slice for classifier debugging; the durable signal is the hourly bucket counts.
+
+## Common Mistakes
+
+- **Adding I/O here.** Network/file/DB calls belong in adapters or service code. Keep this package pure.
+- **Treating UA-only matches as verified.** Anyone can spoof a user agent. Until IP/rDNS verification is wired, never promote UA-only matches above `claimed_unverified`.
+- **Mixing crawler and referral signals into one bucket.** They answer different questions (machine activity vs human clicks); keep them in separate tables and separate API surfaces.
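+
+The **Path normalization** pattern above, concretely (a short sketch against
+the exported helper):
+
+```ts
+import { normalizeTrafficPathPattern } from '@ainyc/canonry-integration-traffic'
+
+// Numeric, UUID, and 16+-char hex segments collapse to :id; ordinary slugs survive.
+normalizeTrafficPathPattern('/products/12345/reviews')  // '/products/:id/reviews'
+normalizeTrafficPathPattern('/blog/how-to-rank-in-ai')  // '/blog/how-to-rank-in-ai'
+```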
+ +## See Also + +- `packages/contracts/src/traffic.ts` — `NormalizedTrafficRequest` input contract +- `packages/integration-cloud-run/` — first raw-event adapter feeding this package +- `plans/server-side-ai-traffic-ingestion.md` — table layout, classifier tiers, surface plan +- `plans/cloud-run-traffic-source-model-review.md` — raw-event vs aggregate-bucket model rationale diff --git a/packages/integration-traffic/CLAUDE.md b/packages/integration-traffic/CLAUDE.md new file mode 100644 index 00000000..43c994c2 --- /dev/null +++ b/packages/integration-traffic/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/packages/integration-traffic/package.json b/packages/integration-traffic/package.json new file mode 100644 index 00000000..0a749560 --- /dev/null +++ b/packages/integration-traffic/package.json @@ -0,0 +1,22 @@ +{ + "name": "@ainyc/canonry-integration-traffic", + "version": "0.0.0", + "private": true, + "type": "module", + "license": "FSL-1.1-ALv2", + "exports": { + ".": { + "types": "./src/index.ts", + "default": "./src/index.ts" + } + }, + "types": "./src/index.ts", + "scripts": { + "typecheck": "tsc --noEmit -p tsconfig.json", + "test": "vitest run", + "lint": "eslint src/ test/" + }, + "dependencies": { + "@ainyc/canonry-contracts": "workspace:*" + } +} diff --git a/packages/integration-traffic/src/classifier.ts b/packages/integration-traffic/src/classifier.ts new file mode 100644 index 00000000..2c4bae62 --- /dev/null +++ b/packages/integration-traffic/src/classifier.ts @@ -0,0 +1,79 @@ +import type { NormalizedTrafficRequest } from '@ainyc/canonry-contracts' +import { DEFAULT_AI_CRAWLER_RULES, DEFAULT_AI_REFERRER_RULES } from './rules.js' +import type { ClassifiedAiReferral, ClassifiedCrawler } from './types.js' + +function normalizeHost(host: string): string { + return host.trim().toLowerCase().replace(/^www\./, '') +} + +function hostMatches(host: string, domain: string): boolean { + const normalizedHost = normalizeHost(host) + const normalizedDomain = normalizeHost(domain) + return normalizedHost === normalizedDomain || normalizedHost.endsWith(`.${normalizedDomain}`) +} + +function hostFromUrl(value: string | null): string | null { + if (!value) return null + try { + return normalizeHost(new URL(value).hostname) + } catch { + return null + } +} + +function utmSourceFromQuery(queryString: string | null): string | null { + if (!queryString) return null + const params = new URLSearchParams(queryString) + const source = params.get('utm_source') + return source ? 
normalizeHost(source) : null +} + +export function classifyCrawler(event: NormalizedTrafficRequest): ClassifiedCrawler | null { + const userAgent = event.userAgent?.trim() + if (!userAgent) return null + + for (const rule of DEFAULT_AI_CRAWLER_RULES) { + if (rule.userAgentPatterns.some((pattern) => pattern.test(userAgent))) { + return { + botId: rule.id, + operator: rule.operator, + product: rule.product, + purpose: rule.purpose, + verificationStatus: 'claimed_unverified', + matchedUserAgent: userAgent, + } + } + } + + return null +} + +export function classifyAiReferral(event: NormalizedTrafficRequest): ClassifiedAiReferral | null { + const refererHost = hostFromUrl(event.referer) + if (refererHost) { + const rule = DEFAULT_AI_REFERRER_RULES.find((candidate) => hostMatches(refererHost, candidate.domain)) + if (rule) { + return { + operator: rule.operator, + product: rule.product, + sourceDomain: refererHost, + evidenceType: 'referer', + } + } + } + + const utmSource = utmSourceFromQuery(event.queryString) + if (utmSource) { + const rule = DEFAULT_AI_REFERRER_RULES.find((candidate) => hostMatches(utmSource, candidate.domain)) + if (rule) { + return { + operator: rule.operator, + product: rule.product, + sourceDomain: utmSource, + evidenceType: 'utm', + } + } + } + + return null +} diff --git a/packages/integration-traffic/src/index.ts b/packages/integration-traffic/src/index.ts new file mode 100644 index 00000000..a16d98c8 --- /dev/null +++ b/packages/integration-traffic/src/index.ts @@ -0,0 +1,4 @@ +export * from './classifier.js' +export * from './rollup.js' +export * from './rules.js' +export type * from './types.js' diff --git a/packages/integration-traffic/src/rollup.ts b/packages/integration-traffic/src/rollup.ts new file mode 100644 index 00000000..5b994b81 --- /dev/null +++ b/packages/integration-traffic/src/rollup.ts @@ -0,0 +1,192 @@ +import type { NormalizedTrafficRequest } from '@ainyc/canonry-contracts' +import { classifyAiReferral, classifyCrawler } from './classifier.js' +import type { + AiReferralEventHourlyBucket, + BuildTrafficProbeReportOptions, + CrawlerEventHourlyBucket, + TrafficProbeReport, +} from './types.js' + +const DEFAULT_SAMPLE_LIMIT = 25 +const UUID_SEGMENT = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i +const LONG_HEX_SEGMENT = /^[0-9a-f]{16,}$/i +const NUMERIC_SEGMENT = /^\d+$/ + +export function normalizeTrafficPathPattern(path: string): string { + const cleanPath = path.trim() || '/' + const pathOnly = cleanPath.split('?')[0] || '/' + const segments = pathOnly.split('/').map((segment) => { + if (!segment) return segment + if (UUID_SEGMENT.test(segment) || LONG_HEX_SEGMENT.test(segment) || NUMERIC_SEGMENT.test(segment)) { + return ':id' + } + return segment + }) + const normalized = segments.join('/') + return normalized.startsWith('/') ? 
normalized : `/${normalized}`
+}
+
+function hourBucket(value: string): string {
+  const date = new Date(value)
+  if (Number.isNaN(date.getTime())) return value
+  date.setUTCMinutes(0, 0, 0)
+  return date.toISOString()
+}
+
+function sortCrawlerBuckets(a: CrawlerEventHourlyBucket, b: CrawlerEventHourlyBucket): number {
+  return a.tsHour.localeCompare(b.tsHour) ||
+    a.botId.localeCompare(b.botId) ||
+    a.pathNormalized.localeCompare(b.pathNormalized) ||
+    String(a.status).localeCompare(String(b.status))
+}
+
+function sortReferralBuckets(a: AiReferralEventHourlyBucket, b: AiReferralEventHourlyBucket): number {
+  return a.tsHour.localeCompare(b.tsHour) ||
+    a.product.localeCompare(b.product) ||
+    a.sourceDomain.localeCompare(b.sourceDomain) ||
+    a.landingPathNormalized.localeCompare(b.landingPathNormalized) ||
+    String(a.status).localeCompare(String(b.status))
+}
+
+function topEntries<T extends Record<string, string>>(
+  map: Map<string, { fields: T; hits: number }>,
+  limit: number,
+): Array<T & { hits: number }> {
+  return [...map.values()]
+    .sort((a, b) => b.hits - a.hits || JSON.stringify(a.fields).localeCompare(JSON.stringify(b.fields)))
+    .slice(0, limit)
+    .map((entry) => ({ ...entry.fields, hits: entry.hits }))
+}
+
+export function buildTrafficProbeReport(
+  events: NormalizedTrafficRequest[],
+  options: BuildTrafficProbeReportOptions = {},
+): TrafficProbeReport {
+  const sampleLimit = options.sampleLimit ?? DEFAULT_SAMPLE_LIMIT
+  const crawlerBuckets = new Map<string, CrawlerEventHourlyBucket>()
+  const aiReferralBuckets = new Map<string, AiReferralEventHourlyBucket>()
+  const topBots = new Map<string, { fields: { botId: string; operator: string }; hits: number }>()
+  const topCrawlerPaths = new Map<string, { fields: { pathNormalized: string }; hits: number }>()
+  const topAiReferrers = new Map<string, { fields: { sourceDomain: string; product: string }; hits: number }>()
+  const topAiReferralLandingPaths = new Map<string, { fields: { landingPathNormalized: string }; hits: number }>()
+
+  let crawlerHits = 0
+  let aiReferralHits = 0
+  let unknownHits = 0
+  const samples: TrafficProbeReport['samples'] = []
+
+  for (const event of events) {
+    const tsHour = hourBucket(event.observedAt)
+    const pathNormalized = normalizeTrafficPathPattern(event.path)
+    const crawler = classifyCrawler(event)
+    const aiReferral = classifyAiReferral(event)
+
+    if (crawler) {
+      crawlerHits += 1
+      const key = [
+        tsHour,
+        crawler.botId,
+        crawler.verificationStatus,
+        pathNormalized,
+        event.status ?? 'null',
+      ].join('\t')
+      const existing = crawlerBuckets.get(key)
+      if (existing) {
+        existing.hits += 1
+      } else {
+        crawlerBuckets.set(key, {
+          tsHour,
+          botId: crawler.botId,
+          operator: crawler.operator,
+          product: crawler.product,
+          verificationStatus: crawler.verificationStatus,
+          pathNormalized,
+          status: event.status,
+          hits: 1,
+          sampledUserAgent: event.userAgent,
+        })
+      }
+      const botKey = `${crawler.botId}\t${crawler.operator}`
+      const botEntry = topBots.get(botKey)
+      if (botEntry) botEntry.hits += 1
+      else topBots.set(botKey, { fields: { botId: crawler.botId, operator: crawler.operator }, hits: 1 })
+      incrementBucket(topCrawlerPaths, pathNormalized, { pathNormalized })
+    }
+
+    if (aiReferral) {
+      aiReferralHits += 1
+      const key = [
+        tsHour,
+        aiReferral.product,
+        aiReferral.sourceDomain,
+        aiReferral.evidenceType,
+        pathNormalized,
+        event.status ?? 'null',
+      ].join('\t')
+      const existing = aiReferralBuckets.get(key)
+      if (existing) {
+        existing.hits += 1
+      } else {
+        aiReferralBuckets.set(key, {
+          tsHour,
+          operator: aiReferral.operator,
+          product: aiReferral.product,
+          sourceDomain: aiReferral.sourceDomain,
+          evidenceType: aiReferral.evidenceType,
+          landingPathNormalized: pathNormalized,
+          status: event.status,
+          hits: 1,
+        })
+      }
+      incrementBucket(topAiReferrers, aiReferral.sourceDomain, {
+        sourceDomain: aiReferral.sourceDomain,
+        product: aiReferral.product,
+      })
+      incrementBucket(topAiReferralLandingPaths, pathNormalized, { landingPathNormalized: pathNormalized })
+    }
+
+    if (!crawler && !aiReferral) unknownHits += 1
+
+    if (samples.length < sampleLimit) {
+      samples.push({
+        eventId: event.eventId,
+        observedAt: event.observedAt,
+        sourceType: event.sourceType,
+        path: event.path,
+        pathNormalized,
+        status: event.status,
+        userAgent: event.userAgent,
+        referer: event.referer,
+        crawler,
+        aiReferral,
+      })
+    }
+  }
+
+  return {
+    generatedAt: options.generatedAt ?? new Date().toISOString(),
+    totals: {
+      normalizedEvents: events.length,
+      crawlerHits,
+      aiReferralHits,
+      unknownHits,
+    },
+    crawlerEventsHourly: [...crawlerBuckets.values()].sort(sortCrawlerBuckets),
+    aiReferralEventsHourly: [...aiReferralBuckets.values()].sort(sortReferralBuckets),
+    topBots: topEntries(topBots, 10),
+    topCrawlerPaths: topEntries(topCrawlerPaths, 10),
+    topAiReferrers: topEntries(topAiReferrers, 10),
+    topAiReferralLandingPaths: topEntries(topAiReferralLandingPaths, 10),
+    samples,
+  }
+}
+
+function incrementBucket<T extends Record<string, string>>(
+  map: Map<string, { fields: T; hits: number }>,
+  key: string,
+  fields: T,
+): void {
+  const existing = map.get(key)
+  if (existing) existing.hits += 1
+  else map.set(key, { fields, hits: 1 })
+}
diff --git a/packages/integration-traffic/src/rules.ts b/packages/integration-traffic/src/rules.ts
new file mode 100644
index 00000000..7035ca13
--- /dev/null
+++ b/packages/integration-traffic/src/rules.ts
@@ -0,0 +1,125 @@
+import type { AiCrawlerRule, AiReferrerRule } from './types.js'
+
+export const DEFAULT_AI_CRAWLER_RULES: AiCrawlerRule[] = [
+  {
+    id: 'openai-gptbot',
+    operator: 'OpenAI',
+    product: 'GPTBot',
+    purpose: 'training',
+    userAgentPatterns: [/GPTBot\//i],
+  },
+  {
+    id: 'openai-searchbot',
+    operator: 'OpenAI',
+    product: 'OAI-SearchBot',
+    purpose: 'search',
+    userAgentPatterns: [/OAI-SearchBot\//i],
+  },
+  {
+    id: 'openai-chatgpt-user',
+    operator: 'OpenAI',
+    product: 'ChatGPT-User',
+    purpose: 'user-agent',
+    userAgentPatterns: [/ChatGPT-User\//i],
+  },
+  {
+    id: 'anthropic-claudebot',
+    operator: 'Anthropic',
+    product: 'ClaudeBot',
+    purpose: 'training',
+    userAgentPatterns: [/ClaudeBot\//i, /Claude-Web\//i, /anthropic-ai/i],
+  },
+  {
+    id: 'perplexity-bot',
+    operator: 'Perplexity',
+    product: 'PerplexityBot',
+    purpose: 'search',
+    userAgentPatterns: [/PerplexityBot\//i],
+  },
+  {
+    id: 'google-extended',
+    operator: 'Google',
+    product: 'Google-Extended',
+    purpose: 'training-control',
+    userAgentPatterns: [/Google-Extended/i],
+  },
+  {
+    id: 'bytespider',
+    operator: 'ByteDance',
+    product: 'Bytespider',
+    purpose: 'training',
+    userAgentPatterns: [/Bytespider/i],
+  },
+  {
+    id: 'applebot-extended',
+    operator: 'Apple',
+    product: 'Applebot-Extended',
+    purpose: 'training',
+    userAgentPatterns: [/Applebot-Extended/i],
+  },
+  {
+    id: 'meta-externalagent',
+    operator: 'Meta',
+    product: 'meta-externalagent',
+    purpose: 'training',
+    userAgentPatterns: [/meta-externalagent/i],
+  },
+  {
+    id: 'ccbot',
+    operator: 'Common Crawl',
+    product: 
'CCBot', + purpose: 'crawl', + userAgentPatterns: [/CCBot\//i], + }, + { + id: 'cohere-ai', + operator: 'Cohere', + product: 'cohere-ai', + purpose: 'training', + userAgentPatterns: [/cohere-ai/i], + }, + { + id: 'diffbot', + operator: 'Diffbot', + product: 'Diffbot', + purpose: 'crawl', + userAgentPatterns: [/Diffbot/i], + }, + { + id: 'mistral-ai', + operator: 'Mistral AI', + product: 'MistralAI-User', + purpose: 'crawl', + userAgentPatterns: [/MistralAI/i], + }, +] + +export const DEFAULT_AI_CRAWLER_USER_AGENT_SUBSTRINGS = [ + 'GPTBot/', + 'OAI-SearchBot/', + 'ChatGPT-User/', + 'ClaudeBot/', + 'Claude-Web/', + 'anthropic-ai', + 'PerplexityBot/', + 'Google-Extended', + 'Bytespider', + 'Applebot-Extended', + 'meta-externalagent', + 'CCBot/', + 'cohere-ai', + 'Diffbot', + 'MistralAI', +] + +export const DEFAULT_AI_REFERRER_RULES: AiReferrerRule[] = [ + { domain: 'chatgpt.com', operator: 'OpenAI', product: 'ChatGPT' }, + { domain: 'chat.openai.com', operator: 'OpenAI', product: 'ChatGPT' }, + { domain: 'perplexity.ai', operator: 'Perplexity', product: 'Perplexity' }, + { domain: 'claude.ai', operator: 'Anthropic', product: 'Claude' }, + { domain: 'gemini.google.com', operator: 'Google', product: 'Gemini' }, + { domain: 'copilot.microsoft.com', operator: 'Microsoft', product: 'Copilot' }, + { domain: 'phind.com', operator: 'Phind', product: 'Phind' }, + { domain: 'you.com', operator: 'You.com', product: 'You.com' }, + { domain: 'meta.ai', operator: 'Meta', product: 'Meta AI' }, +] diff --git a/packages/integration-traffic/src/types.ts b/packages/integration-traffic/src/types.ts new file mode 100644 index 00000000..03423410 --- /dev/null +++ b/packages/integration-traffic/src/types.ts @@ -0,0 +1,92 @@ +import type { NormalizedTrafficRequest } from '@ainyc/canonry-contracts' + +export type CrawlerVerificationStatus = 'verified' | 'claimed_unverified' | 'unknown_ai_like' +export type AiReferralEvidenceType = 'referer' | 'utm' + +export interface AiCrawlerRule { + id: string + operator: string + product: string + purpose: string + userAgentPatterns: RegExp[] +} + +export interface AiReferrerRule { + domain: string + operator: string + product: string +} + +export interface ClassifiedCrawler { + botId: string + operator: string + product: string + purpose: string + verificationStatus: CrawlerVerificationStatus + matchedUserAgent: string +} + +export interface ClassifiedAiReferral { + operator: string + product: string + sourceDomain: string + evidenceType: AiReferralEvidenceType +} + +export interface CrawlerEventHourlyBucket { + tsHour: string + botId: string + operator: string + product: string + verificationStatus: CrawlerVerificationStatus + pathNormalized: string + status: number | null + hits: number + sampledUserAgent: string | null +} + +export interface AiReferralEventHourlyBucket { + tsHour: string + operator: string + product: string + sourceDomain: string + evidenceType: AiReferralEvidenceType + landingPathNormalized: string + status: number | null + hits: number +} + +export interface TrafficProbeSample { + eventId: string + observedAt: string + sourceType: NormalizedTrafficRequest['sourceType'] + path: string + pathNormalized: string + status: number | null + userAgent: string | null + referer: string | null + crawler: ClassifiedCrawler | null + aiReferral: ClassifiedAiReferral | null +} + +export interface TrafficProbeReport { + generatedAt: string + totals: { + normalizedEvents: number + crawlerHits: number + aiReferralHits: number + unknownHits: number + } + crawlerEventsHourly: 
CrawlerEventHourlyBucket[]
+  aiReferralEventsHourly: AiReferralEventHourlyBucket[]
+  topBots: Array<{ botId: string; operator: string; hits: number }>
+  topCrawlerPaths: Array<{ pathNormalized: string; hits: number }>
+  topAiReferrers: Array<{ sourceDomain: string; product: string; hits: number }>
+  topAiReferralLandingPaths: Array<{ landingPathNormalized: string; hits: number }>
+  samples: TrafficProbeSample[]
+}
+
+export interface BuildTrafficProbeReportOptions {
+  generatedAt?: string
+  sampleLimit?: number
+}
diff --git a/packages/integration-traffic/test/analysis.test.ts b/packages/integration-traffic/test/analysis.test.ts
new file mode 100644
index 00000000..f132c1bb
--- /dev/null
+++ b/packages/integration-traffic/test/analysis.test.ts
@@ -0,0 +1,121 @@
+import {
+  TrafficEventConfidences,
+  TrafficEvidenceKinds,
+  TrafficSourceTypes,
+  type NormalizedTrafficRequest,
+} from '@ainyc/canonry-contracts'
+import { describe, expect, it } from 'vitest'
+import {
+  buildTrafficProbeReport,
+  classifyAiReferral,
+  classifyCrawler,
+  normalizeTrafficPathPattern,
+} from '../src/index.js'
+
+function event(overrides: Partial<NormalizedTrafficRequest>): NormalizedTrafficRequest {
+  return {
+    sourceType: TrafficSourceTypes['cloud-run'],
+    evidenceKind: TrafficEvidenceKinds['raw-request'],
+    confidence: TrafficEventConfidences.observed,
+    eventId: overrides.eventId ?? crypto.randomUUID(),
+    observedAt: overrides.observedAt ?? '2026-05-01T12:30:00.000Z',
+    method: overrides.method ?? 'GET',
+    requestUrl: overrides.requestUrl ?? 'https://example.com/',
+    host: overrides.host ?? 'example.com',
+    path: overrides.path ?? '/',
+    queryString: overrides.queryString ?? null,
+    status: overrides.status ?? 200,
+    userAgent: overrides.userAgent ?? 'Mozilla/5.0',
+    remoteIp: overrides.remoteIp ?? '203.0.113.10',
+    referer: overrides.referer ?? null,
+    latencyMs: overrides.latencyMs ?? null,
+    requestSizeBytes: overrides.requestSizeBytes ?? null,
+    responseSizeBytes: overrides.responseSizeBytes ?? null,
+    providerResource: overrides.providerResource ?? { type: 'cloud_run_revision', labels: {} },
+    providerLabels: overrides.providerLabels ?? 
{}, + } +} + +describe('traffic analysis', () => { + it('classifies known AI crawler user agents', () => { + expect(classifyCrawler(event({ userAgent: 'Mozilla/5.0 GPTBot/1.2' }))).toMatchObject({ + botId: 'openai-gptbot', + operator: 'OpenAI', + verificationStatus: 'claimed_unverified', + }) + }) + + it('classifies explicit AI referrals from referer and UTM evidence', () => { + expect(classifyAiReferral(event({ referer: 'https://chatgpt.com/c/abc' }))).toMatchObject({ + product: 'ChatGPT', + operator: 'OpenAI', + evidenceType: 'referer', + sourceDomain: 'chatgpt.com', + }) + + expect(classifyAiReferral(event({ + referer: null, + queryString: 'utm_source=perplexity.ai&utm_medium=referral', + }))).toMatchObject({ + product: 'Perplexity', + operator: 'Perplexity', + evidenceType: 'utm', + sourceDomain: 'perplexity.ai', + }) + }) + + it('normalizes high-cardinality path IDs without rewriting ordinary slugs', () => { + expect(normalizeTrafficPathPattern('/blog/how-to-rank-in-ai')).toBe('/blog/how-to-rank-in-ai') + expect(normalizeTrafficPathPattern('/products/12345/reviews')).toBe('/products/:id/reviews') + expect(normalizeTrafficPathPattern('/orders/018f6ff2-34ab-7c12-a5c0-9c8a8f2d1111')).toBe('/orders/:id') + }) + + it('rolls normalized events into crawler and AI-referral buckets', () => { + const report = buildTrafficProbeReport([ + event({ + eventId: 'crawler-1', + observedAt: '2026-05-01T12:10:00.000Z', + path: '/blog/post-1', + userAgent: 'GPTBot/1.2', + }), + event({ + eventId: 'crawler-2', + observedAt: '2026-05-01T12:15:00.000Z', + path: '/blog/post-1', + userAgent: 'GPTBot/1.2', + }), + event({ + eventId: 'referral-1', + observedAt: '2026-05-01T13:05:00.000Z', + path: '/pricing', + userAgent: 'Mozilla/5.0', + referer: 'https://claude.ai/chat', + }), + ], { generatedAt: '2026-05-01T14:00:00.000Z' }) + + expect(report.totals).toMatchObject({ + normalizedEvents: 3, + crawlerHits: 2, + aiReferralHits: 1, + unknownHits: 0, + }) + expect(report.crawlerEventsHourly).toEqual([ + expect.objectContaining({ + tsHour: '2026-05-01T12:00:00.000Z', + botId: 'openai-gptbot', + pathNormalized: '/blog/post-1', + hits: 2, + }), + ]) + expect(report.aiReferralEventsHourly).toEqual([ + expect.objectContaining({ + tsHour: '2026-05-01T13:00:00.000Z', + product: 'Claude', + landingPathNormalized: '/pricing', + hits: 1, + }), + ]) + expect(report.topBots[0]).toEqual({ botId: 'openai-gptbot', operator: 'OpenAI', hits: 2 }) + expect(report.topAiReferrers[0]).toEqual({ sourceDomain: 'claude.ai', product: 'Claude', hits: 1 }) + }) +}) diff --git a/packages/integration-traffic/tsconfig.json b/packages/integration-traffic/tsconfig.json new file mode 100644 index 00000000..65aaab0c --- /dev/null +++ b/packages/integration-traffic/tsconfig.json @@ -0,0 +1,7 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "./src" + }, + "include": ["src/**/*.ts"] +} diff --git a/packages/integration-traffic/vitest.config.ts b/packages/integration-traffic/vitest.config.ts new file mode 100644 index 00000000..d3311973 --- /dev/null +++ b/packages/integration-traffic/vitest.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'vitest/config' + +export default defineConfig({ + test: { + include: ['test/**/*.test.ts', 'test/**/*.test.tsx'], + }, +}) diff --git a/plans/cloud-run-traffic-source-model-review.md b/plans/cloud-run-traffic-source-model-review.md new file mode 100644 index 00000000..79baceed --- /dev/null +++ b/plans/cloud-run-traffic-source-model-review.md @@ -0,0 +1,153 @@ +# Server-Side 
Traffic Source Model Review + +**Status:** implementation note for the Cloud Run stacked PR +**Last updated:** 2026-05-01 + +## Why this exists + +The server-side traffic plan needs one model that can handle Cloud Run, WordPress, +Cloudflare, Vercel, and future hosting providers without turning the API into a +set of provider-specific dashboards. The important split is not provider name; +it is evidence type. + +## Source facts checked + +- Cloud Run automatically sends service request logs to Cloud Logging, separate + from container and system logs. Google documents that these request logs are + created automatically for Cloud Run services: + https://docs.cloud.google.com/run/docs/logging +- Cloud Logging `entries.list` is a pull API over log entries. It accepts + `resourceNames`, a Logging query `filter`, `orderBy`, `pageSize`, and + `pageToken`, and it requires `logging.logEntries.list`-class permission: + https://docs.cloud.google.com/logging/docs/reference/v2/rest/v2/entries/list +- Cloud Logging `LogEntry.httpRequest` carries the request URL, status, user + agent, remote IP, referer, sizes, and latency fields Canonry needs for raw + server-side evidence: + https://docs.cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry#HttpRequest +- Cloudflare GraphQL Analytics is aggregate analytics over HTTP requests, and + Cloudflare AI Crawl Control exposes crawler-oriented filters such as + `userAgent_like`, `clientRequestPath_like`, `clientRefererHost_like`, and + `botDetectionIds_hasany` through that aggregate API: + https://developers.cloudflare.com/analytics/graphql-api/ + https://developers.cloudflare.com/ai-crawl-control/reference/graphql-api/ +- Vercel Log Drains include proxy fields such as method, host, path, user + agent, referer, status code, and client IP, but drains send logs to a + configured destination. That makes them raw-event capable, but not directly + pull-only unless the customer configures a destination Canonry can later pull + from: + https://vercel.com/docs/drains/reference/logs + https://vercel.com/docs/drains + +## Models considered + +### 1. Provider-specific tables + +Example: `cloud_run_request_logs`, `wordpress_crawler_hits`, +`cloudflare_ai_crawler_groups`, `vercel_proxy_events`. + +Rejected. It would make every API/CLI/dashboard query provider-aware, and it +would make citation-crawl-click joins expensive to maintain. It also bakes the +first provider's quirks into the product. + +### 2. Store full raw logs + +Example: persist every Cloud Logging `LogEntry`, Vercel drain row, or WordPress +request row. + +Rejected for the main database. Raw logs are high-cardinality, privacy-sensitive, +and provider-shaped. Canonry should retain only normalized evidence plus a small +sample tail for debugging. + +### 3. Aggregate-only model + +Example: every adapter writes hourly `{bot, path, status, hits}` buckets and +never preserves per-request evidence. + +Rejected as the only model. It fits Cloudflare GraphQL well, but loses the raw +IP/user-agent/referrer fields needed for verification, replaying classifier +improvements, and sampling. + +### 4. Canonical evidence model plus provider capabilities + +Recommended. Each adapter declares what it can supply: + +- `raw-request-events`: individual request evidence, e.g. Cloud Run request + logs and the WordPress plugin. +- `aggregate-request-metrics`: grouped request counts, e.g. Cloudflare + GraphQL Analytics. +- field capabilities: `request-url`, `status-code`, `user-agent`, `remote-ip`, + `referer`, `cursor-pull`. 
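+
+A sketch of what such a declaration could look like with the enums that
+already ship in `@ainyc/canonry-contracts` (the descriptor interface itself is
+hypothetical; only the enum constants below exist today):
+
+```ts
+import {
+  TrafficAdapterCapabilities,
+  TrafficSourceTypes,
+  type TrafficAdapterCapability,
+  type TrafficSourceType,
+} from '@ainyc/canonry-contracts'
+
+// Hypothetical descriptor shape; a future registry could key adapters on it.
+interface TrafficAdapterDescriptor {
+  sourceType: TrafficSourceType
+  capabilities: TrafficAdapterCapability[]
+}
+
+const cloudRunAdapter: TrafficAdapterDescriptor = {
+  sourceType: TrafficSourceTypes['cloud-run'],
+  capabilities: [
+    TrafficAdapterCapabilities['raw-request-events'],
+    TrafficAdapterCapabilities['request-url'],
+    TrafficAdapterCapabilities['status-code'],
+    TrafficAdapterCapabilities['user-agent'],
+    TrafficAdapterCapabilities['remote-ip'],
+    TrafficAdapterCapabilities['referer'],
+    TrafficAdapterCapabilities['cursor-pull'],
+  ],
+}
+```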
+
+Raw adapters normalize into `NormalizedTrafficRequest`. Aggregate adapters
+normalize into a future `NormalizedTrafficAggregateBucket`. Both feed the same
+rollup tables and public API. Raw evidence can be reclassified as the bot
+manifest improves; aggregate evidence carries lower replay/verification power.
+
+## Recommended provider mapping
+
+| Provider | Evidence model | Why |
+|---|---|---|
+| Cloud Run / Cloud Logging | Raw request events | Pull API, automatic request logs, `httpRequest` includes URL, status, UA, IP, referer. |
+| WordPress plugin | Raw request events | Plugin runs server-side and Canonry pulls plugin-owned rows. |
+| Cloudflare GraphQL | Aggregate request metrics | GraphQL Analytics groups/filter counts; AI Crawl Control can filter crawler/referrer dimensions, but it is not raw log replay. |
+| Vercel Log Drains | Raw request events only if customer provides a pullable drain destination | Vercel sends drains to a destination, which conflicts with Canonry local-only pull unless the destination is user-owned storage/API. |
+
+## Cloud Run PR scope
+
+This stacked PR starts the Cloud Run path without adding a partial dashboard:
+
+1. Add provider-neutral traffic contract constants and `NormalizedTrafficRequest`.
+2. Add a provider-neutral local traffic analysis package that rolls normalized
+   request evidence into crawler/referral buckets before DB persistence exists.
+3. Add a Cloud Run integration package that:
+   - builds Cloud Logging filters for `cloud_run_revision`;
+   - optionally narrows by service, location, timestamp window, and user-agent
+     substrings;
+   - pulls `entries.list` with page tokens;
+   - normalizes `LogEntry.httpRequest` into Canonry request evidence.
+4. Keep persistence/API/CLI for the next PR so public surfaces are introduced
+   only when the storage and sync semantics are complete.
+
+## Local pull and analysis probe
+
+Before adding Canonry DB/API/CLI surfaces, use the local probe to test the
+pull-normalize-ingest-analyze loop:
+
+```bash
+pnpm tsx scripts/test-cloud-run-traffic-pull.ts \
+  --fixture scripts/fixtures/cloud-run-traffic-sample.json
+```
+
+For real Cloud Run logs:
+
+```bash
+pnpm tsx scripts/test-cloud-run-traffic-pull.ts \
+  --gcp-project <project-id> \
+  --service <service-name> \
+  --location <region> \
+  --since 6h \
+  --url-contains ainyc.ai \
+  --use-gcloud \
+  --out .tmp/cloud-run-traffic-report.json
+```
+
+Use `--narrow-bots` to add known AI-crawler user-agent filters to the Cloud
+Logging query. That lowers log volume, but it intentionally misses human AI
+referrals because those requests normally have browser user agents.
+
+## Next implementation slice
+
+The next stacked PR should add the shared traffic persistence and public
+surfaces:
+
+- DB tables: `traffic_sources`, `crawler_events_hourly`,
+  `ai_referral_events_hourly`, `raw_event_samples`.
+- API/CLI:
+  - `traffic connect cloud-run`
+  - `traffic sync --source cloud-run`
+  - `traffic sources`
+  - `traffic crawlers`
+  - `traffic referrals`
+  - `traffic timeline`
+- Config storage under `~/.canonry/config.yaml`, not SQLite.
+- MCP registry entries only after the API and CLI exist.
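+
+As a sketch of the sync loop that the planned `traffic sync --source cloud-run`
+command could run (the CLI surface above is not implemented yet; only the
+adapter call is real today):
+
+```ts
+import { listCloudRunTrafficEvents } from '@ainyc/canonry-integration-cloud-run'
+import type { NormalizedTrafficRequest } from '@ainyc/canonry-contracts'
+
+// Pull one Cloud Logging page per call and carry the cursor between calls.
+async function pullAll(
+  accessToken: string,
+  gcpProjectId: string,
+): Promise<NormalizedTrafficRequest[]> {
+  const events: NormalizedTrafficRequest[] = []
+  let pageToken: string | undefined
+  do {
+    const page = await listCloudRunTrafficEvents(accessToken, {
+      gcpProjectId,
+      pageToken,
+      maxPages: 1, // one page per call keeps the cursor under caller control
+    })
+    events.push(...page.events)
+    pageToken = page.nextPageToken
+  } while (pageToken)
+  return events
+}
+```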
diff --git a/plans/server-side-ai-traffic-ingestion.md b/plans/server-side-ai-traffic-ingestion.md index fe3621c3..10a4bb76 100644 --- a/plans/server-side-ai-traffic-ingestion.md +++ b/plans/server-side-ai-traffic-ingestion.md @@ -1,7 +1,7 @@ # Server-Side AI Traffic & Crawler Ingestion Plan Status: design plan for implementation after GA known-AI landing page work -Last updated: 2026-04-30 +Last updated: 2026-05-01 ## Context @@ -12,6 +12,10 @@ Canonry already has two AI discovery signals: GA4 cannot recover crawler activity or referrer-stripped AI clicks. GA4 also automatically excludes known bot traffic before reporting and does not expose how much was excluded. Server-side ingestion is therefore a separate product layer: it captures request evidence before browser JavaScript, GA attribution, or GA bot filtering shape the data. +See [`cloud-run-traffic-source-model-review.md`](./cloud-run-traffic-source-model-review.md) +for the provider-capability review behind the Cloud Run starter adapter and the +raw-event vs aggregate-bucket split. + ## Goals 1. Show AI crawler activity by bot, verification confidence, path, status, and time window. diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bae0d21a..eacffac0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -358,6 +358,12 @@ importers: specifier: workspace:* version: link:../contracts + packages/integration-cloud-run: + dependencies: + '@ainyc/canonry-contracts': + specifier: workspace:* + version: link:../contracts + packages/integration-commoncrawl: dependencies: '@ainyc/canonry-contracts': @@ -380,6 +386,12 @@ importers: specifier: workspace:* version: link:../contracts + packages/integration-traffic: + dependencies: + '@ainyc/canonry-contracts': + specifier: workspace:* + version: link:../contracts + packages/integration-wordpress: dependencies: '@ainyc/canonry-contracts': diff --git a/scripts/fixtures/cloud-run-traffic-sample.json b/scripts/fixtures/cloud-run-traffic-sample.json new file mode 100644 index 00000000..999b4f69 --- /dev/null +++ b/scripts/fixtures/cloud-run-traffic-sample.json @@ -0,0 +1,76 @@ +{ + "entries": [ + { + "insertId": "crawler-1", + "timestamp": "2026-05-01T12:10:00.000Z", + "resource": { + "type": "cloud_run_revision", + "labels": { + "project_id": "sample-project", + "service_name": "web", + "location": "us-central1" + } + }, + "httpRequest": { + "requestMethod": "GET", + "requestUrl": "https://example.com/blog/ai-search", + "status": 200, + "userAgent": "GPTBot/1.2", + "remoteIp": "203.0.113.10", + "latency": "0.120s" + } + }, + { + "insertId": "referral-1", + "timestamp": "2026-05-01T12:15:00.000Z", + "resource": { + "type": "cloud_run_revision", + "labels": { + "project_id": "sample-project", + "service_name": "web", + "location": "us-central1" + } + }, + "httpRequest": { + "requestMethod": "GET", + "requestUrl": "https://example.com/pricing?utm_source=chatgpt.com", + "status": 200, + "userAgent": "Mozilla/5.0", + "remoteIp": "198.51.100.12", + "referer": "https://chatgpt.com/" + } + }, + { + "insertId": "ordinary-1", + "timestamp": "2026-05-01T12:20:00.000Z", + "resource": { + "type": "cloud_run_revision", + "labels": { + "project_id": "sample-project", + "service_name": "web", + "location": "us-central1" + } + }, + "httpRequest": { + "requestMethod": "GET", + "requestUrl": "https://example.com/orders/12345", + "status": 200, + "userAgent": "Mozilla/5.0", + "remoteIp": "198.51.100.13" + } + }, + { + "insertId": "application-log", + "timestamp": "2026-05-01T12:25:00.000Z", + "resource": { + "type": 
"cloud_run_revision", + "labels": { + "project_id": "sample-project", + "service_name": "web", + "location": "us-central1" + } + }, + "textPayload": "not a request log" + } + ] +} diff --git a/scripts/fixtures/ga4-ai-referrals-sample.json b/scripts/fixtures/ga4-ai-referrals-sample.json new file mode 100644 index 00000000..19916688 --- /dev/null +++ b/scripts/fixtures/ga4-ai-referrals-sample.json @@ -0,0 +1,49 @@ +{ + "rows": [ + { + "date": "2026-05-01", + "source": "chatgpt.com", + "medium": "referral", + "landingPage": "/pricing", + "sessions": 14, + "users": 12, + "sourceDimension": "session" + }, + { + "date": "2026-05-01", + "source": "perplexity.ai", + "medium": "referral", + "landingPage": "/blog/ai-search", + "sessions": 3, + "users": 3, + "sourceDimension": "session" + }, + { + "date": "2026-05-01", + "source": "claude.ai", + "medium": "referral", + "landingPage": "/docs/getting-started", + "sessions": 2, + "users": 2, + "sourceDimension": "session" + }, + { + "date": "2026-05-01", + "source": "chatgpt.com", + "medium": "referral", + "landingPage": "/pricing", + "sessions": 14, + "users": 12, + "sourceDimension": "first_user" + }, + { + "date": "2026-05-01", + "source": "chatgpt-recommendation", + "medium": "referral", + "landingPage": "/features", + "sessions": 5, + "users": 4, + "sourceDimension": "manual_utm" + } + ] +} diff --git a/scripts/test-cloud-run-traffic-pull.ts b/scripts/test-cloud-run-traffic-pull.ts new file mode 100644 index 00000000..411762d2 --- /dev/null +++ b/scripts/test-cloud-run-traffic-pull.ts @@ -0,0 +1,344 @@ +#!/usr/bin/env tsx +import { execFile } from 'node:child_process' +import fs from 'node:fs/promises' +import path from 'node:path' +import { promisify } from 'node:util' +import { + listCloudRunTrafficEvents, + normalizeCloudRunLogEntry, + type CloudRunLogEntry, +} from '../packages/integration-cloud-run/src/index.js' +import { + DEFAULT_AI_CRAWLER_USER_AGENT_SUBSTRINGS, + buildTrafficProbeReport, +} from '../packages/integration-traffic/src/index.js' + +const execFileAsync = promisify(execFile) + +interface Args { + gcpProjectId?: string + serviceName?: string + location?: string + since: string + until?: string + pageSize: number + maxPages: number + accessToken?: string + tokenEnv: string + useGcloud: boolean + narrowBots: boolean + urlContains: string[] + fixture?: string + out?: string + json: boolean + help: boolean +} + +interface PullResult { + events: ReturnType[] + rawEntryCount: number + skippedEntryCount: number + nextPageToken?: string + filter: string +} + +function usage(): string { + return [ + 'Usage:', + ' pnpm tsx scripts/test-cloud-run-traffic-pull.ts --gcp-project [--service ] [--location ]', + ' pnpm tsx scripts/test-cloud-run-traffic-pull.ts --fixture scripts/fixtures/cloud-run-traffic-sample.json', + '', + 'Options:', + ' --since Window start. Durations: 30m, 6h, 2d. Default: 1h', + ' --until Window end. Default: now', + ' --page-size Cloud Logging page size. Default: 1000', + ' --max-pages Max pages to pull. Default: 1', + ' --access-token Bearer token for Cloud Logging', + ' --token-env Env var containing token. Default: GOOGLE_CLOUD_ACCESS_TOKEN', + ' --use-gcloud Resolve token via `gcloud auth print-access-token`', + ' --narrow-bots Add UA filters for known AI crawlers. Misses human AI referrals.', + ' --url-contains Add request URL substring filter. 
Repeatable.', + ' --out Write JSON report to a file', + ' --json Print full JSON report to stdout', + ' --help Show this help', + ].join('\n') +} + +function parseArgs(argv: string[]): Args { + const args: Args = { + since: '1h', + pageSize: 1000, + maxPages: 1, + tokenEnv: 'GOOGLE_CLOUD_ACCESS_TOKEN', + useGcloud: false, + narrowBots: false, + urlContains: [], + json: false, + help: false, + } + + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i] + const next = () => { + const value = argv[i + 1] + if (!value || value.startsWith('--')) throw new Error(`Missing value for ${arg}`) + i += 1 + return value + } + + switch (arg) { + case '--gcp-project': + args.gcpProjectId = next() + break + case '--service': + args.serviceName = next() + break + case '--location': + args.location = next() + break + case '--since': + args.since = next() + break + case '--until': + args.until = next() + break + case '--page-size': + args.pageSize = parsePositiveInt('--page-size', next()) + break + case '--max-pages': + args.maxPages = parsePositiveInt('--max-pages', next()) + break + case '--access-token': + args.accessToken = next() + break + case '--token-env': + args.tokenEnv = next() + break + case '--use-gcloud': + args.useGcloud = true + break + case '--narrow-bots': + args.narrowBots = true + break + case '--url-contains': + args.urlContains.push(next()) + break + case '--fixture': + args.fixture = next() + break + case '--out': + args.out = next() + break + case '--json': + args.json = true + break + case '--help': + case '-h': + args.help = true + break + default: + throw new Error(`Unknown argument: ${arg}`) + } + } + + return args +} + +function parsePositiveInt(name: string, raw: string): number { + const value = Number(raw) + if (!Number.isInteger(value) || value < 1) { + throw new Error(`${name} must be a positive integer`) + } + return value +} + +function resolveWindow(since: string, until: string | undefined): { startTime: string; endTime: string } { + const end = until ? new Date(until) : new Date() + if (Number.isNaN(end.getTime())) throw new Error(`Invalid --until timestamp: ${until}`) + + const durationMatch = /^(\d+)(m|h|d)$/.exec(since.trim()) + if (durationMatch) { + const amount = Number(durationMatch[1]) + const unit = durationMatch[2] + const multiplier = unit === 'm' ? 60_000 : unit === 'h' ? 3_600_000 : 86_400_000 + return { + startTime: new Date(end.getTime() - amount * multiplier).toISOString(), + endTime: end.toISOString(), + } + } + + const start = new Date(since) + if (Number.isNaN(start.getTime())) { + throw new Error(`Invalid --since value: ${since}`) + } + if (start.getTime() >= end.getTime()) { + throw new Error('--since must be before --until') + } + return { startTime: start.toISOString(), endTime: end.toISOString() } +} + +async function resolveAccessToken(args: Args): Promise { + if (args.accessToken?.trim()) return args.accessToken.trim() + + const envToken = process.env[args.tokenEnv]?.trim() + if (envToken) return envToken + + if (args.useGcloud) { + const { stdout } = await execFileAsync('gcloud', ['auth', 'print-access-token']) + const token = stdout.trim() + if (token) return token + } + + throw new Error( + `No Cloud Logging access token found. 
Set ${args.tokenEnv}, pass --access-token, or use --use-gcloud.`, + ) +} + +function isLogEntryArray(value: unknown): value is CloudRunLogEntry[] { + return Array.isArray(value) +} + +function readEntriesFromFixture(value: unknown): CloudRunLogEntry[] { + if (isLogEntryArray(value)) return value + if (value && typeof value === 'object') { + const object = value as { + entries?: unknown + responses?: Array<{ entries?: unknown }> + } + if (isLogEntryArray(object.entries)) return object.entries + if (Array.isArray(object.responses)) { + return object.responses.flatMap((response) => ( + isLogEntryArray(response.entries) ? response.entries : [] + )) + } + } + throw new Error('Fixture must be an array of LogEntry objects, { entries }, or { responses: [{ entries }] }') +} + +async function pullFromFixture(fixturePath: string): Promise { + const raw = await fs.readFile(fixturePath, 'utf-8') + const entries = readEntriesFromFixture(JSON.parse(raw) as unknown) + const events: PullResult['events'] = [] + let skippedEntryCount = 0 + + for (const entry of entries) { + const event = normalizeCloudRunLogEntry(entry) + if (event) events.push(event) + else skippedEntryCount += 1 + } + + return { + events, + rawEntryCount: entries.length, + skippedEntryCount, + filter: `fixture:${fixturePath}`, + } +} + +async function pullFromCloudLogging(args: Args, startTime: string, endTime: string): Promise { + if (!args.gcpProjectId) throw new Error('--gcp-project is required unless --fixture is used') + const accessToken = await resolveAccessToken(args) + return listCloudRunTrafficEvents(accessToken, { + gcpProjectId: args.gcpProjectId, + serviceName: args.serviceName, + location: args.location, + startTime, + endTime, + pageSize: args.pageSize, + maxPages: args.maxPages, + userAgentSubstrings: args.narrowBots ? DEFAULT_AI_CRAWLER_USER_AGENT_SUBSTRINGS : undefined, + requestUrlSubstrings: args.urlContains.length > 0 ? args.urlContains : undefined, + }) +} + +function printSummary(output: { + probe: { + rawEntryCount: number + normalizedEventCount: number + skippedEntryCount: number + nextPageToken?: string + filter: string + outputPath?: string + } + report: ReturnType +}): void { + const { probe, report } = output + console.log('Cloud Run traffic probe') + console.log(`Raw entries: ${probe.rawEntryCount}`) + console.log(`Normalized events: ${probe.normalizedEventCount}`) + console.log(`Skipped entries: ${probe.skippedEntryCount}`) + console.log(`Crawler hits: ${report.totals.crawlerHits}`) + console.log(`Explicit AI referral hits: ${report.totals.aiReferralHits}`) + console.log(`Unknown hits: ${report.totals.unknownHits}`) + console.log(`Filter: ${probe.filter}`) + if (probe.nextPageToken) console.log(`Next page token: ${probe.nextPageToken}`) + if (probe.outputPath) console.log(`Wrote report: ${probe.outputPath}`) + + if (report.topBots.length > 0) { + console.log('\nTop crawler bots:') + for (const row of report.topBots) { + console.log(` ${row.botId} (${row.operator}): ${row.hits}`) + } + } + + if (report.topAiReferrers.length > 0) { + console.log('\nTop AI referrers:') + for (const row of report.topAiReferrers) { + console.log(` ${row.sourceDomain} (${row.product}): ${row.hits}`) + } + } +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)) + if (args.help) { + console.log(usage()) + return + } + + const { startTime, endTime } = resolveWindow(args.since, args.until) + const pull = args.fixture + ? 
await pullFromFixture(args.fixture) + : await pullFromCloudLogging(args, startTime, endTime) + + const events = pull.events.filter((event): event is NonNullable => event !== null) + const report = buildTrafficProbeReport(events) + const outputPath = args.out ? path.resolve(args.out) : undefined + const output = { + probe: { + source: args.fixture ? 'fixture' : 'cloud-run', + gcpProjectId: args.gcpProjectId ?? null, + serviceName: args.serviceName ?? null, + location: args.location ?? null, + startTime, + endTime, + pageSize: args.pageSize, + maxPages: args.maxPages, + narrowBots: args.narrowBots, + urlContains: args.urlContains, + rawEntryCount: pull.rawEntryCount, + normalizedEventCount: events.length, + skippedEntryCount: pull.skippedEntryCount, + nextPageToken: pull.nextPageToken, + filter: pull.filter, + outputPath, + }, + report, + } + + if (outputPath) { + await fs.mkdir(path.dirname(outputPath), { recursive: true }) + await fs.writeFile(outputPath, `${JSON.stringify(output, null, 2)}\n`) + } + + if (args.json) { + console.log(JSON.stringify(output, null, 2)) + } else { + printSummary(output) + } +} + +main().catch((error: unknown) => { + console.error(error instanceof Error ? error.message : String(error)) + process.exitCode = 1 +}) diff --git a/scripts/test-cloud-run-vs-ga-ai-traffic.ts b/scripts/test-cloud-run-vs-ga-ai-traffic.ts new file mode 100644 index 00000000..1b4a6fe7 --- /dev/null +++ b/scripts/test-cloud-run-vs-ga-ai-traffic.ts @@ -0,0 +1,726 @@ +#!/usr/bin/env tsx +/** + * Pull Cloud Run request logs and GA4 AI referrals over the same window, + * correlate per AI source and per landing path, and surface the gap. + * + * GA4 sees AI clicks that survive as a referrer or UTM tag; Cloud Run sees + * the raw request including AI crawler bots that GA never records. The + * script makes the asymmetry inspectable end-to-end before the persistence + * + public surface layer lands. 
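+ *
+ * Fixture-only dry run (no GCP or GA4 credentials required; same flags as
+ * the usage() text below):
+ *
+ *   pnpm tsx scripts/test-cloud-run-vs-ga-ai-traffic.ts \
+ *     --cloud-run-fixture scripts/fixtures/cloud-run-traffic-sample.json \
+ *     --ga-fixture scripts/fixtures/ga4-ai-referrals-sample.json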
+ */
+import { execFile } from 'node:child_process'
+import fs from 'node:fs/promises'
+import path from 'node:path'
+import { promisify } from 'node:util'
+import {
+  listCloudRunTrafficEvents,
+  normalizeCloudRunLogEntry,
+  type CloudRunLogEntry,
+} from '../packages/integration-cloud-run/src/index.js'
+import {
+  DEFAULT_AI_CRAWLER_USER_AGENT_SUBSTRINGS,
+  DEFAULT_AI_REFERRER_RULES,
+  buildTrafficProbeReport,
+  classifyAiReferral,
+  classifyCrawler,
+  type AiReferrerRule,
+} from '../packages/integration-traffic/src/index.js'
+import {
+  getAccessToken as getGa4AccessToken,
+  fetchAiReferrals,
+  type GA4AiReferralRow,
+} from '../packages/integration-google-analytics/src/index.js'
+import { loadConfig } from '../packages/canonry/src/config.js'
+import { getGa4Connection } from '../packages/canonry/src/ga4-config.js'
+
+const execFileAsync = promisify(execFile)
+
+interface Args {
+  // Window
+  since: string
+  until?: string
+  // Cloud Run
+  gcpProjectId?: string
+  serviceName?: string
+  location?: string
+  pageSize: number
+  maxPages: number
+  cloudRunAccessToken?: string
+  cloudRunTokenEnv: string
+  useGcloud: boolean
+  narrowBots: boolean
+  urlContains: string[]
+  cloudRunFixture?: string
+  // GA4
+  canonryProject?: string
+  gaPropertyId?: string
+  gaKeyFile?: string
+  gaFixture?: string
+  // Output
+  out?: string
+  json: boolean
+  help: boolean
+}
+
+interface CloudRunPullResult {
+  events: NonNullable<ReturnType<typeof normalizeCloudRunLogEntry>>[]
+  rawEntryCount: number
+  skippedEntryCount: number
+  nextPageToken?: string
+  filter: string
+}
+
+interface AiSourceComparisonRow {
+  domain: string
+  operator: string
+  product: string
+  cloudRunHits: number
+  cloudRunHitsByEvidence: { referer: number; utm: number }
+  gaSessions: number
+  delta: number
+  verdict:
+    | 'agree'
+    | 'cloud-run-higher'
+    | 'ga-higher'
+    | 'cloud-run-only'
+    | 'ga-only'
+    | 'neither'
+}
+
+interface PathRow {
+  path: string
+  cloudRunTotalHits: number
+  cloudRunCrawlerHits: number
+  topCrawler: string | null
+  cloudRunReferralHits: number
+  topCrawlerReferer: string | null
+  gaAiSessions: number
+  topGaSource: string | null
+  verdict: 'crawled+clicked' | 'crawled-only' | 'clicked-only' | 'referred-only'
+}
+
+interface CorrelationOutput {
+  window: {
+    cloudRun: { startTime: string; endTime: string }
+    ga: { days: number; note: string }
+  }
+  cloudRun: {
+    source: 'cloud-run' | 'fixture'
+    rawEntryCount: number
+    normalizedEventCount: number
+    skippedEntryCount: number
+    nextPageToken?: string
+    filter: string
+    crawlerHits: number
+    aiReferralHits: number
+    unknownHits: number
+  }
+  ga: {
+    source: 'ga4' | 'fixture'
+    rowsFetched: number
+    sessionsTotal: number
+  }
+  aiSourceComparison: AiSourceComparisonRow[]
+  pathJoin: PathRow[]
+  topCrawlerBots: Array<{ botId: string; operator: string; hits: number }>
+  topCrawlerPaths: Array<{ pathNormalized: string; hits: number }>
+}
+
+function usage(): string {
+  return [
+    'Usage:',
+    '  pnpm tsx scripts/test-cloud-run-vs-ga-ai-traffic.ts \\',
+    '    --gcp-project <project-id> --use-gcloud \\',
+    '    --canonry-project <project> \\',
+    '    --since 24h',
+    '',
+    '  pnpm tsx scripts/test-cloud-run-vs-ga-ai-traffic.ts \\',
+    '    --cloud-run-fixture scripts/fixtures/cloud-run-traffic-sample.json \\',
+    '    --ga-fixture scripts/fixtures/ga4-ai-referrals-sample.json',
+    '',
+    'Window:',
+    '  --since <time|duration>      Window start. Durations: 30m, 6h, 7d. Default: 24h',
+    '  --until <time>               Window end. Default: now',
+    '',
+    'Cloud Run:',
+    '  --gcp-project <project-id>   GCP project (required for live)',
+    '  --service <service>          Cloud Run service name (optional narrow filter)',
+    '  --location <region>          Cloud Run region (optional narrow filter)',
+    '  --page-size <n>              Cloud Logging page size. Default: 1000',
+    '  --max-pages <n>              Max pages to pull. Default: 1',
+    '  --access-token <token>       Bearer token for Cloud Logging',
+    '  --token-env <name>           Env var with token. Default: GOOGLE_CLOUD_ACCESS_TOKEN',
+    '  --use-gcloud                 Resolve token via `gcloud auth print-access-token`',
+    '  --narrow-bots                Cloud Logging UA filter to known AI crawlers (misses human AI referrals)',
+    '  --url-contains <substring>   Cloud Logging request-URL substring filter. Repeatable.',
+    '  --cloud-run-fixture <path>   Read Cloud Logging entries from a JSON fixture',
+    '',
+    'GA4:',
+    '  --canonry-project <project>  Look up GA4 service-account credentials in ~/.canonry/config.yaml',
+    '  --ga-property <property-id>  GA4 property ID (used with --ga-key-file)',
+    '  --ga-key-file <path>         GA4 service-account JSON key file',
+    '  --ga-fixture <path>          Read GA4 AI-referral rows from a JSON fixture',
+    '',
+    'Output:',
+    '  --out <path>                 Write full JSON report to a file',
+    '  --json                       Print full JSON report to stdout',
+    '  --help                       Show this help',
+  ].join('\n')
+}
+
+function parseArgs(argv: string[]): Args {
+  const args: Args = {
+    since: '24h',
+    pageSize: 1000,
+    maxPages: 1,
+    cloudRunTokenEnv: 'GOOGLE_CLOUD_ACCESS_TOKEN',
+    useGcloud: false,
+    narrowBots: false,
+    urlContains: [],
+    json: false,
+    help: false,
+  }
+
+  for (let i = 0; i < argv.length; i += 1) {
+    const arg = argv[i]
+    const next = () => {
+      const value = argv[i + 1]
+      if (!value || value.startsWith('--')) throw new Error(`Missing value for ${arg}`)
+      i += 1
+      return value
+    }
+    switch (arg) {
+      case '--since': args.since = next(); break
+      case '--until': args.until = next(); break
+      case '--gcp-project': args.gcpProjectId = next(); break
+      case '--service': args.serviceName = next(); break
+      case '--location': args.location = next(); break
+      case '--page-size': args.pageSize = parsePositiveInt('--page-size', next()); break
+      case '--max-pages': args.maxPages = parsePositiveInt('--max-pages', next()); break
+      case '--access-token': args.cloudRunAccessToken = next(); break
+      case '--token-env': args.cloudRunTokenEnv = next(); break
+      case '--use-gcloud': args.useGcloud = true; break
+      case '--narrow-bots': args.narrowBots = true; break
+      case '--url-contains': args.urlContains.push(next()); break
+      case '--cloud-run-fixture': args.cloudRunFixture = next(); break
+      case '--canonry-project': args.canonryProject = next(); break
+      case '--ga-property': args.gaPropertyId = next(); break
+      case '--ga-key-file': args.gaKeyFile = next(); break
+      case '--ga-fixture': args.gaFixture = next(); break
+      case '--out': args.out = next(); break
+      case '--json': args.json = true; break
+      case '--help':
+      case '-h': args.help = true; break
+      default: throw new Error(`Unknown argument: ${arg}`)
+    }
+  }
+  return args
+}
+
+function parsePositiveInt(name: string, raw: string): number {
+  const value = Number(raw)
+  if (!Number.isInteger(value) || value < 1) {
+    throw new Error(`${name} must be a positive integer`)
+  }
+  return value
+}
+
+function resolveWindow(since: string, until: string | undefined): {
+  startTime: string
+  endTime: string
+  hours: number
+} {
+  const end = until ? new Date(until) : new Date()
+  if (Number.isNaN(end.getTime())) throw new Error(`Invalid --until timestamp: ${until}`)
+
+  const durationMatch = /^(\d+)(m|h|d)$/.exec(since.trim())
+  let startMs: number
+  if (durationMatch) {
+    const amount = Number(durationMatch[1])
+    const unit = durationMatch[2]
+    const multiplier = unit === 'm' ? 60_000 : unit === 'h' ? 3_600_000 : 86_400_000
+    startMs = end.getTime() - amount * multiplier
+  } else {
+    const start = new Date(since)
+    if (Number.isNaN(start.getTime())) throw new Error(`Invalid --since value: ${since}`)
+    if (start.getTime() >= end.getTime()) throw new Error('--since must be before --until')
+    startMs = start.getTime()
+  }
+
+  const hours = (end.getTime() - startMs) / 3_600_000
+  return {
+    startTime: new Date(startMs).toISOString(),
+    endTime: end.toISOString(),
+    hours,
+  }
+}
+
+async function resolveCloudRunToken(args: Args): Promise<string> {
+  if (args.cloudRunAccessToken?.trim()) return args.cloudRunAccessToken.trim()
+  const envToken = process.env[args.cloudRunTokenEnv]?.trim()
+  if (envToken) return envToken
+  if (args.useGcloud) {
+    const { stdout } = await execFileAsync('gcloud', ['auth', 'print-access-token'])
+    const token = stdout.trim()
+    if (token) return token
+  }
+  throw new Error(
+    `No Cloud Logging access token found. Set ${args.cloudRunTokenEnv}, pass --access-token, or use --use-gcloud.`,
+  )
+}
+
+function isLogEntryArray(value: unknown): value is CloudRunLogEntry[] {
+  return Array.isArray(value)
+}
+
+function readEntriesFromFixture(value: unknown): CloudRunLogEntry[] {
+  if (isLogEntryArray(value)) return value
+  if (value && typeof value === 'object') {
+    const object = value as { entries?: unknown; responses?: Array<{ entries?: unknown }> }
+    if (isLogEntryArray(object.entries)) return object.entries
+    if (Array.isArray(object.responses)) {
+      return object.responses.flatMap((response) =>
+        isLogEntryArray(response.entries) ? response.entries : [],
+      )
+    }
+  }
+  throw new Error('Cloud Run fixture must be LogEntry[], { entries }, or { responses: [{ entries }] }')
+}
+
+async function pullCloudRun(args: Args, startTime: string, endTime: string): Promise<CloudRunPullResult> {
+  if (args.cloudRunFixture) {
+    const raw = await fs.readFile(args.cloudRunFixture, 'utf-8')
+    const entries = readEntriesFromFixture(JSON.parse(raw) as unknown)
+    const events: CloudRunPullResult['events'] = []
+    let skippedEntryCount = 0
+    for (const entry of entries) {
+      const event = normalizeCloudRunLogEntry(entry)
+      if (event) events.push(event)
+      else skippedEntryCount += 1
+    }
+    return {
+      events,
+      rawEntryCount: entries.length,
+      skippedEntryCount,
+      filter: `fixture:${args.cloudRunFixture}`,
+    }
+  }
+
+  if (!args.gcpProjectId) {
+    throw new Error('--gcp-project is required unless --cloud-run-fixture is used')
+  }
+  const accessToken = await resolveCloudRunToken(args)
+  const page = await listCloudRunTrafficEvents(accessToken, {
+    gcpProjectId: args.gcpProjectId,
+    serviceName: args.serviceName,
+    location: args.location,
+    startTime,
+    endTime,
+    pageSize: args.pageSize,
+    maxPages: args.maxPages,
+    userAgentSubstrings: args.narrowBots ? DEFAULT_AI_CRAWLER_USER_AGENT_SUBSTRINGS : undefined,
+    requestUrlSubstrings: args.urlContains.length > 0 ? args.urlContains : undefined,
+  })
+  return {
+    events: page.events.filter((event): event is NonNullable<(typeof page.events)[number]> => event !== null),
+    rawEntryCount: page.rawEntryCount,
+    skippedEntryCount: page.skippedEntryCount,
+    nextPageToken: page.nextPageToken,
+    filter: page.filter,
+  }
+}
+
+function isAiReferralRowArray(value: unknown): value is GA4AiReferralRow[] {
+  return Array.isArray(value)
+}
+
+function readGaRowsFromFixture(value: unknown): GA4AiReferralRow[] {
+  if (isAiReferralRowArray(value)) return value
+  if (value && typeof value === 'object') {
+    const object = value as { rows?: unknown }
+    if (isAiReferralRowArray(object.rows)) return object.rows
+  }
+  throw new Error('GA fixture must be GA4AiReferralRow[] or { rows: GA4AiReferralRow[] }')
+}
+
+async function pullGa(
+  args: Args,
+  windowDays: number,
+): Promise<{ rows: GA4AiReferralRow[]; source: 'ga4' | 'fixture' }> {
+  if (args.gaFixture) {
+    const raw = await fs.readFile(args.gaFixture, 'utf-8')
+    return { rows: readGaRowsFromFixture(JSON.parse(raw) as unknown), source: 'fixture' }
+  }
+
+  let propertyId: string
+  let clientEmail: string
+  let privateKey: string
+
+  if (args.canonryProject) {
+    const config = loadConfig()
+    const connection = getGa4Connection(config, args.canonryProject)
+    if (!connection) {
+      throw new Error(
+        `No GA4 connection found for canonry project "${args.canonryProject}". ` +
+          'Run `canonry ga connect --property-id <property-id> --service-account-key <key-file>` first.',
+      )
+    }
+    propertyId = connection.propertyId
+    clientEmail = connection.clientEmail
+    privateKey = connection.privateKey
+  } else {
+    if (!args.gaPropertyId || !args.gaKeyFile) {
+      throw new Error(
+        'GA4 source missing: pass --canonry-project <project>, or --ga-property <property-id> + --ga-key-file <path>, or --ga-fixture <path>.',
+      )
+    }
+    const keyRaw = await fs.readFile(args.gaKeyFile, 'utf-8')
+    const parsed = JSON.parse(keyRaw) as { client_email?: string; private_key?: string }
+    if (!parsed.client_email || !parsed.private_key) {
+      throw new Error(`--ga-key-file ${args.gaKeyFile} is missing client_email or private_key`)
+    }
+    propertyId = args.gaPropertyId
+    clientEmail = parsed.client_email
+    privateKey = parsed.private_key
+  }
+
+  const accessToken = await getGa4AccessToken(clientEmail, privateKey)
+  const rows = await fetchAiReferrals(accessToken, propertyId, windowDays)
+  return { rows, source: 'ga4' }
+}
+
+/**
+ * GA4 fans the same session out across three attribution dimensions
+ * (`session`, `first_user`, `manual_utm`). Mirror the dedupe in
+ * `packages/api-routes/src/ga.ts`: take MAX sessions per
+ * (date, source, medium, landingPage) so a 14-session row in `session`
+ * and a 14-session row in `first_user` count as 14, not 28.
+ */
+function dedupeGaRows(rows: GA4AiReferralRow[]): GA4AiReferralRow[] {
+  const max = new Map<string, GA4AiReferralRow>()
+  for (const row of rows) {
+    const key = `${row.date}|${row.source}|${row.medium}|${row.landingPage}`
+    const existing = max.get(key)
+    if (!existing || row.sessions > existing.sessions) max.set(key, row)
+  }
+  return [...max.values()]
+}
+
+function normalizePath(rawPath: string): string {
+  if (!rawPath) return '/'
+  try {
+    const withBase = rawPath.startsWith('http') ? new URL(rawPath) : new URL(rawPath, 'http://x')
+    const trimmed = withBase.pathname.replace(/\/+$/, '')
+    return trimmed.length === 0 ? '/' : trimmed
+  } catch {
+    const stripped = rawPath.split('?')[0].split('#')[0]
+    if (!stripped) return '/'
+    const trimmed = stripped.replace(/\/+$/, '')
+    return trimmed.length === 0 ? '/' : trimmed
+  }
+}
+
+function gaSourceMatchesRule(gaSource: string, rule: AiReferrerRule): boolean {
+  const lower = gaSource.toLowerCase()
+  const domain = rule.domain.toLowerCase()
+  if (lower === domain) return true
+  // GA4 normalizes some sources to bare brand strings. Match the brand token
+  // (text before the first dot) so e.g. `chatgpt` ≈ `chatgpt.com`.
+  const brand = domain.split('.')[0]
+  if (brand && lower.includes(brand)) return true
+  // Match domain substring for hosts like `chat.openai.com` referenced under `openai`.
+  if (lower.includes(domain)) return true
+  return false
+}
+
+function buildAiSourceComparison(
+  cloudRunEvents: CloudRunPullResult['events'],
+  gaRows: GA4AiReferralRow[],
+): AiSourceComparisonRow[] {
+  const dedupedGa = dedupeGaRows(gaRows)
+
+  // Group rules by product so we don't double-count when multiple rules
+  // (e.g. chatgpt.com + chat.openai.com) point at the same AI surface.
+  const rulesByProduct = new Map<string, AiReferrerRule[]>()
+  for (const rule of DEFAULT_AI_REFERRER_RULES) {
+    const list = rulesByProduct.get(rule.product) ?? []
+    list.push(rule)
+    rulesByProduct.set(rule.product, list)
+  }
+
+  // Use the shipped classifier so referer + UTM evidence both count.
+  const classifications = cloudRunEvents.map((event) => classifyAiReferral(event))
+
+  const rows: AiSourceComparisonRow[] = []
+  for (const [product, rules] of rulesByProduct) {
+    let referer = 0
+    let utm = 0
+    for (const ai of classifications) {
+      if (!ai || ai.product !== product) continue
+      if (ai.evidenceType === 'referer') referer += 1
+      else if (ai.evidenceType === 'utm') utm += 1
+    }
+    const cloudRunHits = referer + utm
+
+    const gaSessions = dedupedGa
+      .filter((row) => rules.some((rule) => gaSourceMatchesRule(row.source, rule)))
+      .reduce((sum, row) => sum + row.sessions, 0)
+
+    const delta = cloudRunHits - gaSessions
+    let verdict: AiSourceComparisonRow['verdict']
+    if (cloudRunHits === 0 && gaSessions === 0) verdict = 'neither'
+    else if (cloudRunHits === 0) verdict = 'ga-only'
+    else if (gaSessions === 0) verdict = 'cloud-run-only'
+    else if (delta === 0) verdict = 'agree'
+    else verdict = delta > 0 ? 'cloud-run-higher' : 'ga-higher'
+
+    rows.push({
+      domain: rules[0].domain,
+      operator: rules[0].operator,
+      product,
+      cloudRunHits,
+      cloudRunHitsByEvidence: { referer, utm },
+      gaSessions,
+      delta,
+      verdict,
+    })
+  }
+  return rows
+}
+
+function buildPathJoin(
+  cloudRunEvents: CloudRunPullResult['events'],
+  gaRows: GA4AiReferralRow[],
+): PathRow[] {
+  const dedupedGa = dedupeGaRows(gaRows)
+
+  type PathAggregate = {
+    cloudRunTotalHits: number
+    cloudRunCrawlerHits: number
+    crawlerCounts: Map<string, number>
+    cloudRunReferralHits: number
+    refererCounts: Map<string, number>
+    gaAiSessions: number
+    gaSourceCounts: Map<string, number>
+  }
+  const byPath = new Map<string, PathAggregate>()
+  const ensure = (key: string): PathAggregate => {
+    let row = byPath.get(key)
+    if (!row) {
+      row = {
+        cloudRunTotalHits: 0,
+        cloudRunCrawlerHits: 0,
+        crawlerCounts: new Map(),
+        cloudRunReferralHits: 0,
+        refererCounts: new Map(),
+        gaAiSessions: 0,
+        gaSourceCounts: new Map(),
+      }
+      byPath.set(key, row)
+    }
+    return row
+  }
+
+  for (const event of cloudRunEvents) {
+    const key = normalizePath(event.path)
+    const row = ensure(key)
+    row.cloudRunTotalHits += 1
+
+    const crawler = classifyCrawler(event)
+    if (crawler) {
+      row.cloudRunCrawlerHits += 1
+      row.crawlerCounts.set(crawler.product, (row.crawlerCounts.get(crawler.product) ?? 0) + 1)
+    }
+
+    const ai = classifyAiReferral(event)
+    if (ai) {
+      row.cloudRunReferralHits += 1
+      const label = `${ai.product} (${ai.evidenceType})`
+      row.refererCounts.set(label, (row.refererCounts.get(label) ?? 0) + 1)
+    }
+  }
+
+  for (const row of dedupedGa) {
+    const key = normalizePath(row.landingPage)
+    const aggregate = ensure(key)
+    aggregate.gaAiSessions += row.sessions
+    aggregate.gaSourceCounts.set(
+      row.source,
+      (aggregate.gaSourceCounts.get(row.source) ?? 0) + row.sessions,
+    )
+  }
+
+  const top = (counts: Map<string, number>): string | null => {
+    let best: { key: string; value: number } | null = null
+    for (const [key, value] of counts) {
+      if (!best || value > best.value) best = { key, value }
+    }
+    return best ? `${best.key} (${best.value})` : null
+  }
+
+  const verdictFor = (row: PathAggregate): PathRow['verdict'] => {
+    const crawled = row.cloudRunCrawlerHits > 0
+    const clicked = row.gaAiSessions > 0
+    const referred = row.cloudRunReferralHits > 0
+    if (crawled && clicked) return 'crawled+clicked'
+    if (crawled) return 'crawled-only'
+    if (clicked) return 'clicked-only'
+    return referred ? 'referred-only' : 'crawled-only'
+  }
+
+  return [...byPath.entries()]
+    .map(([pathKey, aggregate]) => ({
+      path: pathKey,
+      cloudRunTotalHits: aggregate.cloudRunTotalHits,
+      cloudRunCrawlerHits: aggregate.cloudRunCrawlerHits,
+      topCrawler: top(aggregate.crawlerCounts),
+      cloudRunReferralHits: aggregate.cloudRunReferralHits,
+      topCrawlerReferer: top(aggregate.refererCounts),
+      gaAiSessions: aggregate.gaAiSessions,
+      topGaSource: top(aggregate.gaSourceCounts),
+      verdict: verdictFor(aggregate),
+    }))
+    .filter(
+      (row) =>
+        row.cloudRunCrawlerHits > 0 ||
+        row.cloudRunReferralHits > 0 ||
+        row.gaAiSessions > 0,
+    )
+    .sort((a, b) => {
+      const aScore = a.cloudRunCrawlerHits + a.cloudRunReferralHits + a.gaAiSessions
+      const bScore = b.cloudRunCrawlerHits + b.cloudRunReferralHits + b.gaAiSessions
+      return bScore - aScore
+    })
+}
+
+function printSummary(output: CorrelationOutput): void {
+  console.log('Cloud Run × GA4 AI traffic correlation')
+  console.log(`Cloud Run window: ${output.window.cloudRun.startTime} → ${output.window.cloudRun.endTime}`)
+  console.log(`GA4 window: last ${output.window.ga.days}d (${output.window.ga.note})`)
+  console.log('')
+  console.log(
+    `Cloud Run: source=${output.cloudRun.source} raw=${output.cloudRun.rawEntryCount} ` +
+      `events=${output.cloudRun.normalizedEventCount} crawlers=${output.cloudRun.crawlerHits} ` +
+      `referrals=${output.cloudRun.aiReferralHits} unknown=${output.cloudRun.unknownHits}`,
+  )
+  console.log(
+    `GA4: source=${output.ga.source} rows=${output.ga.rowsFetched} ` +
+      `sessions=${output.ga.sessionsTotal}`,
+  )
+
+  if (output.topCrawlerBots.length > 0) {
+    console.log('\nTop Cloud Run crawlers (no GA equivalent):')
+    for (const bot of output.topCrawlerBots) {
+      console.log(`  ${bot.botId} (${bot.operator}): ${bot.hits}`)
+    }
+  }
+
+  console.log('\nAI source comparison (Cloud Run referer+UTM vs GA sessions):')
+  console.log(
+    '  surface'.padEnd(30) +
+      'CR'.padStart(6) +
+      'ref'.padStart(6) +
+      'utm'.padStart(6) +
+      'GA'.padStart(6) +
+      'delta'.padStart(8) +
+      '  verdict',
+  )
+  for (const row of output.aiSourceComparison) {
+    if (row.verdict === 'neither') continue
+    const label = `${row.product} (${row.domain})`
+    console.log(
+      `  ${label.padEnd(28)}` +
+        `${row.cloudRunHits.toString().padStart(6)}` +
+        `${row.cloudRunHitsByEvidence.referer.toString().padStart(6)}` +
+        `${row.cloudRunHitsByEvidence.utm.toString().padStart(6)}` +
+        `${row.gaSessions.toString().padStart(6)}` +
+        `${row.delta.toString().padStart(8)}  ${row.verdict}`,
+    )
+  }
+
+  if (output.pathJoin.length > 0) {
+    console.log('\nPath-level join (top 20):')
+    for (const row of output.pathJoin.slice(0, 20)) {
+      console.log(`  ${row.path}`)
+      console.log(
+        `    crawlers=${row.cloudRunCrawlerHits}` +
+          (row.topCrawler ? ` top=${row.topCrawler}` : '') +
+          ` referrals=${row.cloudRunReferralHits}` +
+          (row.topCrawlerReferer ? ` ref=${row.topCrawlerReferer}` : '') +
+          ` ga=${row.gaAiSessions}` +
+          (row.topGaSource ? ` ga-src=${row.topGaSource}` : '') +
+          ` [${row.verdict}]`,
+      )
+    }
+  }
+}
+
+async function main(): Promise<void> {
+  const args = parseArgs(process.argv.slice(2))
+  if (args.help) {
+    console.log(usage())
+    return
+  }
+
+  const window = resolveWindow(args.since, args.until)
+  const gaDays = Math.max(1, Math.ceil(window.hours / 24))
+
+  const cloudRunPull = await pullCloudRun(args, window.startTime, window.endTime)
+  const gaPull = await pullGa(args, gaDays)
+
+  const probe = buildTrafficProbeReport(cloudRunPull.events)
+  const aiSourceComparison = buildAiSourceComparison(cloudRunPull.events, gaPull.rows)
+  const pathJoin = buildPathJoin(cloudRunPull.events, gaPull.rows)
+  const sessionsTotal = dedupeGaRows(gaPull.rows).reduce((sum, row) => sum + row.sessions, 0)
+
+  const output: CorrelationOutput = {
+    window: {
+      cloudRun: { startTime: window.startTime, endTime: window.endTime },
+      ga: {
+        days: gaDays,
+        note:
+          window.hours <= 24
+            ? 'GA4 minimum is 1 day; Cloud Run window is shorter, so GA bucket is wider'
+            : 'GA4 ceil-rounds to whole days from --since',
+      },
+    },
+    cloudRun: {
+      source: args.cloudRunFixture ? 'fixture' : 'cloud-run',
+      rawEntryCount: cloudRunPull.rawEntryCount,
+      normalizedEventCount: cloudRunPull.events.length,
+      skippedEntryCount: cloudRunPull.skippedEntryCount,
+      nextPageToken: cloudRunPull.nextPageToken,
+      filter: cloudRunPull.filter,
+      crawlerHits: probe.totals.crawlerHits,
+      aiReferralHits: probe.totals.aiReferralHits,
+      unknownHits: probe.totals.unknownHits,
+    },
+    ga: {
+      source: gaPull.source,
+      rowsFetched: gaPull.rows.length,
+      sessionsTotal,
+    },
+    aiSourceComparison,
+    pathJoin,
+    topCrawlerBots: probe.topBots,
+    topCrawlerPaths: probe.topCrawlerPaths,
+  }
+
+  if (args.out) {
+    const outputPath = path.resolve(args.out)
+    await fs.mkdir(path.dirname(outputPath), { recursive: true })
+    await fs.writeFile(outputPath, `${JSON.stringify(output, null, 2)}\n`)
+    if (!args.json) console.log(`Wrote report: ${outputPath}`)
+  }
+
+  if (args.json) {
+    console.log(JSON.stringify(output, null, 2))
+  } else {
+    printSummary(output)
+  }
+}
+
+main().catch((error: unknown) => {
+  console.error(error instanceof Error ? error.message : String(error))
+  process.exitCode = 1
+})