diff --git a/package.json b/package.json
index 99cb0c5a..4081becf 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "canonry",
   "private": true,
-  "version": "4.11.0",
+  "version": "4.11.1",
   "type": "module",
   "packageManager": "pnpm@10.28.2",
   "scripts": {
diff --git a/packages/api-routes/src/ga.ts b/packages/api-routes/src/ga.ts
index c6d14a25..54ecbbe7 100644
--- a/packages/api-routes/src/ga.ts
+++ b/packages/api-routes/src/ga.ts
@@ -41,6 +41,35 @@ function formatSharePct(numerator: number, total: number): string {
   return `${rounded}%`
 }
 
+/**
+ * Collapses rows that share a tuple key down to the single row with the
+ * highest `sessions` value.
+ *
+ * GA4 returns one row per attribution dimension (session, first_user,
+ * manual_utm), but those dimensions are overlapping lenses on the same visit
+ * — summing them across dimensions would double-count.
+ *
+ * @param rows - Rows to dedupe; a missing or null `sessions` counts as 0.
+ * @param tupleKey - Builds the grouping key for a row.
+ * @returns One row per distinct key, sorted by sessions descending. On a tie
+ *   the row seen first for that key is kept.
+ */
+function pickWinningDimension<T extends { sessions?: number | null }>(
+  rows: readonly T[],
+  tupleKey: (row: T) => string,
+): T[] {
+  const winners = new Map<string, T>()
+  for (const row of rows) {
+    const key = tupleKey(row)
+    const existing = winners.get(key)
+    // Strict `>` keeps the first-seen row when two dimensions tie.
+    if (existing === undefined || (row.sessions ?? 0) > (existing.sessions ?? 0)) {
+      winners.set(key, row)
+    }
+  }
+  return [...winners.values()].sort((a, b) => (b.sessions ?? 0) - (a.sessions ?? 0))
+}
+
 export interface Ga4CredentialRecord {
   projectName: string
   propertyId: string
@@ -707,7 +736,7 @@ export async function ga4Routes(app: FastifyInstance, opts: GA4RoutesOptions) {
       .limit(limit)
       .all()
 
-    const aiReferrals = app.db
+    const aiReferralRows = app.db
       .select({
         source: gaAiReferrals.source,
         medium: gaAiReferrals.medium,
@@ -718,10 +747,9 @@ export async function ga4Routes(app: FastifyInstance, opts: GA4RoutesOptions) {
       .from(gaAiReferrals)
       .where(and(...aiConditions))
       .groupBy(gaAiReferrals.source, gaAiReferrals.medium, gaAiReferrals.sourceDimension)
-      .orderBy(sql`SUM(${gaAiReferrals.sessions}) DESC`)
       .all()
 
-    const aiReferralLandingPages = app.db
+    const aiReferralLandingPageRows = app.db
       .select({
         source: gaAiReferrals.source,
         medium: gaAiReferrals.medium,
@@ -738,9 +766,25 @@ export async function ga4Routes(app: FastifyInstance, opts: GA4RoutesOptions) {
         gaAiReferrals.sourceDimension,
         sql`COALESCE(${gaAiReferrals.landingPageNormalized}, ${gaAiReferrals.landingPage})`,
       )
-      .orderBy(sql`SUM(${gaAiReferrals.sessions}) DESC`)
       .all()
 
+    // Dedupe across attribution dimensions: 'session', 'first_user', and
+    // 'manual_utm' are overlapping lenses on the same visit, not disjoint
+    // events. Returning all three would inflate the row count (e.g. 1 source
+    // showing as 6 rows). Keep the winning dimension — the one with the
+    // highest session count — per (source, medium) for `aiReferrals` and per
+    // (source, medium, landingPage) for `aiReferralLandingPages`. The cross-
+    // cutting / session-only totals (`aiSessionsDeduped`, `aiSessionsBySession`)
+    // are computed independently below and are unaffected.
+    const aiReferrals = pickWinningDimension(
+      aiReferralRows,
+      (r) => JSON.stringify([r.source, r.medium]), // JSON tuple key: plain concatenation could merge distinct pairs
+    )
+    const aiReferralLandingPages = pickWinningDimension(
+      aiReferralLandingPageRows,
+      (r) => JSON.stringify([r.source, r.medium, r.landingPage]), // same reason: keep field boundaries unambiguous
+    )
+
     // Deduplicated AI totals: sessionSource, firstUserSource, and manualSource are
     // overlapping attribution lenses, not disjoint visits. To avoid double-counting,
     // first sum landing pages within each dimension, then take MAX(sessions) per
diff --git a/packages/api-routes/test/ga.test.ts b/packages/api-routes/test/ga.test.ts
index 96b88e4f..3a0e0d4e 100644
--- a/packages/api-routes/test/ga.test.ts
+++ b/packages/api-routes/test/ga.test.ts
@@ -1210,6 +1210,158 @@ describe('GA4 routes', () => {
     }
   })
 
+  it('GET /ga/traffic dedupes aiReferrals + aiReferralLandingPages to the winning attribution dimension', async () => {
+    // Regression for the inflated-row-count bug: GA4 emits one row per
+    // attribution dimension (session, first_user, manual_utm) and our SQL
+    // groups by all three, so the pre-fix response had three rows for what
+    // the user perceives as a single source. The fix collapses to the
+    // winning dimension per (source, medium) and per (source, medium,
+    // landingPage) so the table reflects reality.
+    const now = new Date().toISOString()
+    credentials.set('test-project', {
+      projectName: 'test-project',
+      propertyId: '999888',
+      clientEmail: 'sa@test.iam.gserviceaccount.com',
+      privateKey: 'fake-key',
+      createdAt: now,
+      updatedAt: now,
+    })
+
+    // Seed three dimensions for the same (source, medium, landingPage)
+    // tuple. session=4 wins for chatgpt.com (>2 first_user, 1 manual_utm).
+    const idChatgptSession = crypto.randomUUID()
+    const idChatgptFirst = crypto.randomUUID()
+    const idChatgptUtm = crypto.randomUUID()
+    // Plus a separate (source, medium) where first_user beats session.
+    const idClaudeSession = crypto.randomUUID()
+    const idClaudeFirst = crypto.randomUUID()
+    const seededIds = [idChatgptSession, idChatgptFirst, idChatgptUtm, idClaudeSession, idClaudeFirst]
+    const dedupDate = '2025-11-15'
+    db.insert(gaAiReferrals).values([
+      {
+        id: idChatgptSession,
+        projectId,
+        date: dedupDate,
+        source: 'dedup-chatgpt.com',
+        medium: 'referral',
+        sourceDimension: 'session',
+        landingPage: '/dedup-page',
+        sessions: 4,
+        users: 3,
+        syncedAt: now,
+      },
+      {
+        id: idChatgptFirst,
+        projectId,
+        date: dedupDate,
+        source: 'dedup-chatgpt.com',
+        medium: 'referral',
+        sourceDimension: 'first_user',
+        landingPage: '/dedup-page',
+        sessions: 2,
+        users: 2,
+        syncedAt: now,
+      },
+      {
+        id: idChatgptUtm,
+        projectId,
+        date: dedupDate,
+        source: 'dedup-chatgpt.com',
+        medium: 'referral',
+        sourceDimension: 'manual_utm',
+        landingPage: '/dedup-page',
+        sessions: 1,
+        users: 1,
+        syncedAt: now,
+      },
+      {
+        id: idClaudeSession,
+        projectId,
+        date: dedupDate,
+        source: 'dedup-claude.ai',
+        medium: 'referral',
+        sourceDimension: 'session',
+        landingPage: '/dedup-other',
+        sessions: 3,
+        users: 2,
+        syncedAt: now,
+      },
+      {
+        id: idClaudeFirst,
+        projectId,
+        date: dedupDate,
+        source: 'dedup-claude.ai',
+        medium: 'referral',
+        sourceDimension: 'first_user',
+        landingPage: '/dedup-other',
+        sessions: 9,
+        users: 7,
+        syncedAt: now,
+      },
+    ]).run()
+
+    try {
+      const res = await app.inject({
+        method: 'GET',
+        url: '/api/v1/projects/test-project/ga/traffic',
+      })
+      expect(res.statusCode).toBe(200)
+      const body = JSON.parse(res.payload)
+
+      // Three rows seeded for chatgpt.com → exactly one row in the API
+      // response, surfacing the winning dimension.
+      const chatgptRows = body.aiReferrals.filter((r: { source: string }) => r.source === 'dedup-chatgpt.com')
+      expect(chatgptRows).toHaveLength(1)
+      expect(chatgptRows[0]).toMatchObject({
+        source: 'dedup-chatgpt.com',
+        medium: 'referral',
+        sourceDimension: 'session',
+        sessions: 4,
+      })
+
+      // Two rows seeded for claude.ai → one winning-dimension row.
+      const claudeRows = body.aiReferrals.filter((r: { source: string }) => r.source === 'dedup-claude.ai')
+      expect(claudeRows).toHaveLength(1)
+      expect(claudeRows[0]).toMatchObject({
+        source: 'dedup-claude.ai',
+        medium: 'referral',
+        sourceDimension: 'first_user',
+        sessions: 9,
+      })
+
+      // Landing-page table dedupes per (source, medium, landingPage).
+      const chatgptLanding = body.aiReferralLandingPages.filter(
+        (r: { source: string; landingPage: string }) =>
+          r.source === 'dedup-chatgpt.com' && r.landingPage === '/dedup-page',
+      )
+      expect(chatgptLanding).toHaveLength(1)
+      expect(chatgptLanding[0]).toMatchObject({
+        source: 'dedup-chatgpt.com',
+        sourceDimension: 'session',
+        sessions: 4,
+      })
+      const claudeLanding = body.aiReferralLandingPages.filter(
+        (r: { source: string; landingPage: string }) =>
+          r.source === 'dedup-claude.ai' && r.landingPage === '/dedup-other',
+      )
+      expect(claudeLanding).toHaveLength(1)
+      expect(claudeLanding[0]).toMatchObject({
+        source: 'dedup-claude.ai',
+        sourceDimension: 'first_user',
+        sessions: 9,
+      })
+
+      // Output is sorted by sessions descending — the claude.ai winner (9)
+      // ranks above the chatgpt.com winner (4).
+      const dedupRows = body.aiReferrals.filter((r: { source: string }) => r.source.startsWith('dedup-'))
+      expect(dedupRows[0].source).toBe('dedup-claude.ai')
+      expect(dedupRows[1].source).toBe('dedup-chatgpt.com')
+    } finally {
+      db.delete(gaAiReferrals).where(inArray(gaAiReferrals.id, seededIds)).run()
+      credentials.delete('test-project')
+    }
+  })
+
   it('GET /ga/attribution-trend ai channel uses sessionSource only (matches breakdown cell)', async () => {
     const now = new Date().toISOString()
     const daysAgo = (n: number): string => {
diff --git a/packages/canonry/package.json b/packages/canonry/package.json
index 739cd9cf..c78e0614 100644
--- a/packages/canonry/package.json
+++ b/packages/canonry/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@ainyc/canonry",
-  "version": "4.11.0",
+  "version": "4.11.1",
   "type": "module",
   "description": "Agent-first open-source AEO operating platform - track how answer engines cite your domain",
   "license": "FSL-1.1-ALv2",
diff --git a/packages/contracts/src/ga.ts b/packages/contracts/src/ga.ts
index 345c051d..f1d9bf7f 100644
--- a/packages/contracts/src/ga.ts
+++ b/packages/contracts/src/ga.ts
@@ -29,6 +29,13 @@ export const ga4AiReferralDtoSchema = z.object({
   medium: z.string(),
   sessions: z.number(),
   users: z.number(),
+  /**
+   * The winning attribution dimension for this (source, medium) tuple — the
+   * one with the highest session count. GA4 emits one row per dimension
+   * (session, first_user, manual_utm), but they're overlapping lenses on the
+   * same visit; only the dominant dimension is surfaced here so the table is
+   * not inflated.
+   */
   sourceDimension: ga4SourceDimensionSchema,
 })
 export type GA4AiReferralDto = z.infer<typeof ga4AiReferralDtoSchema>
@@ -36,6 +43,10 @@ export type GA4AiReferralDto = z.infer<typeof ga4AiReferralDtoSchema>
 export const ga4AiReferralLandingPageDtoSchema = z.object({
   source: z.string(),
   medium: z.string(),
+  /**
+   * The winning attribution dimension for this (source, medium, landingPage)
+   * tuple — the one with the highest session count.
+   */
   sourceDimension: ga4SourceDimensionSchema,
   landingPage: z.string(),
   sessions: z.number(),
@@ -179,7 +190,9 @@ export interface GaTrafficResponse {
   totalDirectSessions: number
   totalUsers: number
   topPages: Array<{ landingPage: string; sessions: number; organicSessions: number; directSessions: number; users: number }>
+  /** Deduped to the winning attribution dimension (highest sessions) per (source, medium); sorted by sessions descending. */
   aiReferrals: Array<{ source: string; medium: string; sessions: number; users: number; sourceDimension: GA4SourceDimension }>
+  /** Deduped to the winning attribution dimension (highest sessions) per (source, medium, landingPage); sorted by sessions descending. */
   aiReferralLandingPages: Array<{ source: string; medium: string; sourceDimension: GA4SourceDimension; landingPage: string; sessions: number; users: number }>
   /** Deduped AI session total: MAX(sessions) per date+source+medium across attribution dimensions, then summed. Cross-cutting: can overlap with Direct/Organic/Social via firstUserSource. */
   aiSessionsDeduped: number