Skip to content
This repository was archived by the owner on Sep 18, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
"src/queries/revoke-domainkey.sql",
"src/queries/rotate-domainkeys.sql",
"src/queries/rum-404.sql",
"src/queries/rum-acquisition.sql",
"src/queries/rum-bounces.sql",
"src/queries/rum-bundles.sql",
"src/queries/rum-checkpoint-cwv-correlation.sql",
Expand Down
368 changes: 368 additions & 0 deletions src/queries/rum-acquisition.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,368 @@
--- description: Get page views by acquisition source for a given URL for a specified time period.
--- Authorization: none
--- Access-Control-Allow-Origin: *
--- interval: 30
--- offset: 0
--- startdate: 2020-01-01
--- enddate: 2020-12-31
--- granularity: 1
--- traffic_source: -
--- acquisition_type: -
--- url: -
--- timezone: UTC
--- domainkey: secret

-- Lars wrote: We have two distinct categories:
-- Organic vs. Paid: this is based on the utm-campagin checkpoint
-- Search vs. Social vs. Direct vs. Email vs. Display: this is based on the enter checkpoint
-- And then we can build a breakdown based on the combination.
-- I think traffic_source = search | social | display | email | direct and acquisition_type = paid | organic should be separate fields in the result

WITH daily_events AS (
SELECT
id,
hostname,
checkpoint,
source,
target,
weight,
-- date is used for event correlation
TIMESTAMP_TRUNC(time, HOUR, @timezone) AS date,
-- dategroup is used for response in the desired granularity
TIMESTAMP_TRUNC(time, DAY, @timezone) AS dategroup
FROM
helix_rum.EVENTS_V5(
@url, # url
CAST(@offset AS INT64), # offset
CAST(@interval AS INT64), # days to fetch
@startdate, # start date
@enddate, # end date
@timezone, # timezone
'all', # deviceclass
@domainkey # domain key to prevent data sharing
)
WHERE
checkpoint IN ('enter', 'utm')
AND user_agent NOT IN ('bot', 'undefined')
),

weekly_events AS (
SELECT
id,
hostname,
checkpoint,
source,
target,
weight,
TIMESTAMP_TRUNC(time, HOUR, @timezone) AS date,
TIMESTAMP_TRUNC(time, ISOWEEK, @timezone) AS dategroup
FROM
helix_rum.EVENTS_V5(
@url, # url
CAST(@offset AS INT64), # offset
CAST(@interval AS INT64), # days to fetch
@startdate, # start date
@enddate, # end date
@timezone, # timezone
'all', # deviceclass
@domainkey # domain key to prevent data sharing
)
WHERE
checkpoint IN ('enter', 'utm')
AND user_agent NOT IN ('bot', 'undefined')
),

monthly_events AS (
SELECT
id,
hostname,
checkpoint,
source,
target,
weight,
TIMESTAMP_TRUNC(time, HOUR, @timezone) AS date,
TIMESTAMP_TRUNC(time, MONTH, @timezone) AS dategroup
FROM
helix_rum.EVENTS_V5(
@url, # url
CAST(@offset AS INT64), # offset
CAST(@interval AS INT64), # days to fetch
@startdate, # start date
@enddate, # end date
@timezone, # timezone
'all', # deviceclass
@domainkey # domain key to prevent data sharing
)
WHERE
checkpoint IN ('enter', 'utm')
AND user_agent NOT IN ('bot', 'undefined')
),

events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM daily_events
WHERE CAST(@granularity AS INT64) = 1
UNION ALL
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM weekly_events
WHERE CAST(@granularity AS INT64) = 7
UNION ALL
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM monthly_events
WHERE CAST(@granularity AS INT64) = 30
),

enter_events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
weight
FROM events
WHERE
checkpoint = 'enter'
),

utm_source_events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM events
WHERE
checkpoint = 'utm'
AND source = 'utm_source'
),

utm_medium_events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM events
WHERE
checkpoint = 'utm'
AND source = 'utm_medium'
),

utm_campaign_events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM events
WHERE
checkpoint = 'utm'
AND source = 'utm_campaign'
),

utm_term_events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM events
WHERE
checkpoint = 'utm'
AND source = 'utm_term'
),

utm_content_events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
target,
weight
FROM events
WHERE
checkpoint = 'utm'
AND source = 'utm_content'
),

utm_paid_events AS (
SELECT
id,
date,
dategroup,
hostname,
checkpoint,
source,
weight
FROM utm_medium_events
-- not all utm events are paid, filter further
WHERE (
LOWER(target) LIKE '%paid%'
OR LOWER(target) LIKE 'cp%'
OR LOWER(target) LIKE 'pp%'
)
),

enter_events_with_utm AS (
SELECT
e.id,
e.dategroup,
e.hostname,
e.checkpoint,
e.source,
e.weight,
ANY_VALUE(us.target) AS utm_source,
ANY_VALUE(um.target) AS utm_medium,
ANY_VALUE(uc.target) AS utm_campaign,
ANY_VALUE(ut.target) AS utm_term,
ANY_VALUE(uc2.target) AS utm_content,
IF(COUNT(up.source) = 0, 'organic', 'paid') AS acquisition_type
FROM enter_events AS e
LEFT JOIN utm_source_events AS us ON e.id = us.id AND e.date = us.date
LEFT JOIN utm_medium_events AS um ON e.id = um.id AND e.date = um.date
LEFT JOIN utm_campaign_events AS uc ON e.id = uc.id AND e.date = uc.date
LEFT JOIN utm_term_events AS ut ON e.id = ut.id AND e.date = ut.date
LEFT JOIN utm_content_events AS uc2 ON e.id = uc2.id AND e.date = uc2.date
LEFT JOIN utm_paid_events AS up ON e.id = up.id AND e.date = up.date
GROUP BY e.id, e.dategroup, e.hostname, e.checkpoint, e.source, e.weight
),

enter_events_grouped AS (
SELECT
dategroup AS date,
hostname,
checkpoint,
source,
weight,
utm_source,
utm_medium,
utm_campaign,
utm_term,
utm_content,
acquisition_type,
COUNT(source) AS count
FROM enter_events_with_utm
GROUP BY
dategroup,
hostname,
checkpoint,
source,
weight,
utm_source,
utm_medium,
utm_campaign,
utm_term,
utm_content,
acquisition_type
),

events_channels AS (
SELECT
acquisition_type,
STRING(date, @timezone) AS date,
COALESCE(
-- segmentation is based on first match so the sequence of matching logic is important
-- display
IF(utm_medium = 'display', 'display', null),
IF(utm_source = 'dbm', 'display', null),
IF(utm_source = 'dcm', 'display', null),
IF(utm_medium = 'dfa', 'display', null),
-- search
IF(source LIKE '%google%', 'search', null),
IF(source LIKE '%duckduckgo%', 'search', null),
IF(source LIKE '%yahoo%', 'search', null),
IF(source LIKE '%bing%', 'search', null),
IF(source LIKE '%ecosia%', 'search', null),
IF(source LIKE '%baidu%', 'search', null),
IF(source LIKE '%search%', 'search', null),
IF(source LIKE '%yandex%', 'search', null),
-- social
IF(source LIKE '%facebook%', 'social', null),
IF(source LIKE '%messenger%', 'social', null),
IF(source LIKE '%reddit%', 'social', null),
IF(source LIKE '%justanswer%', 'social', null),
IF(source LIKE '%pinterest%', 'social', null),
IF(source LIKE '%linkedin%', 'social', null),
IF(source LIKE '%tiktok%', 'social', null),
IF(source LIKE '%buzzfeed%', 'social', null),
IF(source LIKE '%youtube%', 'social', null),
-- email
IF(source LIKE '%mail%', 'email', null),
-- referral
-- affiliate
IF(utm_medium = 'affiliate', 'affiliate', null),
IF(utm_campaign = 'affiliate', 'affiliate', null),
IF(utm_term = 'affiliate', 'affiliate', null),
-- direct
IF(source = '', 'direct', null),
-- everything else
'unassigned'
) AS traffic_source,
SUM(weight * count) AS pageviews
FROM enter_events_grouped
GROUP BY
date,
source,
utm_source,
utm_medium,
utm_campaign,
utm_term,
acquisition_type
)

SELECT
date,
traffic_source,
acquisition_type,
SUM(pageviews) AS pageviews
FROM events_channels
WHERE
(@traffic_source = '-' OR @traffic_source = traffic_source)
AND (@acquisition_type = '-' OR @acquisition_type = acquisition_type)
GROUP BY date, traffic_source, acquisition_type
ORDER BY acquisition_type, traffic_source, date