Skip to content

Commit 94718f9

Browse files
Stable all.requests (#5)
* pages and legacy lighthouse * fix * first test * pages insert * date var * js rewrite * dataset * Initial commit * init * core_web_vitals * clean graph, tested * publish core_web_vitals.technologies * Dev (#1) * workspace init * pages and legacy lighthouse * fix * first test * pages insert * date var * js rewrite * dataset * core_web_vitals * clean graph, tested * publish core_web_vitals.technologies * technologies partitioning * past month date for cwv * 8pm * package-lock.json * ignore full-refresh * readme * updated tags and example assert * dependency assertions * current month commented * assert fix * all tables publish * incremental tables * node script * enable legacy * missing package name * table configs * all.requests and all.parsed_css * dev sampling vars * sampling instead of rank * readme upd * dev hints * dev sampling for tech report * tech report workflow * removed sampling * dates flexibility * fix * formatting * other legacy tables * docs and dependencies * comment * Update definitions/output/pages.js Co-authored-by: Barry Pollard <[email protected]> * Update definitions/output/technologies.js Co-authored-by: Barry Pollard <[email protected]> * Update package.json Co-authored-by: Barry Pollard <[email protected]> * Update workflow_settings.yaml Co-authored-by: Barry Pollard <[email protected]> * format * not dependent on all.pages * migrated to function trigger * cloud function * readme update * deployed function * readme updates * readme update * init stable copies * requests ready * adjusted requests pipeline * use release configs in prod * readme update * tags update * dev sampling * prune summary * sorted * false when target exists * dev sampling * newline * trigger cleanup * formatting * forEach iteration * create table with operate * new test tables script * tested * merge * JSON columns * job per client * native object pruning * Update definitions/output/all/reprocess_requests.js Co-authored-by: Barry Pollard <[email protected]> --------- 
Co-authored-by: Barry Pollard <[email protected]>
1 parent 6640ffe commit 94718f9

File tree

6 files changed

+169
-49
lines changed

6 files changed

+169
-49
lines changed

README.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# HTTP Archive BigQuery pipeline with Dataform
22

3-
## Tables
3+
This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery.
4+
5+
## Pipelines
6+
7+
The pipelines run in the Dataform service on Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. Each triggered pipeline run uses the code in the `main` branch.
48

59
### Crawl tables in `all` dataset
610

definitions/extra/test_env.js

+23-21
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,28 @@
1-
const two_months_ago = constants.fn_past_month(constants.fn_past_month(constants.current_month));
1+
const date = constants.fn_past_month(constants.current_month);
22

3-
operate("test_env", {
4-
hasOutput: true,
5-
disabled: true // MUST NOT be commented in main branch
6-
}).queries(ctx => `
7-
CREATE OR REPLACE TABLE ${ctx.ref("all", "pages")} AS
8-
SELECT *
9-
FROM httparchive.all.pages ${constants.dev_TABLESAMPLE}
10-
WHERE date = '${two_months_ago}';
3+
var resources_list = [
4+
//{datasetId: "all", tableId: "pages"},
5+
{datasetId: "all", tableId: "requests"},
6+
//{datasetId: "all", tableId: "parsed_css"},
7+
//{datasetId: "core_web_vitals", tableId: "technologies"},
8+
];
119

12-
CREATE OR REPLACE TABLE ${ctx.ref("all", "requests")} AS
13-
SELECT *
14-
FROM httparchive.all.requests ${constants.dev_TABLESAMPLE}
15-
WHERE date = '${two_months_ago}';
10+
resources_list.forEach(resource => {
11+
operate(`test_table ${resource.datasetId}_${resource.tableId}`, {
12+
disabled: !constants.is_dev_env // enabled when workflow variable env_name = "dev"
13+
}).tags([
14+
"test_tables"
15+
]).queries(ctx => `
16+
CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev;
1617
17-
CREATE OR REPLACE TABLE ${ctx.ref("all", "parsed_css")} AS
18-
SELECT *
19-
FROM httparchive.all.parsed_css ${constants.dev_TABLESAMPLE}
20-
WHERE date = '${two_months_ago}';
18+
DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId};
19+
20+
CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId}
21+
LIKE httparchive.${resource.datasetId}.${resource.tableId};
2122
22-
CREATE OR REPLACE TABLE ${ctx.ref("core_web_vitals", "technologies")} AS
23+
INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId}
2324
SELECT *
24-
FROM httparchive.core_web_vitals.technologies
25-
WHERE date = '${two_months_ago}'
26-
`)
25+
FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE}
26+
WHERE date = '${date}'
27+
`);
28+
})
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
operate(`all_requests_stable_pre`).tags(
2+
["all_requests_stable"]
3+
).queries(`
4+
CREATE SCHEMA IF NOT EXISTS all_dev;
5+
6+
DROP TABLE IF EXISTS \`all_dev.requests_stable\`;
7+
8+
CREATE TABLE \`all_dev.requests_stable\`
9+
(
10+
date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"),
11+
client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"),
12+
page STRING NOT NULL OPTIONS(description="The URL of the page being tested"),
13+
is_root_page BOOL OPTIONS(description="Whether the page is the root of the origin."),
14+
root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested"),
15+
rank INT64 OPTIONS(description="Site popularity rank, from CrUX"),
16+
url STRING NOT NULL OPTIONS(description="The URL of the request"),
17+
is_main_document BOOL NOT NULL OPTIONS(description="Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects"),
18+
type STRING OPTIONS(description="Simplified description of the type of resource (script, html, css, text, other, etc)"),
19+
index INT64 OPTIONS(description="The sequential 0-based index of the request"),
20+
payload JSON OPTIONS(description="JSON-encoded WebPageTest result data for this request"),
21+
summary JSON OPTIONS(description="JSON-encoded summarization of request data"),
22+
request_headers ARRAY<STRUCT<
23+
name STRING OPTIONS(description="Request header name"),
24+
value STRING OPTIONS(description="Request header value")
25+
>> OPTIONS(description="Request headers"),
26+
response_headers ARRAY<STRUCT<
27+
name STRING OPTIONS(description="Response header name"),
28+
value STRING OPTIONS(description="Response header value")
29+
>> OPTIONS(description="Response headers"),
30+
response_body STRING OPTIONS(description="Text-based response body")
31+
)
32+
PARTITION BY date
33+
CLUSTER BY client, is_root_page, type, rank
34+
OPTIONS(
35+
require_partition_filter=true
36+
);
37+
`);
38+
39+
const iterations = [];
40+
const clients = constants.clients;
41+
42+
for (
43+
let month = constants.current_month;
44+
month >= '2024-09-01'; // 2022-07-01
45+
month = constants.fn_past_month(month)) {
46+
clients.forEach((client) => {
47+
iterations.push({
48+
month: month,
49+
client: client
50+
})
51+
})
52+
}
53+
54+
iterations.forEach((iteration, i) => {
55+
operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags(
56+
["all_requests_stable"]
57+
).dependencies([
58+
i===0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}`
59+
]).queries(ctx => `
60+
INSERT INTO \`all_dev.requests_stable\`
61+
SELECT
62+
requests.date,
63+
requests.client,
64+
requests.page,
65+
requests.is_root_page,
66+
requests.root_page,
67+
crux.rank,
68+
requests.url,
69+
requests.is_main_document,
70+
requests.type,
71+
requests.index,
72+
JSON_REMOVE(
73+
SAFE.PARSE_JSON(payload, wide_number_mode => 'round'),
74+
'$._headers'
75+
) AS payload,
76+
JSON_REMOVE(
77+
SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'),
78+
'$.firstHtml',
79+
'$.firstReq',
80+
'$.req_accept_encoding',
81+
'$.req_accept_language',
82+
'$.req_accept',
83+
'$.req_if_modified_since',
84+
'$.req_if_none_match',
85+
'$.req_referer',
86+
'$.req_user_agent',
87+
'$.reqOtherHeaders',
88+
'$.requestid',
89+
'$.resp_age',
90+
'$.resp_cache_control',
91+
'$.resp_content_length',
92+
'$.resp_content_type',
93+
'$.resp_date',
94+
'$.resp_etag',
95+
'$.resp_last_modified',
96+
'$.resp_server',
97+
'$.resp_vary',
98+
'$.respOtherHeaders',
99+
'$.startedDateTime',
100+
'$.url',
101+
'$.urlShort'
102+
) as summary,
103+
requests.request_headers,
104+
requests.response_headers,
105+
requests.response_body
106+
FROM (
107+
SELECT *
108+
FROM \`all.requests\` ${constants.dev_TABLESAMPLE}
109+
WHERE date = '${iteration.month}'
110+
AND client = '${iteration.client}') AS requests
111+
LEFT JOIN (
112+
SELECT DISTINCT
113+
CONCAT(origin, '/') AS page,
114+
experimental.popularity.rank AS rank
115+
FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")}
116+
WHERE yyyymm = ${constants.fn_past_month(iteration.month).substring(0, 7).replace('-', '')}
117+
) AS crux
118+
ON requests.root_page = crux.page;
119+
`)
120+
});

definitions/output/core_web_vitals/technologies.js

+14-14
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F
2424
);
2525
2626
CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING)
27-
RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
27+
RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
2828
LANGUAGE js AS '''
2929
try {
3030
const $ = JSON.parse(categories);
@@ -74,23 +74,23 @@ crux AS (
7474
END AS rank,
7575
CONCAT(origin, '/') AS root_page_url,
7676
IF(device = 'desktop', 'desktop', 'mobile') AS client,
77-
77+
7878
# CWV
7979
IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
8080
IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
8181
IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
8282
IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
8383
IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
8484
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
85-
85+
8686
(IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
8787
IS_GOOD(small_cls, medium_cls, large_cls) AND
8888
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024,
89-
89+
9090
(IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND
9191
IS_GOOD(small_cls, medium_cls, large_cls) AND
9292
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023,
93-
93+
9494
# WV
9595
IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
9696
IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
@@ -114,7 +114,7 @@ technologies AS (
114114
${ctx.resolve("all", "pages")},
115115
UNNEST(technologies) AS technology
116116
WHERE
117-
date = '${past_month}' AND
117+
date = '${past_month}' ${constants.dev_rank5000_filter} AND
118118
technology.technology IS NOT NULL AND
119119
technology.technology != ''
120120
UNION ALL
@@ -125,7 +125,7 @@ UNION ALL
125125
FROM
126126
${ctx.resolve("all", "pages")}
127127
WHERE
128-
date = '${past_month}'
128+
date = '${past_month}' ${constants.dev_rank5000_filter}
129129
),
130130
131131
categories AS (
@@ -137,7 +137,7 @@ categories AS (
137137
UNNEST(technologies) AS technology,
138138
UNNEST(technology.categories) AS category
139139
WHERE
140-
date = '${past_month}'
140+
date = '${past_month}' ${constants.dev_rank5000_filter}
141141
GROUP BY
142142
app
143143
UNION ALL
@@ -149,7 +149,7 @@ UNION ALL
149149
UNNEST(technologies) AS technology,
150150
UNNEST(technology.categories) AS category
151151
WHERE
152-
date = '${past_month}' AND
152+
date = '${past_month}' ${constants.dev_rank5000_filter} AND
153153
client = 'mobile'
154154
),
155155
@@ -165,7 +165,7 @@ summary_stats AS (
165165
FROM
166166
${ctx.resolve("all", "pages")}
167167
WHERE
168-
date = '${past_month}'
168+
date = '${past_month}' ${constants.dev_rank5000_filter}
169169
),
170170
171171
lab_data AS (
@@ -206,7 +206,7 @@ SELECT
206206
app,
207207
client,
208208
COUNT(0) AS origins,
209-
209+
210210
# CrUX data
211211
COUNTIF(good_fid) AS origins_with_good_fid,
212212
COUNTIF(good_cls) AS origins_with_good_cls,
@@ -227,19 +227,19 @@ SELECT
227227
SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv,
228228
SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,
229229
SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023,
230-
230+
231231
# Lighthouse data
232232
APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS median_lighthouse_score_accessibility,
233233
APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS median_lighthouse_score_best_practices,
234234
APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS median_lighthouse_score_performance,
235235
APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS median_lighthouse_score_pwa,
236236
APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS median_lighthouse_score_seo,
237-
237+
238238
# Page weight stats
239239
APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS median_bytes_total,
240240
APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS median_bytes_js,
241241
APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS median_bytes_image
242-
242+
243243
FROM
244244
lab_data
245245
JOIN

definitions/sources/declares.js

+7-1
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@ for (const table of crux_tables) {
1616
});
1717

1818
assert(`${table}_not_empty`).query(ctx => `
19-
SELECT
19+
SELECT
2020
'No data for the specified date' AS error_message
2121
FROM ${ctx.ref("chrome-ux-report", "materialized", table)}
2222
WHERE yyyymm = ${past_month}
2323
GROUP BY yyyymm
2424
HAVING COUNT(1) = 0
2525
`);
2626
}
27+
28+
declare({
29+
database: "chrome-ux-report",
30+
schema: "experimental",
31+
name: "global",
32+
});

src/dataform.js

-12
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,6 @@ async function get_compilation_results(repoURI) {
1414
compilationResult: {
1515
releaseConfig: `${repoURI}/releaseConfigs/production`
1616
}
17-
}, dev_request = {
18-
parent: repoURI,
19-
compilationResult: {
20-
gitCommitish: 'dev'
21-
},
22-
codeCompilationConfig: {
23-
schemaSuffix: 'dev',
24-
tablePrefix: 'dev',
25-
vars: {
26-
current_month: '2024-08-01',
27-
},
28-
}
2917
};
3018

3119
console.log(`Creating Dataform compilation result: ${JSON.stringify(request, null, 2)}`);

0 commit comments

Comments
 (0)