-
-
Notifications
You must be signed in to change notification settings - Fork 183
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add web almanac 2024 SQL queries * Update video_source_types.todo.sql Linter prefers ' * Update video_media.sql Fixing inconsistencies in linter * Update video_media.sql Linter * Update video_formats.sql removing TABLESAMPLE SYSTEM (0.001 PERCENT) --------- Co-authored-by: Mike Gifford <[email protected]>
- Loading branch information
1 parent
0a512e8
commit fd87798
Showing
58 changed files
with
3,018 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#standardSQL | ||
# what is the bpp of animated vs non-animated GIFs? | ||
# animated_gif_bpp.sql | ||
|
||
|
||
# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that. | ||
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING) | ||
RETURNS INT64 | ||
LANGUAGE js AS r''' | ||
if (!imageMagickNumberString) { return null; } | ||
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/); | ||
const multiples = { | ||
'K': 1e3, | ||
'M': 1e6, | ||
'G': 1e9, | ||
'T': 1e12 | ||
} | ||
if ( matched && matched[1] ) { | ||
return Math.round( | ||
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 ) | ||
); | ||
} else { | ||
return null; | ||
} | ||
'''; | ||
|
||
# ImageMagick reports bytesizes in a friendly, human readable format. Just return bytes | ||
CREATE TEMPORARY FUNCTION magickBytes(imageMagickNumberString STRING) | ||
RETURNS INT64 | ||
LANGUAGE js AS r''' | ||
if (!imageMagickNumberString) { return 0; } | ||
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)$/); | ||
const multiples = { | ||
'B': 1, | ||
'KB': 1e3, | ||
'MB': 1e6, | ||
'GB': 1e9, | ||
'TB': 1e12 | ||
} | ||
if ( matched && matched[1] && matched[2] ) { | ||
return Math.round( | ||
parseFloat( matched[1] ) * multiples[ matched[2] ] | ||
); | ||
} else { | ||
return null; | ||
} | ||
'''; | ||
|
||
WITH gifs AS ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL) AS is_animated, | ||
(magickBytes(JSON_VALUE(payload, '$._image_details.magick.filesize')) * 8) / | ||
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) AS bits_per_pixel | ||
FROM | ||
`requests.2024_06_01_*` | ||
WHERE | ||
JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND | ||
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 AND | ||
JSON_VALUE(payload, '$._image_details.animated') IS NOT NULL | ||
) | ||
|
||
SELECT | ||
percentile, | ||
client, | ||
is_animated, | ||
APPROX_QUANTILES(bits_per_pixel, 1000)[OFFSET(percentile * 10)] AS bpp | ||
FROM | ||
gifs, | ||
UNNEST([10, 25, 50, 75, 90]) AS percentile | ||
GROUP BY | ||
percentile, | ||
client, | ||
is_animated | ||
ORDER BY | ||
percentile, | ||
client, | ||
is_animated |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#standardSQL | ||
# what percent of gifs are animated? | ||
# animated_gif_count.sql | ||
|
||
# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that. | ||
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING) | ||
RETURNS INT64 | ||
LANGUAGE js AS r''' | ||
if (!imageMagickNumberString) { return null; } | ||
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/); | ||
const multiples = { | ||
'K': 1e3, | ||
'M': 1e6, | ||
'G': 1e9, | ||
'T': 1e12 | ||
} | ||
if ( matched && matched[1] ) { | ||
return Math.round( | ||
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 ) | ||
); | ||
} else { | ||
return null; | ||
} | ||
'''; | ||
|
||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
COUNT(0) AS total_gifs, | ||
COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) AS total_animated_gifs, | ||
COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) / COUNT(0) AS pct_animated_gifs | ||
FROM | ||
`requests.2024_06_01_*` | ||
WHERE | ||
JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND | ||
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 | ||
GROUP BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#standardSQL | ||
# distribution of animated GIF framecounts | ||
# animated_gif_framecount.sql | ||
|
||
# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that. | ||
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING) | ||
RETURNS INT64 | ||
LANGUAGE js AS r''' | ||
if (!imageMagickNumberString) { return null; } | ||
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/); | ||
const multiples = { | ||
'K': 1e3, | ||
'M': 1e6, | ||
'G': 1e9, | ||
'T': 1e12 | ||
} | ||
if ( matched && matched[1] ) { | ||
return Math.round( | ||
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 ) | ||
); | ||
} else { | ||
return null; | ||
} | ||
'''; | ||
|
||
WITH framecounts AS ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
CAST(JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') AS NUMERIC) AS framecount | ||
FROM `requests.2024_06_01_*` | ||
WHERE | ||
JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') IS NOT NULL AND | ||
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 | ||
) | ||
|
||
SELECT | ||
percentile, | ||
client, | ||
APPROX_QUANTILES(framecount, 1000)[OFFSET(percentile * 10)] AS framecount | ||
FROM | ||
framecounts, | ||
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile | ||
GROUP BY | ||
percentile, | ||
client | ||
ORDER BY | ||
percentile, | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#standardSQL | ||
# What % of pages load at least one image? | ||
# ✅ at_least_one_image_request.sql | ||
|
||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
COUNTIF(reqImg > 0) AS atLeastOneImgReqCount, | ||
COUNT(0) AS total, | ||
SAFE_DIVIDE(COUNTIF(reqImg > 0), COUNT(0)) AS atLeastOneImgReqPct | ||
FROM | ||
`httparchive.summary_pages.2024_06_01_*` | ||
GROUP BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#standardSQL | ||
# What % of pages have at least one <img>? | ||
# at_least_one_img.sql | ||
|
||
CREATE TEMPORARY FUNCTION numberOfImages(images_string STRING) | ||
RETURNS INT64 | ||
LANGUAGE js AS ''' | ||
try { | ||
return JSON.parse(images_string).filter( i => parseInt(i.approximateResourceWidth) > 1 && parseInt(i.approximateResourceWidth) > 1 ).length; | ||
} catch { | ||
return 0; | ||
} | ||
'''; | ||
|
||
WITH numImgs AS ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
numberOfImages(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')) AS numberOfImages | ||
FROM | ||
`httparchive.pages.2024_06_01_*` | ||
) | ||
|
||
SELECT | ||
client, | ||
COUNTIF(numberOfImages > 0) AS atLeastOneCount, | ||
COUNT(0) AS total, | ||
SAFE_DIVIDE(COUNTIF(numberOfImages > 0), COUNT(0)) AS atLeastOnePct | ||
FROM | ||
numImgs | ||
GROUP BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
#standardSQL | ||
# Measuring img loaded bytes and dimensions | ||
# bytes_and_dimensions.sql | ||
|
||
CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING) | ||
RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>> | ||
LANGUAGE js AS ''' | ||
function pithyType( { contentType, url } ) { | ||
const subtypeMap = { | ||
'svg+xml': 'svg', | ||
'svgz': 'svg', | ||
'jpeg': 'jpg', | ||
'jfif': 'jpg', | ||
'x-png': 'png', | ||
'vnd.microsoft.icon': 'ico', | ||
'x-icon': 'ico', | ||
'jxr': 'jxr', | ||
'vnd.ms-photo': 'jxr', | ||
'hdp': 'jxr', | ||
'wdp': 'jxr', | ||
'jpf': 'jp2', | ||
'jpx': 'jp2', | ||
'jpm': 'jp2', | ||
'mj2': 'jp2', | ||
'x-jp2-container': 'jp2', | ||
'x-jp2-codestream': 'jp2', | ||
'x-jpeg2000-image': 'jp2', | ||
'heic': 'heif', | ||
'x-ms-bmp': 'bmp', | ||
'x-pict': 'pict', | ||
'tif': 'tiff', | ||
'x-tif': 'tiff', | ||
'x-tiff': 'tiff', | ||
'vnd.mozilla.apng': 'apng', | ||
// identities | ||
'apng': 'apng', | ||
'jpg': 'jpg', | ||
'jp2': 'jp2', | ||
'png': 'png', | ||
'gif': 'gif', | ||
'ico': 'ico', | ||
'webp': 'webp', | ||
'avif': 'avif', | ||
'tiff': 'tiff', | ||
'flif': 'flif', | ||
'heif': 'heif', | ||
'jxl': 'jxl', | ||
'avif-sequence': 'avif-sequence', // keep separate from single frames... | ||
'heic-sequence': 'heic-sequence', | ||
'bmp': 'bmp', | ||
'pict': 'pict' | ||
}; | ||
function normalizeSubtype( subtype ) { | ||
if ( subtypeMap[ subtype ] ) { | ||
return subtypeMap[ subtype ]; | ||
} | ||
return 'unknown'; // switch between: | ||
// `subtype` | ||
// to see everything, check if there's anything else worth capturing | ||
// `'unknown'` | ||
// to make results manageable | ||
} | ||
|
||
// if it's a data url, take the mime type from there, done. | ||
if ( url && | ||
typeof url === "string" ) { | ||
const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ ); | ||
if ( match && match[ 1 ] ) { | ||
return normalizeSubtype( match[ 1 ] ); | ||
} | ||
} | ||
// if we get a content-type header, use it! | ||
if ( contentType && | ||
typeof contentType === "string" ) { | ||
const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ ); | ||
if ( match && match[ 1 ] ) { | ||
return normalizeSubtype( match[ 1 ] ); | ||
} | ||
} | ||
// otherwise fall back to extension in the URL | ||
if ( url && | ||
typeof url === "string" ) { | ||
const splitOnSlashes = url.split("/"); | ||
if ( splitOnSlashes.length > 1 ) { | ||
const afterLastSlash = splitOnSlashes[ splitOnSlashes.length - 1 ], | ||
splitOnDots = afterLastSlash.split("."); | ||
if ( splitOnDots.length > 1 ) { | ||
return normalizeSubtype( | ||
splitOnDots[ splitOnDots.length - 1 ] | ||
.toLowerCase() | ||
.replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ) // strip query params | ||
); | ||
} | ||
} | ||
} | ||
// otherwise throw up our hands | ||
return 'unknown'; | ||
} | ||
const parsed = JSON.parse( responsiveImagesJsonString ); | ||
if ( parsed && parsed.map ) { | ||
const dataRegEx = new RegExp('^data'); | ||
return parsed.map( d => ({ | ||
imgURL: d.url, | ||
approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ), | ||
approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ), | ||
byteSize: Math.floor( d.byteSize || 0 ), | ||
bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ), | ||
isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1, | ||
isDataURL: dataRegEx.test(d.url), | ||
resourceFormat: pithyType({ contentType: d.mimeType, url: d.url }) | ||
}) ); | ||
} | ||
'''; | ||
|
||
WITH imgs AS ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
url AS pageURL, | ||
imgURL, | ||
approximateResourceWidth, | ||
approximateResourceHeight, | ||
byteSize, | ||
bitsPerPixel, | ||
isPixel, | ||
isDataURL, | ||
(approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels, | ||
(approximateResourceWidth / approximateResourceHeight) AS aspectRatio, | ||
resourceFormat | ||
FROM | ||
`httparchive.pages.2024_06_01_*`, | ||
UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) | ||
), | ||
|
||
percentiles AS ( | ||
SELECT | ||
client, | ||
APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles, | ||
APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles, | ||
APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles, | ||
APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles, | ||
APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles, | ||
APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles, | ||
COUNT(0) AS imgCount | ||
FROM | ||
imgs | ||
WHERE | ||
approximateResourceWidth > 1 AND | ||
approximateResourceHeight > 1 | ||
GROUP BY | ||
client | ||
) | ||
|
||
SELECT | ||
percentile, | ||
client, | ||
imgCount, | ||
resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth, | ||
resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight, | ||
aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio, | ||
megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels, | ||
byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize, | ||
bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel | ||
FROM | ||
percentiles, | ||
UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile | ||
ORDER BY | ||
imgCount DESC, | ||
percentile |
Oops, something went wrong.