Skip to content

Commit

Permalink
Media 2024 queries (#3738)
Browse files Browse the repository at this point in the history
* Add web almanac 2024 SQL queries

* Update video_source_types.todo.sql

Linter prefers '

* Update video_media.sql

Fixing inconsistencies in linter

* Update video_media.sql

Linter

* Update video_formats.sql

removing  TABLESAMPLE SYSTEM (0.001 PERCENT)

---------

Co-authored-by: Mike Gifford <[email protected]>
  • Loading branch information
stefanjudis and mgifford authored Nov 9, 2024
1 parent 0a512e8 commit fd87798
Show file tree
Hide file tree
Showing 58 changed files with 3,018 additions and 0 deletions.
82 changes: 82 additions & 0 deletions sql/2024/media/animated_gif_bpp.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#standardSQL
# what is the bpp of animated vs non-animated GIFs?
# animated_gif_bpp.sql


# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that.
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
if (!imageMagickNumberString) { return null; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
const multiples = {
'K': 1e3,
'M': 1e6,
'G': 1e9,
'T': 1e12
}
if ( matched && matched[1] ) {
return Math.round(
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 )
);
} else {
return null;
}
''';

# ImageMagick reports bytesizes in a friendly, human readable format. Just return bytes
CREATE TEMPORARY FUNCTION magickBytes(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
if (!imageMagickNumberString) { return 0; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)$/);
const multiples = {
'B': 1,
'KB': 1e3,
'MB': 1e6,
'GB': 1e9,
'TB': 1e12
}
if ( matched && matched[1] && matched[2] ) {
return Math.round(
parseFloat( matched[1] ) * multiples[ matched[2] ]
);
} else {
return null;
}
''';

WITH gifs AS (
SELECT
_TABLE_SUFFIX AS client,
CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL) AS is_animated,
(magickBytes(JSON_VALUE(payload, '$._image_details.magick.filesize')) * 8) /
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) AS bits_per_pixel
FROM
`requests.2024_06_01_*`
WHERE
JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 AND
JSON_VALUE(payload, '$._image_details.animated') IS NOT NULL
)

SELECT
percentile,
client,
is_animated,
APPROX_QUANTILES(bits_per_pixel, 1000)[OFFSET(percentile * 10)] AS bpp
FROM
gifs,
UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
percentile,
client,
is_animated
ORDER BY
percentile,
client,
is_animated
39 changes: 39 additions & 0 deletions sql/2024/media/animated_gif_count.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#standardSQL
# what percent of gifs are animated?
# animated_gif_count.sql

# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that.
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
if (!imageMagickNumberString) { return null; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
const multiples = {
'K': 1e3,
'M': 1e6,
'G': 1e9,
'T': 1e12
}
if ( matched && matched[1] ) {
return Math.round(
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 )
);
} else {
return null;
}
''';

SELECT
_TABLE_SUFFIX AS client,
COUNT(0) AS total_gifs,
COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) AS total_animated_gifs,
COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) / COUNT(0) AS pct_animated_gifs
FROM
`requests.2024_06_01_*`
WHERE
JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1
GROUP BY
client
50 changes: 50 additions & 0 deletions sql/2024/media/animated_gif_framecount.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#standardSQL
# distribution of animated GIF framecounts
# animated_gif_framecount.sql

# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that.
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''
if (!imageMagickNumberString) { return null; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
const multiples = {
'K': 1e3,
'M': 1e6,
'G': 1e9,
'T': 1e12
}
if ( matched && matched[1] ) {
return Math.round(
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 )
);
} else {
return null;
}
''';

WITH framecounts AS (
SELECT
_TABLE_SUFFIX AS client,
CAST(JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') AS NUMERIC) AS framecount
FROM `requests.2024_06_01_*`
WHERE
JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') IS NOT NULL AND
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1
)

SELECT
percentile,
client,
APPROX_QUANTILES(framecount, 1000)[OFFSET(percentile * 10)] AS framecount
FROM
framecounts,
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
GROUP BY
percentile,
client
ORDER BY
percentile,
client
13 changes: 13 additions & 0 deletions sql/2024/media/at_least_one_image_request.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#standardSQL
# What % of pages load at least one image?
# ✅ at_least_one_image_request.sql

SELECT
_TABLE_SUFFIX AS client,
COUNTIF(reqImg > 0) AS atLeastOneImgReqCount,
COUNT(0) AS total,
SAFE_DIVIDE(COUNTIF(reqImg > 0), COUNT(0)) AS atLeastOneImgReqPct
FROM
`httparchive.summary_pages.2024_06_01_*`
GROUP BY
client
31 changes: 31 additions & 0 deletions sql/2024/media/at_least_one_img.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#standardSQL
# What % of pages have at least one <img>?
# at_least_one_img.sql

CREATE TEMPORARY FUNCTION numberOfImages(images_string STRING)
RETURNS INT64
LANGUAGE js AS '''
try {
return JSON.parse(images_string).filter( i => parseInt(i.approximateResourceWidth) > 1 && parseInt(i.approximateResourceWidth) > 1 ).length;
} catch {
return 0;
}
''';

WITH numImgs AS (
SELECT
_TABLE_SUFFIX AS client,
numberOfImages(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')) AS numberOfImages
FROM
`httparchive.pages.2024_06_01_*`
)

SELECT
client,
COUNTIF(numberOfImages > 0) AS atLeastOneCount,
COUNT(0) AS total,
SAFE_DIVIDE(COUNTIF(numberOfImages > 0), COUNT(0)) AS atLeastOnePct
FROM
numImgs
GROUP BY
client
174 changes: 174 additions & 0 deletions sql/2024/media/bytes_and_dimensions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#standardSQL
# Measuring img loaded bytes and dimensions
# bytes_and_dimensions.sql

CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING)
RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>>
LANGUAGE js AS '''
function pithyType( { contentType, url } ) {
const subtypeMap = {
'svg+xml': 'svg',
'svgz': 'svg',
'jpeg': 'jpg',
'jfif': 'jpg',
'x-png': 'png',
'vnd.microsoft.icon': 'ico',
'x-icon': 'ico',
'jxr': 'jxr',
'vnd.ms-photo': 'jxr',
'hdp': 'jxr',
'wdp': 'jxr',
'jpf': 'jp2',
'jpx': 'jp2',
'jpm': 'jp2',
'mj2': 'jp2',
'x-jp2-container': 'jp2',
'x-jp2-codestream': 'jp2',
'x-jpeg2000-image': 'jp2',
'heic': 'heif',
'x-ms-bmp': 'bmp',
'x-pict': 'pict',
'tif': 'tiff',
'x-tif': 'tiff',
'x-tiff': 'tiff',
'vnd.mozilla.apng': 'apng',
// identities
'apng': 'apng',
'jpg': 'jpg',
'jp2': 'jp2',
'png': 'png',
'gif': 'gif',
'ico': 'ico',
'webp': 'webp',
'avif': 'avif',
'tiff': 'tiff',
'flif': 'flif',
'heif': 'heif',
'jxl': 'jxl',
'avif-sequence': 'avif-sequence', // keep separate from single frames...
'heic-sequence': 'heic-sequence',
'bmp': 'bmp',
'pict': 'pict'
};
function normalizeSubtype( subtype ) {
if ( subtypeMap[ subtype ] ) {
return subtypeMap[ subtype ];
}
return 'unknown'; // switch between:
// `subtype`
// to see everything, check if there's anything else worth capturing
// `'unknown'`
// to make results manageable
}

// if it's a data url, take the mime type from there, done.
if ( url &&
typeof url === "string" ) {
const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ );
if ( match && match[ 1 ] ) {
return normalizeSubtype( match[ 1 ] );
}
}
// if we get a content-type header, use it!
if ( contentType &&
typeof contentType === "string" ) {
const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ );
if ( match && match[ 1 ] ) {
return normalizeSubtype( match[ 1 ] );
}
}
// otherwise fall back to extension in the URL
if ( url &&
typeof url === "string" ) {
const splitOnSlashes = url.split("/");
if ( splitOnSlashes.length > 1 ) {
const afterLastSlash = splitOnSlashes[ splitOnSlashes.length - 1 ],
splitOnDots = afterLastSlash.split(".");
if ( splitOnDots.length > 1 ) {
return normalizeSubtype(
splitOnDots[ splitOnDots.length - 1 ]
.toLowerCase()
.replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ) // strip query params
);
}
}
}
// otherwise throw up our hands
return 'unknown';
}
const parsed = JSON.parse( responsiveImagesJsonString );
if ( parsed && parsed.map ) {
const dataRegEx = new RegExp('^data');
return parsed.map( d => ({
imgURL: d.url,
approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ),
approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ),
byteSize: Math.floor( d.byteSize || 0 ),
bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ),
isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1,
isDataURL: dataRegEx.test(d.url),
resourceFormat: pithyType({ contentType: d.mimeType, url: d.url })
}) );
}
''';

WITH imgs AS (
SELECT
_TABLE_SUFFIX AS client,
url AS pageURL,
imgURL,
approximateResourceWidth,
approximateResourceHeight,
byteSize,
bitsPerPixel,
isPixel,
isDataURL,
(approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels,
(approximateResourceWidth / approximateResourceHeight) AS aspectRatio,
resourceFormat
FROM
`httparchive.pages.2024_06_01_*`,
UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')))
),

percentiles AS (
SELECT
client,
APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles,
APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles,
APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles,
APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles,
APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles,
APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles,
COUNT(0) AS imgCount
FROM
imgs
WHERE
approximateResourceWidth > 1 AND
approximateResourceHeight > 1
GROUP BY
client
)

SELECT
percentile,
client,
imgCount,
resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth,
resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight,
aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio,
megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels,
byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize,
bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel
FROM
percentiles,
UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile
ORDER BY
imgCount DESC,
percentile
Loading

0 comments on commit fd87798

Please sign in to comment.