diff --git a/sql/2024/media/animated_gif_bpp.sql b/sql/2024/media/animated_gif_bpp.sql new file mode 100644 index 00000000000..7aa492a9bd3 --- /dev/null +++ b/sql/2024/media/animated_gif_bpp.sql @@ -0,0 +1,82 @@ +#standardSQL +# what is the bpp of animated vs non-animated GIFs? +# animated_gif_bpp.sql + + +# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that. +CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING) +RETURNS INT64 +LANGUAGE js AS r''' + +if (!imageMagickNumberString) { return null; } +const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/); +const multiples = { + 'K': 1e3, + 'M': 1e6, + 'G': 1e9, + 'T': 1e12 +} +if ( matched && matched[1] ) { + return Math.round( + parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 ) + ); +} else { + return null; +} + +'''; + +# ImageMagick reports bytesizes in a friendly, human readable format. Just return bytes +CREATE TEMPORARY FUNCTION magickBytes(imageMagickNumberString STRING) +RETURNS INT64 +LANGUAGE js AS r''' + +if (!imageMagickNumberString) { return 0; } +const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)$/); +const multiples = { + 'B': 1, + 'KB': 1e3, + 'MB': 1e6, + 'GB': 1e9, + 'TB': 1e12 +} +if ( matched && matched[1] && matched[2] ) { + return Math.round( + parseFloat( matched[1] ) * multiples[ matched[2] ] + ); +} else { + return null; +} + +'''; + +WITH gifs AS ( + SELECT + _TABLE_SUFFIX AS client, + CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL) AS is_animated, + (magickBytes(JSON_VALUE(payload, '$._image_details.magick.filesize')) * 8) / + magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) AS bits_per_pixel + FROM + `requests.2024_06_01_*` + WHERE + JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND + magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 AND + JSON_VALUE(payload, '$._image_details.animated') IS NOT NULL +) + +SELECT + percentile, + client, + is_animated, + APPROX_QUANTILES(bits_per_pixel, 1000)[OFFSET(percentile * 10)] AS bpp +FROM + gifs, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client, + is_animated +ORDER BY + percentile, + client, + is_animated diff --git a/sql/2024/media/animated_gif_count.sql b/sql/2024/media/animated_gif_count.sql new file mode 100644 index 00000000000..01af1506452 --- /dev/null +++ b/sql/2024/media/animated_gif_count.sql @@ -0,0 +1,39 @@ +#standardSQL +# what percent of gifs are animated? +# animated_gif_count.sql + +# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that. +CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING) +RETURNS INT64 +LANGUAGE js AS r''' + +if (!imageMagickNumberString) { return null; } +const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/); +const multiples = { + 'K': 1e3, + 'M': 1e6, + 'G': 1e9, + 'T': 1e12 +} +if ( matched && matched[1] ) { + return Math.round( + parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 ) + ); +} else { + return null; +} + +'''; + +SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_gifs, + COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) AS total_animated_gifs, + COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) / COUNT(0) AS pct_animated_gifs +FROM + `requests.2024_06_01_*` +WHERE + JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND + magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 +GROUP BY + client diff --git a/sql/2024/media/animated_gif_framecount.sql b/sql/2024/media/animated_gif_framecount.sql new file mode 100644 index 00000000000..531052b462b --- /dev/null +++ b/sql/2024/media/animated_gif_framecount.sql @@ -0,0 +1,50 @@ +#standardSQL +# distribution of animated GIF framecounts +# animated_gif_framecount.sql + +# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that. +CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING) +RETURNS INT64 +LANGUAGE js AS r''' + +if (!imageMagickNumberString) { return null; } +const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/); +const multiples = { + 'K': 1e3, + 'M': 1e6, + 'G': 1e9, + 'T': 1e12 +} +if ( matched && matched[1] ) { + return Math.round( + parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 ) + ); +} else { + return null; +} + +'''; + +WITH framecounts AS ( + SELECT + _TABLE_SUFFIX AS client, + CAST(JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') AS NUMERIC) AS framecount + FROM `requests.2024_06_01_*` + WHERE + JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') IS NOT NULL AND + magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 +) + +SELECT + percentile, + client, + APPROX_QUANTILES(framecount, 1000)[OFFSET(percentile * 10)] AS framecount +FROM + framecounts, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/at_least_one_image_request.sql b/sql/2024/media/at_least_one_image_request.sql new file mode 100644 index 00000000000..9d514b663b9 --- /dev/null +++ b/sql/2024/media/at_least_one_image_request.sql @@ -0,0 +1,13 @@ +#standardSQL +# What % of pages load at least one image? +# ✅ at_least_one_image_request.sql + +SELECT + _TABLE_SUFFIX AS client, + COUNTIF(reqImg > 0) AS atLeastOneImgReqCount, + COUNT(0) AS total, + SAFE_DIVIDE(COUNTIF(reqImg > 0), COUNT(0)) AS atLeastOneImgReqPct +FROM + `httparchive.summary_pages.2024_06_01_*` +GROUP BY + client diff --git a/sql/2024/media/at_least_one_img.sql b/sql/2024/media/at_least_one_img.sql new file mode 100644 index 00000000000..420af6df10f --- /dev/null +++ b/sql/2024/media/at_least_one_img.sql @@ -0,0 +1,31 @@ +#standardSQL +# What % of pages have at least one ? +# at_least_one_img.sql + +CREATE TEMPORARY FUNCTION numberOfImages(images_string STRING) +RETURNS INT64 +LANGUAGE js AS ''' +try { + return JSON.parse(images_string).filter( i => parseInt(i.approximateResourceWidth) > 1 && parseInt(i.approximateResourceWidth) > 1 ).length; +} catch { + return 0; +} +'''; + +WITH numImgs AS ( + SELECT + _TABLE_SUFFIX AS client, + numberOfImages(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')) AS numberOfImages + FROM + `httparchive.pages.2024_06_01_*` +) + +SELECT + client, + COUNTIF(numberOfImages > 0) AS atLeastOneCount, + COUNT(0) AS total, + SAFE_DIVIDE(COUNTIF(numberOfImages > 0), COUNT(0)) AS atLeastOnePct +FROM + numImgs +GROUP BY + client diff --git a/sql/2024/media/bytes_and_dimensions.sql b/sql/2024/media/bytes_and_dimensions.sql new file mode 100644 index 00000000000..9461f755995 --- /dev/null +++ b/sql/2024/media/bytes_and_dimensions.sql @@ -0,0 +1,174 @@ +#standardSQL +# Measuring img loaded bytes and dimensions +# bytes_and_dimensions.sql + +CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' + +function pithyType( { contentType, url } ) { + const subtypeMap = { + 'svg+xml': 'svg', + 'svgz': 'svg', + 'jpeg': 'jpg', + 'jfif': 'jpg', + 'x-png': 'png', + 'vnd.microsoft.icon': 'ico', + 'x-icon': 'ico', + 'jxr': 'jxr', + 'vnd.ms-photo': 'jxr', + 'hdp': 'jxr', + 'wdp': 'jxr', + 'jpf': 'jp2', + 'jpx': 'jp2', + 'jpm': 'jp2', + 'mj2': 'jp2', + 'x-jp2-container': 'jp2', + 'x-jp2-codestream': 'jp2', + 'x-jpeg2000-image': 'jp2', + 'heic': 'heif', + 'x-ms-bmp': 'bmp', + 'x-pict': 'pict', + 'tif': 'tiff', + 'x-tif': 'tiff', + 'x-tiff': 'tiff', + 'vnd.mozilla.apng': 'apng', + // identities + 'apng': 'apng', + 'jpg': 'jpg', + 'jp2': 'jp2', + 'png': 'png', + 'gif': 'gif', + 'ico': 'ico', + 'webp': 'webp', + 'avif': 'avif', + 'tiff': 'tiff', + 'flif': 'flif', + 'heif': 'heif', + 'jxl': 'jxl', + 'avif-sequence': 'avif-sequence', // keep separate from single frames... + 'heic-sequence': 'heic-sequence', + 'bmp': 'bmp', + 'pict': 'pict' + }; + + function normalizeSubtype( subtype ) { + if ( subtypeMap[ subtype ] ) { + return subtypeMap[ subtype ]; + } + return 'unknown'; // switch between: + // `subtype` + // to see everything, check if there's anything else worth capturing + // `'unknown'` + // to make results manageable + } + + // if it's a data url, take the mime type from there, done. + if ( url && + typeof url === "string" ) { + const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ ); + if ( match && match[ 1 ] ) { + return normalizeSubtype( match[ 1 ] ); + } + } + + // if we get a content-type header, use it! + if ( contentType && + typeof contentType === "string" ) { + const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ ); + if ( match && match[ 1 ] ) { + return normalizeSubtype( match[ 1 ] ); + } + } + + // otherwise fall back to extension in the URL + if ( url && + typeof url === "string" ) { + const splitOnSlashes = url.split("/"); + if ( splitOnSlashes.length > 1 ) { + const afterLastSlash = splitOnSlashes[ splitOnSlashes.length - 1 ], + splitOnDots = afterLastSlash.split("."); + if ( splitOnDots.length > 1 ) { + return normalizeSubtype( + splitOnDots[ splitOnDots.length - 1 ] + .toLowerCase() + .replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ) // strip query params + ); + } + } + } + + // otherwise throw up our hands + return 'unknown'; + } + + const parsed = JSON.parse( responsiveImagesJsonString ); + if ( parsed && parsed.map ) { + const dataRegEx = new RegExp('^data'); + return parsed.map( d => ({ + imgURL: d.url, + approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ), + approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ), + byteSize: Math.floor( d.byteSize || 0 ), + bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ), + isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1, + isDataURL: dataRegEx.test(d.url), + resourceFormat: pithyType({ contentType: d.mimeType, url: d.url }) + }) ); + } +'''; + +WITH imgs AS ( + SELECT + _TABLE_SUFFIX AS client, + url AS pageURL, + imgURL, + approximateResourceWidth, + approximateResourceHeight, + byteSize, + bitsPerPixel, + isPixel, + isDataURL, + (approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels, + (approximateResourceWidth / approximateResourceHeight) AS aspectRatio, + resourceFormat + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) +), + +percentiles AS ( + SELECT + client, + APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles, + APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles, + APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles, + APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles, + APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles, + APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles, + COUNT(0) AS imgCount + FROM + imgs + WHERE + approximateResourceWidth > 1 AND + approximateResourceHeight > 1 + GROUP BY + client +) + +SELECT + percentile, + client, + imgCount, + resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth, + resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight, + aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio, + megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels, + byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize, + bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel +FROM + percentiles, + UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile +ORDER BY + imgCount DESC, + percentile diff --git a/sql/2024/media/bytes_and_dimensions_by_format.sql b/sql/2024/media/bytes_and_dimensions_by_format.sql new file mode 100644 index 00000000000..8bca387cbf4 --- /dev/null +++ b/sql/2024/media/bytes_and_dimensions_by_format.sql @@ -0,0 +1,174 @@ +#standardSQL +# Measuring img loaded bytes and dimensions, broken down by detected format +# bytes_and_dimensions_by_format.sql + +CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' + +function pithyType( { contentType, url } ) { + const subtypeMap = { + 'svg+xml': 'svg', + 'svgz': 'svg', + 'jpeg': 'jpg', + 'jfif': 'jpg', + 'x-png': 'png', + 'vnd.microsoft.icon': 'ico', + 'x-icon': 'ico', + 'jxr': 'jxr', + 'vnd.ms-photo': 'jxr', + 'hdp': 'jxr', + 'wdp': 'jxr', + 'jpf': 'jp2', + 'jpx': 'jp2', + 'jpm': 'jp2', + 'mj2': 'jp2', + 'x-jp2-container': 'jp2', + 'x-jp2-codestream': 'jp2', + 'x-jpeg2000-image': 'jp2', + 'heic': 'heif', + 'x-ms-bmp': 'bmp', + 'x-pict': 'pict', + 'tif': 'tiff', + 'x-tif': 'tiff', + 'x-tiff': 'tiff', + 'vnd.mozilla.apng': 'apng', + // identities + 'apng': 'apng', + 'jpg': 'jpg', + 'jp2': 'jp2', + 'png': 'png', + 'gif': 'gif', + 'ico': 'ico', + 'webp': 'webp', + 'avif': 'avif', + 'tiff': 'tiff', + 'flif': 'flif', + 'heif': 'heif', + 'jxl': 'jxl', + 'avif-sequence': 'avif-sequence', // keep separate from single frames... + 'heic-sequence': 'heic-sequence', + 'bmp': 'bmp', + 'pict': 'pict' + }; + + function normalizeSubtype( subtype ) { + if ( subtypeMap[ subtype ] ) { + return subtypeMap[ subtype ]; + } + return 'unknown'; // switch between: + // `subtype` + // to see everything, check if there's anything else worth capturing + // `'unknown'` + // to make results manageable + } + + // if it's a data url, take the mime type from there, done. + if ( url && typeof url === "string" ) { + const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ ); + if ( match && match[ 1 ] ) { + return normalizeSubtype( match[ 1 ] ); + } + } + + // if we get a content-type header, use it! + if ( contentType && typeof contentType === "string" ) { + const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ ); + if ( match && match[ 1 ] ) { + return normalizeSubtype( match[ 1 ] ); + } + } + + // otherwise fall back to extension in the URL + if ( url && typeof url === "string" ) { + const splitOnSlashes = url.split("/"); + if ( splitOnSlashes.length > 1 ) { + const afterLastSlash = splitOnSlashes[ splitOnSlashes.length - 1 ], + splitOnDots = afterLastSlash.split("."); + if ( splitOnDots.length > 1 ) { + return normalizeSubtype( + splitOnDots[ splitOnDots.length - 1 ] + .toLowerCase() + .replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ) // strip query params + ); + } + } + } + + // otherwise throw up our hands + return 'unknown'; + } + + const parsed = JSON.parse( responsiveImagesJsonString ); + if ( parsed && parsed.map ) { + const dataRegEx = new RegExp('^data'); + return parsed.map( d => ({ + imgURL: d.url, + approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ), + approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ), + byteSize: Math.floor( d.byteSize || 0 ), + bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ), + isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1, + isDataURL: dataRegEx.test(d.url), + resourceFormat: pithyType({ contentType: d.mimeType, url: d.url }) + }) ); + } +'''; + +WITH imgs AS ( + SELECT + _TABLE_SUFFIX AS client, + url AS pageURL, + imgURL, + approximateResourceWidth, + approximateResourceHeight, + byteSize, + bitsPerPixel, + isPixel, + isDataURL, + (approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels, + (approximateResourceWidth / approximateResourceHeight) AS aspectRatio, + resourceFormat + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) +), + +percentiles AS ( + SELECT + client, + resourceFormat, + APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles, + APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles, + APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles, + APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles, + APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles, + APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles, + COUNT(0) AS imgCount + FROM + imgs + WHERE + approximateResourceWidth > 1 AND + approximateResourceHeight > 1 + GROUP BY + client, + resourceFormat +) + +SELECT + percentile, + client, + resourceFormat, + imgCount, + resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth, + resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight, + aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio, + megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels, + byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize, + bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel +FROM + percentiles, + UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile +ORDER BY + imgCount DESC, + percentile diff --git a/sql/2024/media/color_spaces_and_depth.sql b/sql/2024/media/color_spaces_and_depth.sql new file mode 100644 index 00000000000..3ba42cbac07 --- /dev/null +++ b/sql/2024/media/color_spaces_and_depth.sql @@ -0,0 +1,74 @@ +#standardSQL +# info on color depth and spaces – will need to be further filtered/aggregated, but this is everything +# color_spaces_and_depth.sql + +CREATE TEMPORARY FUNCTION magickMillions(imageMagickNumberString STRING) +RETURNS FLOAT64 +LANGUAGE js AS r''' + +if (!imageMagickNumberString) { return 0; } +const matched = imageMagickNumberString.match( /(\d+)\.?(\d+)?M$/ ); +if ( matched && matched[1] ) { + if ( matched[2] ) { + // input had a decimal (e.g. "1.23456M") + return `${ matched[1] }.${ matched[2] }e6`; + } else { + // input did not have a decimal (e.g. "1M") + return `${ matched[1] }e6`; + } +} else { + return imageMagickNumberString; +} + +'''; + +WITH color_info AS ( + SELECT + _TABLE_SUFFIX AS client, + JSON_EXTRACT(payload, '$._image_details.magick.depth') AS color_depth, + JSON_EXTRACT(payload, '$._image_details.magick.colorspace') AS color_space, + JSON_EXTRACT(payload, '$._image_details.magick.properties.icc:description') AS description + FROM + `requests.2024_06_01_*` + WHERE + JSON_QUERY(payload, '$._image_details.magick.colorspace') IS NOT NULL AND + magickMillions(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 +), + +totals AS ( + SELECT + client, + COUNT(0) AS total + FROM + color_info + GROUP BY + client +), + +counts AS ( + SELECT + client, + color_depth, + color_space, + description, + COUNT(0) AS count + FROM + color_info + GROUP BY + client, + color_depth, + color_space, + description +) + +SELECT + *, + count / total AS pct +FROM + counts +JOIN + totals +USING + (client) +ORDER BY + count DESC diff --git a/sql/2024/media/image_0x0.sql b/sql/2024/media/image_0x0.sql new file mode 100644 index 00000000000..f23f2c2e0a8 --- /dev/null +++ b/sql/2024/media/image_0x0.sql @@ -0,0 +1,48 @@ +#standardSQL +# s whose sources have no dimensions (mostly not loaded because loading=lazy) +# image_0x0.sql + +CREATE TEMPORARY FUNCTION getPixelInfo(responsiveImagesJsonString STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +const parsed = JSON.parse(responsiveImagesJsonString); +if (parsed && parsed.map) { + const dataRegEx = new RegExp('^data'); + return parsed.map(d => ({ + isPixel: d.approximateResourceWidth == 0 && d.approximateResourceHeight == 0, + isDataURL: dataRegEx.test(d.url) + })); +} +'''; + +WITH imgs AS ( + SELECT + _TABLE_SUFFIX AS client, + isPixel, + isDataURL + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getPixelInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) +), + +counts AS ( + SELECT + client, + COUNT(0) AS total_imgs, + COUNTIF(isPixel) AS zero_pixel_imgs, + COUNTIF(isPixel AND isDataURL) AS zero_pixel_data_urls + FROM + imgs + GROUP BY + client +) + +SELECT + client, + total_imgs, + zero_pixel_imgs, + zero_pixel_data_urls, + SAFE_DIVIDE(zero_pixel_imgs, total_imgs) AS pct_zero_pixel_imgs, + SAFE_DIVIDE(zero_pixel_data_urls, total_imgs) AS pct_zero_pixel_data_urls +FROM + counts diff --git a/sql/2024/media/image_1x1.sql b/sql/2024/media/image_1x1.sql new file mode 100644 index 00000000000..9c78abd2649 --- /dev/null +++ b/sql/2024/media/image_1x1.sql @@ -0,0 +1,48 @@ +#standardSQL +# s whose sources are 1x1 (tracking beacons, mostly) +# image_1x1.sql + +CREATE TEMPORARY FUNCTION getPixelInfo(responsiveImagesJsonString STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +const parsed = JSON.parse(responsiveImagesJsonString); +if (parsed && parsed.map) { + const dataRegEx = new RegExp('^data'); + return parsed.map(d => ({ + isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1, + isDataURL: dataRegEx.test(d.url) + })); +} +'''; + +WITH imgs AS ( + SELECT + _TABLE_SUFFIX AS client, + isPixel, + isDataURL + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getPixelInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) +), + +counts AS ( + SELECT + client, + COUNT(0) AS total_imgs, + COUNTIF(isPixel) AS one_pixel_imgs, + COUNTIF(isPixel AND isDataURL) AS one_pixel_data_urls + FROM + imgs + GROUP BY + client +) + +SELECT + client, + total_imgs, + one_pixel_imgs, + one_pixel_data_urls, + SAFE_DIVIDE(one_pixel_imgs, total_imgs) AS pct_one_pixel_imgs, + SAFE_DIVIDE(one_pixel_data_urls, total_imgs) AS pct_one_pixel_data_urls +FROM + counts diff --git a/sql/2024/media/image_alt.sql b/sql/2024/media/image_alt.sql new file mode 100644 index 00000000000..7f722189b7b --- /dev/null +++ b/sql/2024/media/image_alt.sql @@ -0,0 +1,49 @@ +#standardSQL +# usage of alt text in images +# image_alt.sql + +CREATE TEMPORARY FUNCTION get_markup_info(markup_string STRING) +RETURNS STRUCT< + total INT64, + alt_missing INT64, + alt_blank INT64, + alt_present INT64, + decode_lazy INT64 +> LANGUAGE js AS ''' +var result = {}; +try { + var markup = JSON.parse(markup_string); + + if (Array.isArray(markup) || typeof markup != 'object') return result; + + result.total = markup.images.img.total; + result.alt_missing = markup.images.img.alt.missing; + result.alt_blank = markup.images.img.alt.blank; + result.alt_present = markup.images.img.alt.present; + result.decode_lazy = markup.images.img.decoding || 0; + +} catch (e) {} +return result; +'''; + +SELECT + client, + SAFE_DIVIDE(COUNTIF(markup_info.total > 0), COUNT(0)) AS pages_with_img_pct, + SAFE_DIVIDE(COUNTIF(markup_info.alt_missing > 0), COUNT(0)) AS pages_with_alt_missing_pct, + SAFE_DIVIDE(COUNTIF(markup_info.alt_blank > 0), COUNT(0)) AS pages_with_alt_blank_pct, + SAFE_DIVIDE(COUNTIF(markup_info.alt_present > 0), COUNT(0)) AS pages_with_alt_present_pct, + SUM(markup_info.total) AS img_total, + SAFE_DIVIDE(SUM(markup_info.alt_missing), SUM(markup_info.total)) AS imgs_alt_missing_pct, + SAFE_DIVIDE(SUM(markup_info.alt_blank), SUM(markup_info.total)) AS img_alt_blank_pct, + SAFE_DIVIDE(SUM(markup_info.alt_present), SUM(markup_info.total)) AS img_alt_present_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + url, + get_markup_info(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS markup_info + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/image_cdn_usage.sql b/sql/2024/media/image_cdn_usage.sql new file mode 100644 index 00000000000..ef4540b7b5e --- /dev/null +++ b/sql/2024/media/image_cdn_usage.sql @@ -0,0 +1,37 @@ +#standardSQL +# Looking for some obvious signs in the URL that the image is dynamically resized +# image_cdn_usage.sql + +CREATE TEMPORARY FUNCTION get_images(images_string STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +var result = []; +try { + var images = JSON.parse(images_string); + for (const img of images){ + result.push({ + url: img.url + }) + } +} catch (e) {} +return result; +'''; + +SELECT + client, + COUNT(DISTINCT pageUrl) AS pages, + COUNT(0) AS images, + SAFE_DIVIDE(COUNTIF(imgcdn1), COUNT(0)) AS img_with_cdn1_pct, + SAFE_DIVIDE(COUNTIF(imgcdn2), COUNT(0)) AS img_with_cdn2_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + a.url AS pageUrl, + imageurl.url, + REGEXP_CONTAINS(imageurl.url, r'.*[,\/]w_\d+.*') AS imgcdn1, + REGEXP_CONTAINS(imageurl.url, r'\?.*w=.*') AS imgcdn2 + FROM + `httparchive.pages.2024_06_01_*` AS a, + UNNEST(get_images(JSON_EXTRACT_SCALAR(payload, '$._Images'))) AS imageurl) +GROUP BY + client diff --git a/sql/2024/media/image_decode_usage.sql b/sql/2024/media/image_decode_usage.sql new file mode 100644 index 00000000000..e793b6285a2 --- /dev/null +++ b/sql/2024/media/image_decode_usage.sql @@ -0,0 +1,48 @@ +#standardSQL +# What percent of s have decode=async? +# image_decode_usage.sql + +CREATE TEMPORARY FUNCTION get_decode_info(images_string STRING) +RETURNS STRUCT< + total INT64, + decode_async INT64 +> LANGUAGE js AS ''' +let result = {}; +try { + let images = JSON.parse(images_string); + if (!Array.isArray(images)) { + return {}; + } + + result.total = images.length; + result.decode_async = 0; + + for(let img of images) { + if(img.decoding === 'async'){ + result.decode_async += 1 + } + } +} catch (e) {} +return result; +'''; + +SELECT + client, + COUNT(0) AS pages_total, + SAFE_DIVIDE(COUNTIF(images_info.total > 0), COUNT(0)) AS pages_with_img_pct, + COUNTIF(images_info.decode_async > 0) AS pages_with_decode_async, + SAFE_DIVIDE(COUNTIF(images_info.decode_async > 0), COUNT(0)) AS pages_with_decode_async_pct, + SUM(images_info.total) AS img_total, + SUM(images_info.decode_async) AS imgs_with_decode_async, + SAFE_DIVIDE(SUM(images_info.decode_async), SUM(images_info.total)) AS imgs_with_decode_async_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + url, + get_decode_info(JSON_EXTRACT_SCALAR(payload, '$._Images')) AS images_info + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/image_layout_widths.sql b/sql/2024/media/image_layout_widths.sql new file mode 100644 index 00000000000..8b81c957f21 --- /dev/null +++ b/sql/2024/media/image_layout_widths.sql @@ -0,0 +1,42 @@ +#standardSQL +# Distribution of image layout widths +# image_layout_widths.sql + +CREATE TEMPORARY FUNCTION layoutDimensions(payload STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.map(({clientWidth, clientHeight}) => ({ + clientWidth, + clientHeight + })); +} catch (e) { + return []; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(image.clientWidth, 1000)[OFFSET(percentile * 10)] AS clientWidth +FROM ( + SELECT + _TABLE_SUFFIX AS client, + image + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(layoutDimensions(payload)) AS image + WHERE + image.clientWidth > 1 +), +UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/image_sizes_attribute_strings.sql b/sql/2024/media/image_sizes_attribute_strings.sql new file mode 100644 index 00000000000..ea4df6040a0 --- /dev/null +++ b/sql/2024/media/image_sizes_attribute_strings.sql @@ -0,0 +1,24 @@ +#standardSQL +# What are the most common values for sizes attributes (lots of auto!) +# ❕ Updated in 2024 to use all.requests instead of almanac.summary_response_bodies +# image_sizes_attribute_strings.sql + +SELECT + client, + sizes, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY 0) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY 0) AS pct +FROM + `httparchive.all.requests`, + UNNEST(REGEXP_EXTRACT_ALL(response_body, r'(?im)<(?:source|img)[^>]*sizes=[\'"]?([^\'"]*)')) AS sizes +WHERE + date = '2024-06-01' AND + is_main_document +GROUP BY + client, + sizes +ORDER BY + freq DESC +LIMIT + 100 diff --git a/sql/2024/media/image_sizes_per_page_accuracy_impact.sql b/sql/2024/media/image_sizes_per_page_accuracy_impact.sql new file mode 100644 index 00000000000..0c5c08f9df3 --- /dev/null +++ b/sql/2024/media/image_sizes_per_page_accuracy_impact.sql @@ -0,0 +1,89 @@ +#standardSQL +# measure the impact of innacurate sizes attributes per page +# image_sizes_per_page_accuracy_impact.sql + +CREATE TEMPORARY FUNCTION pageUsesWDescriptors(payload STRING) +RETURNS BOOL +LANGUAGE js AS ''' +try { + + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.reduce( ( acc, cv ) => { + if ( cv.srcsetHasWDescriptors ) { + acc = true; + } + return acc; + }, false ); + +} catch (e) { + return false; +} +'''; + +CREATE TEMPORARY FUNCTION getTotalEstimatedWastedLoadedPixels(payload STRING) +RETURNS INT64 +LANGUAGE js AS ''' +try { + + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.reduce( ( acc, cv ) => { + if ( cv.actualSizesEstimatedWastedLoadedPixels > 0 ) { + acc += cv.actualSizesEstimatedWastedLoadedPixels; + } + return acc; + }, 0 ); + +} catch (e) { + return null; +} +'''; + +CREATE TEMPORARY FUNCTION getTotalEstimatedWastedLoadedBytes(payload STRING) +RETURNS FLOAT64 +LANGUAGE js AS ''' +try { + + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.reduce( ( acc, cv ) => { + if ( cv.actualSizesEstimatedWastedLoadedBytes > 0 ) { + acc += cv.actualSizesEstimatedWastedLoadedBytes; + } + return acc; + }, 0 ); + +} catch (e) { + return null; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(totalEstimatedWastedLoadedPixels, 1000)[OFFSET(percentile * 10)] AS totalEstimatedWastedLoadedPixels, + APPROX_QUANTILES(totalEstimatedWastedLoadedBytes, 1000)[OFFSET(percentile * 10)] AS totalEstimatedWastedLoadedBytes +FROM ( + SELECT + _TABLE_SUFFIX AS client, + getTotalEstimatedWastedLoadedPixels(payload) AS totalEstimatedWastedLoadedPixels, + getTotalEstimatedWastedLoadedBytes(payload) AS totalEstimatedWastedLoadedBytes + FROM + `httparchive.pages.2024_06_01_*` + WHERE + pageUsesWDescriptors(payload) = TRUE +), +UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/image_sizing_extrinsic_intrinsic.sql b/sql/2024/media/image_sizing_extrinsic_intrinsic.sql new file mode 100644 index 00000000000..41d75a1e0eb --- /dev/null +++ b/sql/2024/media/image_sizing_extrinsic_intrinsic.sql @@ -0,0 +1,37 @@ +#standardSQL +# Are s sized extrinsically or intrinsically (both height and width)? +# image_sizing_extrinsic_intrinsic.sql + +CREATE TEMPORARY FUNCTION getImageSizing(payload STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.flatMap(({intrinsicOrExtrinsicSizing}) => ([ + {property: 'width', value: intrinsicOrExtrinsicSizing.width}, + {property: 'height', value: intrinsicOrExtrinsicSizing.height} + ])); +} catch (e) { + return []; +} +'''; + +SELECT + _TABLE_SUFFIX AS client, + image.property, + image.value, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX, image.property) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX, image.property) AS pct +FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getImageSizing(payload)) AS image +GROUP BY + client, + property, + value +ORDER BY + pct DESC diff --git a/sql/2024/media/image_srcset_candidates.sql b/sql/2024/media/image_srcset_candidates.sql new file mode 100644 index 00000000000..58ed7a7b558 --- /dev/null +++ b/sql/2024/media/image_srcset_candidates.sql @@ -0,0 +1,39 @@ +#standardSQL +# distribution of number of srcset candidates +# image_srcset_candidates.sql + +CREATE TEMPORARY FUNCTION getNumberOfSrcsetCandidates(payload STRING) +RETURNS ARRAY +LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.map(({srcsetCandidateDensities}) => + (srcsetCandidateDensities && srcsetCandidateDensities.length) ? srcsetCandidateDensities.length : null + ); +} catch (e) { + return []; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(numberOfCandidates, 1000)[OFFSET(percentile * 10)] AS numberOfCandidates +FROM ( + SELECT + _TABLE_SUFFIX AS client, + numberOfCandidates + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getNumberOfSrcsetCandidates(payload)) AS numberOfCandidates +), +UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/image_srcset_densities.sql b/sql/2024/media/image_srcset_densities.sql new file mode 100644 index 00000000000..94457ad7847 --- /dev/null +++ b/sql/2024/media/image_srcset_densities.sql @@ -0,0 +1,41 @@ +#standardSQL +# distribution of the densities of srcset candidates +# image_srcset_densities.sql + +CREATE TEMPORARY FUNCTION getSrcsetDensities(payload STRING) +RETURNS ARRAY>> +LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.map(({currentSrcDensity, srcsetCandidateDensities}) => ({ + currentSrcDensity, + srcsetCandidateDensities: srcsetCandidateDensities.map(density => Math.round(density * 100) / 100) + })); +} catch (e) { + return []; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(image.currentSrcDensity, 1000)[OFFSET(percentile * 10)] AS currentSrcDensity, + APPROX_QUANTILES(srcsetCandidateDensity, 1000)[OFFSET(percentile * 10)] AS srcsetCandidateDensity +FROM ( + SELECT + _TABLE_SUFFIX AS client, + image + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getSrcsetDensities(payload)) AS image), + UNNEST(image.srcsetCandidateDensities) AS srcsetCandidateDensity, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/image_srcset_descriptor.sql b/sql/2024/media/image_srcset_descriptor.sql new file mode 100644 index 00000000000..91f7141f85a --- /dev/null +++ b/sql/2024/media/image_srcset_descriptor.sql @@ -0,0 +1,41 @@ +#standardSQL +# images with srcset descriptor_x descriptor_w +# image_srcset_descriptor.sql + +CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) +RETURNS STRUCT< + num_srcset_all INT64, + num_srcset_descriptor_x INT64, + num_srcset_descriptor_w INT64 +> LANGUAGE js AS ''' +var result = {}; +try { + var media = JSON.parse(media_string); + + if (Array.isArray(media) || typeof media != 'object') return result; + + result.num_srcset_all = media.num_srcset_all; + result.num_srcset_descriptor_x = media.num_srcset_descriptor_x; + result.num_srcset_descriptor_w = media.num_srcset_descriptor_w; + +} catch (e) {} +return result; +'''; + +SELECT + client, + SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct, + SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_x > 0), COUNT(0)) AS pages_with_srcset_descriptor_x_pct, + SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_w > 0), COUNT(0)) AS pages_with_srcset_descriptor_w_pct, + SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_x), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_x_pct, + SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_w), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_w_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/image_srcset_sizes.sql b/sql/2024/media/image_srcset_sizes.sql new file mode 100644 index 00000000000..7bc616039dd --- /dev/null +++ b/sql/2024/media/image_srcset_sizes.sql @@ -0,0 +1,39 @@ +#standardSQL +# images (and pages) with srcset w/wo sizes +# image_srcset_sizes.sql + +CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) +RETURNS STRUCT< + num_srcset_all INT64, + num_srcset_sizes INT64 +> LANGUAGE js AS ''' +var result = {}; +try { + var media = JSON.parse(media_string); + + if (Array.isArray(media) || typeof media != 'object') return result; + + result.num_srcset_all = media.num_srcset_all; + result.num_srcset_sizes = media.num_srcset_sizes; + +} catch (e) {} +return result; +'''; + +SELECT + client, + SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct, + SAFE_DIVIDE(COUNTIF(media_info.num_srcset_sizes > 0), COUNT(0)) AS pages_with_srcset_sizes_pct, + SAFE_DIVIDE((COUNTIF(media_info.num_srcset_all > 0) - COUNTIF(media_info.num_srcset_sizes > 0)), COUNT(0)) AS pages_with_srcset_wo_sizes_pct, + SAFE_DIVIDE(SUM(media_info.num_srcset_sizes), SUM(media_info.num_srcset_all)) AS instances_of_srcset_sizes_pct, + SAFE_DIVIDE((SUM(media_info.num_srcset_all) - SUM(media_info.num_srcset_sizes)), SUM(media_info.num_srcset_all)) AS instances_of_srcset_wo_sizes_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/image_srcset_sizes_accuracy_pct.sql b/sql/2024/media/image_srcset_sizes_accuracy_pct.sql new file mode 100644 index 00000000000..fdbd99bd1d3 --- /dev/null +++ b/sql/2024/media/image_srcset_sizes_accuracy_pct.sql @@ -0,0 +1,39 @@ +#standardSQL +# How accurate are srcset and sizes attributes, and what is the impact of sizes innacuracies on srcset selection? +# image_srcset_sizes_accuracy_pct.sql + +CREATE TEMPORARY FUNCTION getSrcsetSizesAccuracy(payload STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.map(({srcsetHasWDescriptors, sizesAbsoluteError, sizesRelativeError, wDescriptorAbsoluteError, actualSizesEstimatedWastedLoadedPixels}) => ({ + srcsetHasWDescriptors, + sizesAbsoluteError, + sizesRelativeError, + wDescriptorAbsoluteError, + actualSizesEstimatedWastedLoadedPixels + })); +} catch (e) { + return []; +} +'''; + +SELECT + _TABLE_SUFFIX AS client, + COUNTIF(image.sizesRelativeError > 0.05) AS some_sizes_error, + COUNTIF(image.wDescriptorAbsoluteError > 0) AS any_w_descriptor_error, + COUNTIF(image.actualSizesEstimatedWastedLoadedPixels > 0) AS sizes_error_impacted_srcset_selection, + COUNT(0) AS total_images, + COUNTIF(image.srcsetHasWDescriptors) AS total_images_with_w_descriptors, + COUNTIF(image.sizesRelativeError > 0.05) / COUNTIF(image.srcsetHasWDescriptors) AS pct_some_sizes_error, + COUNTIF(image.wDescriptorAbsoluteError > 0) / COUNTIF(image.srcsetHasWDescriptors) AS pct_any_w_descriptor_error, + COUNTIF(image.actualSizesEstimatedWastedLoadedPixels > 0) / COUNTIF(image.srcsetHasWDescriptors) AS pct_sizes_error_impacted_srcset_selection +FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getSrcsetSizesAccuracy(payload)) AS image +GROUP BY + client diff --git a/sql/2024/media/image_srcset_sizes_per_image_accuracy_impact.sql b/sql/2024/media/image_srcset_sizes_per_image_accuracy_impact.sql new file mode 100644 index 00000000000..825d0023a35 --- /dev/null +++ b/sql/2024/media/image_srcset_sizes_per_image_accuracy_impact.sql @@ -0,0 +1,67 @@ +#standardSQL +# measure the impact of innacurate sizes attributes per +# image_srcset_sizes_per_image_accuracy_impact.sql + +CREATE TEMPORARY FUNCTION getSrcsetSizesAccuracy(payload STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.map(({ + sizesAbsoluteError, + sizesRelativeError, + wDescriptorAbsoluteError, + wDescriptorRelativeError, + idealSizesSelectedResourceEstimatedPixels, + actualSizesEstimatedWastedLoadedPixels, + actualSizesEstimatedWastedLoadedBytes + }) => { + let wastedLoadedPercent; + if ( idealSizesSelectedResourceEstimatedPixels > 0 ) { + wastedLoadedPercent = actualSizesEstimatedWastedLoadedPixels / idealSizesSelectedResourceEstimatedPixels; + } else { + wastedLoadedPercent = null; + } + return { + sizesAbsoluteError, + sizesRelativeError, + wDescriptorAbsoluteError, + wDescriptorRelativeError, + actualSizesEstimatedWastedLoadedPixels, + actualSizesEstimatedWastedLoadedBytes, + wastedLoadedPercent + }; + } +); +} catch (e) { + return []; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(image.sizesAbsoluteError, 1000)[OFFSET(percentile * 10)] AS sizesAbsoluteError, + APPROX_QUANTILES(image.sizesRelativeError, 1000)[OFFSET(percentile * 10)] AS sizesRelativeError, + APPROX_QUANTILES(image.wDescriptorAbsoluteError, 1000)[OFFSET(percentile * 10)] AS wDescriptorAbsoluteError, + APPROX_QUANTILES(image.wDescriptorRelativeError, 1000)[OFFSET(percentile * 10)] AS wDescriptorRelativeError, + APPROX_QUANTILES(image.actualSizesEstimatedWastedLoadedPixels, 1000)[OFFSET(percentile * 10)] AS actualSizesEstimatedWastedLoadedPixels, + APPROX_QUANTILES(image.actualSizesEstimatedWastedLoadedBytes, 1000)[OFFSET(percentile * 10)] AS actualSizesEstimatedWastedLoadedBytes, + APPROX_QUANTILES(image.wastedLoadedPercent, 1000)[OFFSET(percentile * 10)] AS wastedLoadedPercent +FROM ( + SELECT + _TABLE_SUFFIX AS client, + image + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getSrcsetSizesAccuracy(payload)) AS image), + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/img_sizes_explicit_errors.sql b/sql/2024/media/img_sizes_explicit_errors.sql new file mode 100644 index 00000000000..f2c7db437a2 --- /dev/null +++ b/sql/2024/media/img_sizes_explicit_errors.sql @@ -0,0 +1,44 @@ +#standardSQL +# how many sizes attributes had explicit errors? +# also how many were implicit (100vw) +# img_sizes_explicit_errors.sql + +CREATE TEMPORARY FUNCTION get_responsive_settings(images_string STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +let result = []; +try { +const images_ = JSON.parse(images_string); +if (images_ && images_["responsive-images"]) { + const images = images_["responsive-images"]; + for(const img of images) { + result.push({ + sizes: img.hasSizes || false, + srcsetHasWDescriptors: img.srcsetHasWDescriptors || false, + sizesWasImplicit: img.sizesWasImplicit || false, + sizesParseError: img.sizesParseError || false + }) + } +} +} catch (e) {} +return result; +'''; +SELECT + client, + COUNT(0) AS images_with_sizes, + SAFE_DIVIDE(COUNTIF(respimg.sizesWasImplicit = TRUE), COUNT(0)) AS implicit_pct, + SAFE_DIVIDE(COUNTIF(respimg.sizesWasImplicit = FALSE), COUNT(0)) AS explicit_pct, + SAFE_DIVIDE(COUNTIF(respimg.sizesParseError = TRUE), COUNT(0)) AS parseError_pct, + SAFE_DIVIDE(COUNTIF(respimg.srcsetHasWDescriptors = TRUE), COUNT(0)) AS wDescriptor_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + a.url AS pageUrl, + respimg + FROM + `httparchive.pages.2024_06_01_*` AS a, + UNNEST(get_responsive_settings(JSON_EXTRACT_SCALAR(payload, '$._responsive_images'))) AS respimg + WHERE + respimg.srcsetHasWDescriptors) +GROUP BY + client diff --git a/sql/2024/media/img_with_dimensions.sql b/sql/2024/media/img_with_dimensions.sql new file mode 100644 index 00000000000..0b8b1caf05d --- /dev/null +++ b/sql/2024/media/img_with_dimensions.sql @@ -0,0 +1,53 @@ +#standardSQL +# extracts the data about width, height and alt +# using this, counts and reports on the usage for each attribute +# img_with_dimensions.sql + +CREATE TEMPORARY FUNCTION get_image_info(responsiveString STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +try { + let result = Array() + const responsiveImages = JSON.parse(responsiveString) + if(responsiveImages && responsiveImages['responsive-images']){ + for(const image of responsiveImages["responsive-images"]){ + if ( image.approximateResourceWidth > 1 || image.approximateResourceHeight > 1 ) { + result.push({ + hasWidth: image.hasWidth ? 1 : 0, + hasHeight: image.hasHeight ? 1 : 0, + hasAlt: image.hasAlt ? 1 : 0, + hasReservedLayoutDimension: image.reservedLayoutDimensions ? 1 : 0 + }) + } + }} + return result +} catch(e) { + return []; +} +'''; + +SELECT + client, + COUNT(0) AS images, + COUNTIF(hasWidth = 1) AS hasWidth, + COUNTIF(hasHeight = 1) AS hasHeight, + COUNTIF(hasWidth = 1 AND hasHeight = 1) AS hasBoth, + COUNTIF(hasAlt = 1) AS hasAlt, + COUNTIF(hasReservedLayoutDimension = 1) AS hasDimensions, + SAFE_DIVIDE(COUNTIF(hasWidth = 1), COUNT(0)) AS percHasWidth, + SAFE_DIVIDE(COUNTIF(hasHeight = 1), COUNT(0)) AS percHasHeight, + SAFE_DIVIDE(COUNTIF(hasWidth = 1 AND hasHeight = 1), COUNT(0)) AS percHasBoth, + SAFE_DIVIDE(COUNTIF(hasAlt = 1), COUNT(0)) AS percHasAlt, + SAFE_DIVIDE(COUNTIF(hasReservedLayoutDimension = 1), COUNT(0)) AS percHasReservedLayoutDimensions +FROM ( + SELECT + _TABLE_SUFFIX AS client, + hasWidth, + hasHeight, + hasAlt, + hasReservedLayoutDimension + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(get_image_info(JSON_VALUE(payload, '$._responsive_images')))) +GROUP BY + client diff --git a/sql/2024/media/img_xdomain.sql b/sql/2024/media/img_xdomain.sql new file mode 100644 index 00000000000..e19e305173c --- /dev/null +++ b/sql/2024/media/img_xdomain.sql @@ -0,0 +1,36 @@ +#standardSQL +# Cross domain image requests +# Note to the future this would be more interesting if it looked at origins +# img_xdomain.sql + +CREATE TEMPORARY FUNCTION get_images(images_string STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +var result = []; +try { + var images = JSON.parse(images_string); + for (img of images){ + result.push({ + url: img.url + }); + } +} catch (e) {} +return result; +'''; +SELECT + client, + COUNT(DISTINCT pageUrl) AS pages, + count(0) AS images, + SAFE_DIVIDE(COUNTIF(pageDomain = imageDomain), COUNT(0)) AS img_xdomain_pct, + SAFE_DIVIDE(COUNTIF(pageDomain != imageDomain), COUNT(0)) AS img_samedomain_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + a.url AS pageUrl, + FORMAT('%T', NET.REG_DOMAIN(a.url)) AS pageDomain, + FORMAT('%T', NET.REG_DOMAIN(imageurl.url)) AS imageDomain + FROM + `httparchive.pages.2024_06_01_*` a, + UNNEST(get_images(JSON_EXTRACT_SCALAR(payload, '$._Images'))) AS imageurl) +GROUP BY + client diff --git a/sql/2024/media/imgs_per_page.sql b/sql/2024/media/imgs_per_page.sql new file mode 100644 index 00000000000..875dbbd3467 --- /dev/null +++ b/sql/2024/media/imgs_per_page.sql @@ -0,0 +1,39 @@ +#standardSQL +# Distribution of number-of-images per page +# imgs_per_page.sql + +CREATE TEMPORARY FUNCTION numberOfImages(images_string STRING) +RETURNS INT64 +LANGUAGE js AS ''' +try { + return JSON.parse(images_string).filter( i => parseInt(i.approximateResourceWidth) > 1 && parseInt(i.approximateResourceWidth) > 1 ).length; +} catch { + return 0; +} +'''; + +WITH numImgs AS ( + SELECT + _TABLE_SUFFIX AS client, + numberOfImages(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')) AS numberOfImages + FROM + `httparchive.pages.2024_06_01_*` +), + +percentiles AS ( + SELECT + client, + APPROX_QUANTILES(numberOfImages, 1000) AS numberOfImagesPercentiles + FROM + numImgs + GROUP BY + client +) + +SELECT + client, + percentile, + numberOfImagesPercentiles[OFFSET(percentile * 10)] AS numberOfImages +FROM + percentiles, + UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile diff --git a/sql/2024/media/largest_image_per_page_bytes.sql b/sql/2024/media/largest_image_per_page_bytes.sql new file mode 100644 index 00000000000..106d9dbd457 --- /dev/null +++ b/sql/2024/media/largest_image_per_page_bytes.sql @@ -0,0 +1,44 @@ +#standardSQL +# what's the largest image per page? (by bytes) +# largest_image_per_page_bytes.sql + +CREATE TEMPORARY FUNCTION largestImage(payload STRING) +RETURNS INT64 +LANGUAGE js AS ''' +try { + + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.reduce( ( acc, cv ) => { + const bytes = cv.byteSize; + if ( bytes > acc ) { + acc = bytes; + } + return acc; + }, 0 ); + +} catch (e) { + return 0; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(largestImageBytes, 1000)[OFFSET(percentile * 10)] AS largestImageBytesize +FROM ( + SELECT + _TABLE_SUFFIX AS client, + largestImage(payload) AS largestImageBytes + FROM + `httparchive.pages.2024_06_01_*` +), +UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/largest_image_per_page_layout.sql b/sql/2024/media/largest_image_per_page_layout.sql new file mode 100644 index 00000000000..1bb7d39e452 --- /dev/null +++ b/sql/2024/media/largest_image_per_page_layout.sql @@ -0,0 +1,44 @@ +#standardSQL +# what's the largest image per page? (by *css* pixels) +# largest_image_per_page_layout.sql + +CREATE TEMPORARY FUNCTION widestImage(payload STRING) +RETURNS INT64 +LANGUAGE js AS ''' +try { + + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.reduce( ( acc, cv ) => { + const w = cv.clientWidth; + if ( w > acc ) { + acc = w; + } + return acc; + }, 0 ); + +} catch (e) { + return 0; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(widestImageCSSpx, 1000)[OFFSET(percentile * 10)] AS widestImageLayoutWidth +FROM ( + SELECT + _TABLE_SUFFIX AS client, + widestImage(payload) AS widestImageCSSpx + FROM + `httparchive.pages.2024_06_01_*` +), +UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/largest_image_per_page_pixels.sql b/sql/2024/media/largest_image_per_page_pixels.sql new file mode 100644 index 00000000000..9dbf77f2c75 --- /dev/null +++ b/sql/2024/media/largest_image_per_page_pixels.sql @@ -0,0 +1,44 @@ +#standardSQL +# what's the largest image per page? (by pixels) +# largest_image_per_page_pixels.sql + +CREATE TEMPORARY FUNCTION largestImage(payload STRING) +RETURNS INT64 +LANGUAGE js AS ''' +try { + + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.reduce( ( acc, cv ) => { + const pixels = cv.approximateResourceWidth * cv.approximateResourceHeight; + if ( pixels > acc ) { + acc = pixels; + } + return acc; + }, 0 ); + +} catch (e) { + return 0; +} +'''; + +SELECT + percentile, + client, + APPROX_QUANTILES(largestImageMegapixels, 1000)[OFFSET(percentile * 10)] AS largestImagePixelCount +FROM ( + SELECT + _TABLE_SUFFIX AS client, + largestImage(payload) / 1e6 AS largestImageMegapixels + FROM + `httparchive.pages.2024_06_01_*` +), +UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/lazy_loading_adoption_over_time.sql b/sql/2024/media/lazy_loading_adoption_over_time.sql new file mode 100644 index 00000000000..408d0d1807b --- /dev/null +++ b/sql/2024/media/lazy_loading_adoption_over_time.sql @@ -0,0 +1,19 @@ +#standardSQL +# How has lazy-loading adoption changed over time? +# copied from https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2021/resource-hints/imgLazy.sql +# lazy_loading_adoption_over_time.sql + +SELECT + SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, + IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + COUNT(DISTINCT IF(LOWER(attr) = '"lazy"', url, NULL)) / COUNT(DISTINCT url) AS percent +FROM + `httparchive.pages.*` +LEFT JOIN + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, "$['_img-loading-attr']"), '$')) AS attr +GROUP BY + date, + client +ORDER BY + date DESC, + client diff --git a/sql/2024/media/lcp_element_data.sql b/sql/2024/media/lcp_element_data.sql new file mode 100644 index 00000000000..8312b4cb146 --- /dev/null +++ b/sql/2024/media/lcp_element_data.sql @@ -0,0 +1,96 @@ +#standardSQL +# LCP element node details +# How many LCPs had images? (either s or as background images)? +# How many of those s were lazy-loaded (anti-pattern!) +# lcp_element_data.sql + +CREATE TEMP FUNCTION getLoadingAttr(attributes STRING) RETURNS STRING LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const loadingAttr = data.find(attr => attr["name"] === "loading") + return loadingAttr.value + } catch (e) { + return ""; + } +'''; + +CREATE TEMP FUNCTION getDecodingAttr(attributes STRING) RETURNS STRING LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const decodingAttr = data.find(attr => attr["name"] === "decoding") + return decodingAttr.value + } catch (e) { + return ""; + } +'''; + +CREATE TEMP FUNCTION getLoadingClasses(attributes STRING) RETURNS STRING LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const classes = data.find(attr => attr["name"] === "class").value + if (classes.indexOf('lazyload') !== -1) { + return classes + } else { + return "" + } + } catch (e) { + return ""; + } +'''; + + + +WITH +lcp_stats AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') AS nodeName, + JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.url') AS elementUrl, + CAST(JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.size') AS INT64) AS size, + CAST(JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.loadTime') AS FLOAT64) AS loadTime, + CAST(JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.startTime') AS FLOAT64) AS startTime, + CAST(JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.renderTime') AS FLOAT64) AS renderTime, + JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes') AS attributes, + getLoadingAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) AS loading, + getDecodingAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) AS decoding, + getLoadingClasses(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) AS classWithLazyload + FROM + `httparchive.pages.2024_06_01_*` +) + +SELECT + client, + nodeName, + COUNT(DISTINCT url) AS pages, + ANY_VALUE(total) AS total, + COUNT(DISTINCT url) / ANY_VALUE(total) AS pct, + COUNTIF(elementUrl != '') AS haveImages, + COUNTIF(elementUrl != '') / COUNT(DISTINCT url) AS pct_haveImages, + COUNTIF(loading = 'eager') AS native_eagerload, + COUNTIF(loading = 'lazy') AS native_lazyload, + COUNTIF(classWithLazyload != '') AS lazyload_class, + COUNTIF(classWithLazyload != '' OR loading = 'lazy') AS probably_lazyLoaded, + COUNTIF(classWithLazyload != '' OR loading = 'lazy') / COUNT(DISTINCT url) AS pct_prob_lazyloaded, + COUNTIF(decoding = 'async') AS async_decoding, + COUNTIF(decoding = 'sync') AS sync_decoding, + COUNTIF(decoding = 'auto') AS auto_decoding +FROM + lcp_stats +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total + FROM + `httparchive.summary_pages.2024_06_01_*` + GROUP BY + _TABLE_SUFFIX) +USING + (client) +GROUP BY + client, + nodeName +HAVING + pages > 1000 +ORDER BY + pct DESC diff --git a/sql/2024/media/media_formats.sql b/sql/2024/media/media_formats.sql new file mode 100644 index 00000000000..ba0ef393800 --- /dev/null +++ b/sql/2024/media/media_formats.sql @@ -0,0 +1,54 @@ +#standardSQL +# Format adoption +# media_formats.sql +# ❕ Updated in 2024 to use all.requests instead of almanac.requests + +CREATE TEMPORARY FUNCTION fixMimeType(mimeType STRING) +RETURNS STRING +LANGUAGE js AS ''' +if (mimeType === "image/avif") { + return "avif"; +} else if (mimeType === "image/webp" || mimeType==="webp") { + return "webp"; +} else if (mimeType === 'image/jpg' || mimeType === 'image/jpeg') { + return 'jpg'; +} else if (mimeType === 'image/png' || mimeType === 'Image/png') { + return 'png'; +} else if (mimeType === 'image/gif') { + return 'gif'; +} else if (mimeType === 'image/svg+xml') { + return 'svg'; +} else if (mimeType === 'image/x-icon' || mimeType === 'image/vnd.microsoft.icon') { + return 'ico'; +} else if (mimeType === 'image/bmp') { + return 'bmp'; +} else { + return 'other/unknown'; +} +'''; + +SELECT + client, + trueFormat, + COUNT(DISTINCT NET.HOST(url)) AS hosts, + COUNT(DISTINCT page) AS pages, + COUNT(0) AS freqImages, + SUM(COUNT(0)) OVER (PARTITION BY client) AS totalImages, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pctImages +FROM ( + SELECT + client, + page, + url, + fixMimeType(JSON_VALUE(payload, '$.response.content.mimeType')) AS trueFormat + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + type = 'image' AND + JSON_VALUE(summary, '$.respBodySize') IS NOT NULL) +GROUP BY + client, + trueFormat +ORDER BY + pctImages DESC diff --git a/sql/2024/media/picture_distribution.sql b/sql/2024/media/picture_distribution.sql new file mode 100644 index 00000000000..d4ecea5f8c4 --- /dev/null +++ b/sql/2024/media/picture_distribution.sql @@ -0,0 +1,24 @@ +#standardSQL +# Number of picture elements per page +# picture_distribution.sql + +SELECT + percentile, + client, + COUNTIF(picture > 0) AS pages, + COUNT(0) AS total, + COUNTIF(picture > 0) / COUNT(0) AS pct, + APPROX_QUANTILES(IF(picture > 0, picture, NULL), 1000)[OFFSET(percentile * 10)] AS picture_elements_per_page +FROM ( + SELECT + _TABLE_SUFFIX AS client, + CAST(JSON_QUERY(JSON_VALUE(payload, '$._media'), '$.num_picture_img') AS INT64) AS picture + FROM + `httparchive.pages.2024_06_01_*`), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/picture_format_distribution.sql b/sql/2024/media/picture_format_distribution.sql new file mode 100644 index 00000000000..099a96fbac2 --- /dev/null +++ b/sql/2024/media/picture_format_distribution.sql @@ -0,0 +1,59 @@ +#standardSQL +# picture formats distribution +# picture_format_distribution.sql + +# returns all the data we need from _media +CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) +RETURNS STRUCT< + num_picture_img INT64, + num_picture_formats INT64, + picture_formats ARRAY +> LANGUAGE js AS ''' +var result = {}; +try { + var media = JSON.parse(media_string); + + if (Array.isArray(media) || typeof media != 'object') return result; + + // fix "picture_formats":"[]" + if (!Array.isArray(media.picture_formats)) + { + media.picture_formats = JSON.parse(media.picture_formats); + } + + // skip "picture_formats":[{}] + if (media.picture_formats.length == 1 && Object.keys(media.picture_formats[0]).length === 0) + { + media.picture_formats = []; + } + + result.picture_formats = media.picture_formats; + result.num_picture_img = media.num_picture_img; + result.num_picture_formats = result.picture_formats.length; + +} catch (e) {} +return result; +'''; + +SELECT + client, + SAFE_DIVIDE(COUNTIF(media_info.num_picture_formats > 0), COUNTIF(media_info.num_picture_img > 0)) AS pages_with_picture_formats_pct, + SAFE_DIVIDE(COUNTIF(media_info.num_picture_formats = 1), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_picture_formats_1_pct, + SAFE_DIVIDE(COUNTIF(media_info.num_picture_formats = 2), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_picture_formats_2_pct, + SAFE_DIVIDE(COUNTIF(media_info.num_picture_formats = 3), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_picture_formats_3_pct, + SAFE_DIVIDE(COUNTIF(media_info.num_picture_formats >= 4), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_picture_formats_4_and_more_pct, + SAFE_DIVIDE(COUNTIF('image/webp' IN UNNEST(media_info.picture_formats)), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_webp_pct, + SAFE_DIVIDE(COUNTIF('image/gif' IN UNNEST(media_info.picture_formats)), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_gif_pct, + SAFE_DIVIDE(COUNTIF('image/jpg' IN UNNEST(media_info.picture_formats)), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_jpg_pct, + SAFE_DIVIDE(COUNTIF('image/png' IN UNNEST(media_info.picture_formats)), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_png_pct, + SAFE_DIVIDE(COUNTIF('image/avif' IN UNNEST(media_info.picture_formats)), COUNTIF(media_info.num_picture_formats > 0)) AS pages_with_avif_pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/picture_orientation.sql b/sql/2024/media/picture_orientation.sql new file mode 100644 index 00000000000..019a3e0988d --- /dev/null +++ b/sql/2024/media/picture_orientation.sql @@ -0,0 +1,40 @@ +#standardSQL +# picture using orientation +# picture_orientation.sql + +CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) +RETURNS STRUCT< + num_picture_img INT64, + num_picture_using_orientation INT64 +> LANGUAGE js AS ''' +var result = {}; +try { + var media = JSON.parse(media_string); + + if (Array.isArray(media) || typeof media != 'object') return result; + + result.num_picture_img = media.num_picture_img; + result.num_picture_using_orientation = media.num_picture_using_orientation; + +} catch (e) {} +return result; +'''; + +SELECT + client, + COUNTIF(media_info.num_picture_using_orientation > 0) AS picture_orientation_pages, + COUNTIF(media_info.num_picture_img > 0) AS total_picture_pages, + SAFE_DIVIDE(COUNTIF(media_info.num_picture_using_orientation > 0), COUNTIF(media_info.num_picture_img > 0)) AS pct_picture_orientation_pages, + SUM(media_info.num_picture_using_orientation) AS picture_orientation_images, + SUM(media_info.num_picture_img) AS total_picture_images, + SAFE_DIVIDE(SUM(media_info.num_picture_using_orientation), SUM(media_info.num_picture_img)) AS pct_picture_orientation_images +FROM ( + SELECT + _TABLE_SUFFIX AS client, + get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/picture_switching.sql b/sql/2024/media/picture_switching.sql new file mode 100644 index 00000000000..5ef4d108c74 --- /dev/null +++ b/sql/2024/media/picture_switching.sql @@ -0,0 +1,33 @@ +#standardSQL +# How many picture elements are doing media and type switching? +# picture_switching.sql + +CREATE TEMPORARY FUNCTION getPictureSwitching(payload STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var responsiveImages = JSON.parse($._responsive_images); + responsiveImages = responsiveImages['responsive-images']; + + return responsiveImages.filter(img => img.isInPicture).map(({pictureMediaSwitching, pictureTypeSwitching}) => ({ + pictureMediaSwitching, + pictureTypeSwitching + })); +} catch (e) { + return []; +} +'''; + +SELECT + _TABLE_SUFFIX AS client, + COUNTIF(image.pictureMediaSwitching) AS picture_media_switching, + COUNTIF(image.pictureTypeSwitching) AS picture_type_switching, + COUNT(0) AS total_picture, + COUNTIF(image.pictureMediaSwitching) / COUNT(0) AS pct_picture_media_switching, + COUNTIF(image.pictureTypeSwitching) / COUNT(0) AS pct_picture_type_switching +FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getPictureSwitching(payload)) AS image +GROUP BY + client diff --git a/sql/2024/media/picture_using_min_resolution.sql b/sql/2024/media/picture_using_min_resolution.sql new file mode 100644 index 00000000000..12d490fd5ef --- /dev/null +++ b/sql/2024/media/picture_using_min_resolution.sql @@ -0,0 +1,41 @@ +#standardSQL +# picture using min resolution +# picture_using_min_resolution.sql + +# returns all the data we need from _media +CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) +RETURNS STRUCT< + num_picture_using_min_resolution INT64, + num_picture_img INT64 +> LANGUAGE js AS ''' +var result = {}; +try { + var media = JSON.parse(media_string); + + if (Array.isArray(media) || typeof media != 'object') return result; + + result.num_picture_using_min_resolution = media.num_picture_using_min_resolution; + result.num_picture_img = media.num_picture_img; + +} catch (e) {} +return result; +'''; + +SELECT + client, + COUNTIF(media_info.num_picture_using_min_resolution > 0) AS picture_min_resolution_pages, + COUNTIF(media_info.num_picture_img > 0) AS total_picture_pages, + SAFE_DIVIDE(COUNTIF(media_info.num_picture_using_min_resolution > 0), COUNTIF(media_info.num_picture_img > 0)) AS pct_picture_min_resolution_pages, + SUM(media_info.num_picture_using_min_resolution) AS picture_min_resolution_images, + SUM(media_info.num_picture_img) AS total_picture_images, + SAFE_DIVIDE(SUM(media_info.num_picture_using_min_resolution), SUM(media_info.num_picture_img)) AS pct_picture_min_resolution_images +FROM ( + SELECT + _TABLE_SUFFIX AS client, + get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/pixel_volume.sql b/sql/2024/media/pixel_volume.sql new file mode 100644 index 00000000000..bde07985d8e --- /dev/null +++ b/sql/2024/media/pixel_volume.sql @@ -0,0 +1,59 @@ +#standardSQL +# pixel areas +# pixel_volume.sql + +CREATE TEMPORARY FUNCTION getCssPixels(payload STRING) +RETURNS INT64 LANGUAGE js AS ''' +try { + let data = JSON.parse(payload); + return data.reduce((a, c) => a + (c.width||0)*(c.height||0), 0) || 0; +} +catch (e) {} +return null; +'''; + +CREATE TEMPORARY FUNCTION getNaturalPixels(payload STRING) +RETURNS INT64 LANGUAGE js AS ''' +try { + let data = JSON.parse(payload); + return data.reduce((a, c) => a + (c.naturalWidth||0)*(c.naturalHeight||0), 0) || 0; +} +catch (e) {} +return null; +'''; + +SELECT + percentile, + client, + ANY_VALUE(viewport_height) AS viewport_height, + ANY_VALUE(viewport_width) AS viewport_width, + ANY_VALUE(dpr) AS dpr, + ANY_VALUE(viewport_height) * ANY_VALUE(viewport_width) AS display_px, + APPROX_QUANTILES(css_pixels, 1000)[OFFSET(percentile * 10)] AS css_pixels, + APPROX_QUANTILES(natural_pixels, 1000)[OFFSET(percentile * 10)] AS natural_pixels, + APPROX_QUANTILES(natural_pixels, 1000)[OFFSET(percentile * 10)] / (ANY_VALUE(viewport_height) * ANY_VALUE(viewport_width)) AS pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + getCssPixels(JSON_EXTRACT_SCALAR(payload, '$._Images')) AS css_pixels, + getNaturalPixels(JSON_EXTRACT_SCALAR(payload, '$._Images')) AS natural_pixels, + CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._Dpi'), '$.dppx') AS FLOAT64) AS dpr, + CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._Resolution'), '$.absolute.height') AS FLOAT64) AS viewport_height, + CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._Resolution'), '$.absolute.width') AS FLOAT64) AS viewport_width + FROM + `httparchive.pages.2024_06_01_*`), + UNNEST([10, 25, 50, 75, 90]) AS percentile +WHERE + # it appears the _Images array is populated only from tag requests and not CSS or favicon + # likewise the bigImageCount and smallImageCount only track images > 100,000 and < 10,000 respectively. + # Meaning images between 10KB and 100KB won't show up in the count + # https://github.com/WPO-Foundation/webpagetest/blob/master/www/breakdown.inc#L95 + css_pixels > 0 AND + natural_pixels > 0 +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/media/portrait_landscape_square.sql b/sql/2024/media/portrait_landscape_square.sql new file mode 100644 index 00000000000..344fb59ce90 --- /dev/null +++ b/sql/2024/media/portrait_landscape_square.sql @@ -0,0 +1,64 @@ +#standardSQL +# What are the aspect ratios of the web's images? +# portrait_landscape_square.sql + +CREATE TEMPORARY FUNCTION getAspectRatioInfo(responsiveImagesJsonString STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' + const parsed = JSON.parse( responsiveImagesJsonString ); + if ( parsed && parsed.map ) { + return parsed.map( d => { + const aspectRatio = ( d.approximateResourceWidth > 0 && d.approximateResourceHeight > 0 ? + Math.round( ( d.approximateResourceWidth / d.approximateResourceHeight ) * 1000 ) / 1000 : -1 ); + return { + imgURL: d.url, + approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ), + approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ), + aspectRatio: aspectRatio, + isPortrait: aspectRatio < 1 && aspectRatio > 0, + isLandscape: aspectRatio > 1, + isSquare: aspectRatio == 1 + } + }); + } +'''; + +WITH imgs AS ( + SELECT + _TABLE_SUFFIX AS client, + url AS pageURL, + imgURL, + approximateResourceWidth, + approximateResourceHeight, + isPortrait, + isLandscape, + isSquare + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getAspectRatioInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) + WHERE + approximateResourceWidth > 1 AND + approximateResourceHeight > 1 +), + +counts_per_client AS ( + SELECT + client, + COUNTIF(isPortrait) AS portraits, + COUNTIF(isLandscape) AS landscapes, + COUNTIF(isSquare) AS squares, + COUNT(0) AS numberOfImagesPerClient + FROM + imgs + GROUP BY + client +) + +SELECT + client, + SAFE_DIVIDE(portraits, numberOfImagesPerClient) AS percentPortrait, + SAFE_DIVIDE(landscapes, numberOfImagesPerClient) AS percentLandscape, + SAFE_DIVIDE(squares, numberOfImagesPerClient) AS percentSquare, + numberOfImagesPerClient +FROM + counts_per_client diff --git a/sql/2024/media/score_progressive_jpeg.sql b/sql/2024/media/score_progressive_jpeg.sql new file mode 100644 index 00000000000..38f4a372a87 --- /dev/null +++ b/sql/2024/media/score_progressive_jpeg.sql @@ -0,0 +1,22 @@ +#standardSQL +# percent of pages with score_progressive_jpeg +# -1, 0 - 25, 25 - 50, 50 - 75, 75 - 100 +# score_progressive_jpeg.sql + +SELECT + client, + COUNTIF(score < 0) / COUNT(0) AS percent_negative, + COUNTIF(score >= 0 AND score < 25) / COUNT(0) AS percent_0_25, + COUNTIF(score >= 25 AND score < 50) / COUNT(0) AS percent_25_50, + COUNTIF(score >= 50 AND score < 75) / COUNT(0) AS percent_50_75, + COUNTIF(score >= 75 AND score <= 100) / COUNT(0) AS percent_75_100 +FROM ( + SELECT + _TABLE_SUFFIX AS client, + CAST(JSON_EXTRACT(payload, '$._score_progressive_jpeg') AS INT64) AS score + FROM + `httparchive.pages.2024_06_01_*`) +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/media/srcset_density_coverage.sql b/sql/2024/media/srcset_density_coverage.sql new file mode 100644 index 00000000000..4c41468dd17 --- /dev/null +++ b/sql/2024/media/srcset_density_coverage.sql @@ -0,0 +1,76 @@ +#standardSQL +# images srcset candidates' densities: do they cover useful ranges? +# srcset_density_coverage.sql + +CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING) +RETURNS ARRAY, + numberOfSrcsetCandidates INT64, + minDensity FLOAT64, + maxDensity FLOAT64>> +LANGUAGE js AS ''' + const parsed = JSON.parse( responsiveImagesJsonString ); + if ( parsed && parsed.map ) { + return parsed.map( d => { + const result = { + hasSrcset: d.hasSrcset, + srcsetHasXDescriptors: d.srcsetHasXDescriptors, + srcsetHasWDescriptors: d.srcsetHasXDescriptors, + srcsetCandidateDensities: [], + numberOfSrcsetCandidates: 0, + minDensity: d.currentSrcDensity, + maxDensity: d.currentSrcDensity + }; + if ( d.srcsetCandidateDensities && d.srcsetCandidateDensities.map ) { + const densities = d.srcsetCandidateDensities.map( n => parseFloat( n ) ); + result.srcsetCandidateDensities = densities; + result.numberOfSrcsetCandidates = densities.length; + result.minDensity = Math.min( ...densities ); + result.maxDensity = Math.max( ...densities ); + } + return result; + }); + } +'''; + +WITH imgs AS ( + SELECT + _TABLE_SUFFIX AS client, + hasSrcset, + srcsetCandidateDensities, + minDensity, + maxDensity + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) + WHERE + srcsetHasXDescriptors = TRUE OR srcsetHasWDescriptors = TRUE +), + +counts AS ( + SELECT + client, + COUNT(0) AS number_of_imgs_with_srcset, + COUNTIF(minDensity <= 1 AND maxDensity >= 1.5) AS number_of_srcsets_covering_1x_to_1p5x, + COUNTIF(minDensity <= 1 AND maxDensity >= 2) AS number_of_srcsets_covering_1x_to_2x, + COUNTIF(minDensity <= 1 AND maxDensity >= 2.5) AS number_of_srcsets_covering_1x_to_2p5x, + COUNTIF(minDensity <= 1 AND maxDensity >= 3) AS number_of_srcsets_covering_1x_to_3x + FROM imgs + GROUP BY client +) + +SELECT + client, + number_of_imgs_with_srcset, + number_of_srcsets_covering_1x_to_1p5x, + number_of_srcsets_covering_1x_to_2x, + number_of_srcsets_covering_1x_to_2p5x, + number_of_srcsets_covering_1x_to_3x, + number_of_srcsets_covering_1x_to_1p5x / number_of_imgs_with_srcset AS pct_of_srcsets_covering_1x_to_1p5x, + number_of_srcsets_covering_1x_to_2x / number_of_imgs_with_srcset AS pct_of_srcsets_covering_1x_to_2x, + number_of_srcsets_covering_1x_to_2p5x / number_of_imgs_with_srcset AS pct_of_srcsets_covering_1x_to_2p5x, + number_of_srcsets_covering_1x_to_3x / number_of_imgs_with_srcset AS pct_of_srcsets_covering_1x_to_3x +FROM counts diff --git a/sql/2024/media/top_aspect_ratios.sql b/sql/2024/media/top_aspect_ratios.sql new file mode 100644 index 00000000000..4a9d63a53d3 --- /dev/null +++ b/sql/2024/media/top_aspect_ratios.sql @@ -0,0 +1,71 @@ +#standardSQL +# top particular aspect ratios +# top_aspect_ratios.sql + +CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' + const parsed = JSON.parse( responsiveImagesJsonString ); + if ( parsed && parsed.map ) { + return parsed.map( d => ({ + imgURL: d.url, + approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ), + approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ), + aspectRatio: ( d.approximateResourceWidth > 0 && d.approximateResourceHeight > 0 ? + Math.round( ( d.approximateResourceWidth / d.approximateResourceHeight ) * 1000 ) / 1000 : + -1 ) + }) ); + } +'''; + +WITH imgs AS ( + SELECT + _TABLE_SUFFIX AS client, + url AS pageURL, + imgURL, + approximateResourceWidth, + approximateResourceHeight, + aspectRatio + FROM + `httparchive.pages.2024_06_01_*`, + UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) + WHERE + approximateResourceWidth > 1 AND + approximateResourceHeight > 1 +), + +counts_per_client AS ( + SELECT + client, + COUNT(0) AS numberOfImagesPerClient + FROM + imgs + GROUP BY + client +), + +counts_per_client_and_aspect_ratio AS ( + SELECT + client, + aspectRatio, + COUNT(0) AS numberOfImagesPerClientAndAspectRatio + FROM + imgs + GROUP BY + client, + aspectRatio +) + +SELECT + client, + aspectRatio, + numberOfImagesPerClientAndAspectRatio, + SAFE_DIVIDE(numberOfImagesPerClientAndAspectRatio, numberOfImagesPerClient) AS percentOfImages +FROM + counts_per_client_and_aspect_ratio +LEFT JOIN + counts_per_client +USING + (client) +ORDER BY + percentOfImages DESC diff --git a/sql/2024/media/video_adoption.sql b/sql/2024/media/video_adoption.sql new file mode 100644 index 00000000000..1cac4f6e5c4 --- /dev/null +++ b/sql/2024/media/video_adoption.sql @@ -0,0 +1,20 @@ +#standardSQL +# How many pages use