Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Media 2024 queries #3738

Merged
merged 5 commits into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions sql/2024/media/animated_gif_bpp.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#standardSQL
# what is the bpp of animated vs non-animated GIFs?
# animated_gif_bpp.sql


# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that.
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''

if (!imageMagickNumberString) { return null; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
const multiples = {
'K': 1e3,
'M': 1e6,
'G': 1e9,
'T': 1e12
}
if ( matched && matched[1] ) {
return Math.round(
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 )
);
} else {
return null;
}

''';

# ImageMagick reports bytesizes in a friendly, human readable format. Just return bytes
CREATE TEMPORARY FUNCTION magickBytes(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''

if (!imageMagickNumberString) { return 0; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)$/);
const multiples = {
'B': 1,
'KB': 1e3,
'MB': 1e6,
'GB': 1e9,
'TB': 1e12
}
if ( matched && matched[1] && matched[2] ) {
return Math.round(
parseFloat( matched[1] ) * multiples[ matched[2] ]
);
} else {
return null;
}

''';

WITH gifs AS (
SELECT
_TABLE_SUFFIX AS client,
CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL) AS is_animated,
(magickBytes(JSON_VALUE(payload, '$._image_details.magick.filesize')) * 8) /
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) AS bits_per_pixel
FROM
`requests.2024_06_01_*`
WHERE
JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1 AND
JSON_VALUE(payload, '$._image_details.animated') IS NOT NULL
)

SELECT
percentile,
client,
is_animated,
APPROX_QUANTILES(bits_per_pixel, 1000)[OFFSET(percentile * 10)] AS bpp
FROM
gifs,
UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
percentile,
client,
is_animated
ORDER BY
percentile,
client,
is_animated
39 changes: 39 additions & 0 deletions sql/2024/media/animated_gif_count.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#standardSQL
# what percent of gifs are animated?
# animated_gif_count.sql

# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that.
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''

if (!imageMagickNumberString) { return null; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
const multiples = {
'K': 1e3,
'M': 1e6,
'G': 1e9,
'T': 1e12
}
if ( matched && matched[1] ) {
return Math.round(
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 )
);
} else {
return null;
}

''';

SELECT
_TABLE_SUFFIX AS client,
COUNT(0) AS total_gifs,
COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) AS total_animated_gifs,
COUNTIF(CAST(JSON_VALUE(payload, '$._image_details.animated') AS BOOL)) / COUNT(0) AS pct_animated_gifs
FROM
`requests.2024_06_01_*`
WHERE
JSON_VALUE(payload, '$._image_details.detected_type') = 'gif' AND
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1
GROUP BY
client
50 changes: 50 additions & 0 deletions sql/2024/media/animated_gif_framecount.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#standardSQL
# distribution of animated GIF framecounts
# animated_gif_framecount.sql

# ImageMagick reports big images as having, e.g., "1.29097M" pixels. This means ~1.2 million pixels, but BigQuery doesn't know that.
CREATE TEMPORARY FUNCTION magickPixels(imageMagickNumberString STRING)
RETURNS INT64
LANGUAGE js AS r'''

if (!imageMagickNumberString) { return null; }
const matched = imageMagickNumberString.match(/([\d\.]+)(\w+)?$/);
const multiples = {
'K': 1e3,
'M': 1e6,
'G': 1e9,
'T': 1e12
}
if ( matched && matched[1] ) {
return Math.round(
parseFloat( matched[1] ) * ( multiples[ matched[2] ] || 1 )
);
} else {
return null;
}

''';

WITH framecounts AS (
SELECT
_TABLE_SUFFIX AS client,
CAST(JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') AS NUMERIC) AS framecount
FROM `requests.2024_06_01_*`
WHERE
JSON_VALUE(payload, '$._image_details.metadata.GIF.FrameCount') IS NOT NULL AND
magickPixels(JSON_VALUE(payload, '$._image_details.magick.numberPixels')) > 1
)

SELECT
percentile,
client,
APPROX_QUANTILES(framecount, 1000)[OFFSET(percentile * 10)] AS framecount
FROM
framecounts,
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
GROUP BY
percentile,
client
ORDER BY
percentile,
client
13 changes: 13 additions & 0 deletions sql/2024/media/at_least_one_image_request.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#standardSQL
# What % of pages load at least one image?
# ✅ at_least_one_image_request.sql

SELECT
_TABLE_SUFFIX AS client,
COUNTIF(reqImg > 0) AS atLeastOneImgReqCount,
COUNT(0) AS total,
SAFE_DIVIDE(COUNTIF(reqImg > 0), COUNT(0)) AS atLeastOneImgReqPct
FROM
`httparchive.summary_pages.2024_06_01_*`
GROUP BY
client
31 changes: 31 additions & 0 deletions sql/2024/media/at_least_one_img.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#standardSQL
# What % of pages have at least one <img>?
# at_least_one_img.sql

CREATE TEMPORARY FUNCTION numberOfImages(images_string STRING)
RETURNS INT64
LANGUAGE js AS '''
try {
return JSON.parse(images_string).filter( i => parseInt(i.approximateResourceWidth) > 1 && parseInt(i.approximateResourceWidth) > 1 ).length;
} catch {
return 0;
}
''';

WITH numImgs AS (
SELECT
_TABLE_SUFFIX AS client,
numberOfImages(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')) AS numberOfImages
FROM
`httparchive.pages.2024_06_01_*`
)

SELECT
client,
COUNTIF(numberOfImages > 0) AS atLeastOneCount,
COUNT(0) AS total,
SAFE_DIVIDE(COUNTIF(numberOfImages > 0), COUNT(0)) AS atLeastOnePct
FROM
numImgs
GROUP BY
client
174 changes: 174 additions & 0 deletions sql/2024/media/bytes_and_dimensions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#standardSQL
# Measuring img loaded bytes and dimensions
# bytes_and_dimensions.sql

CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING)
RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>>
LANGUAGE js AS '''

function pithyType( { contentType, url } ) {
const subtypeMap = {
'svg+xml': 'svg',
'svgz': 'svg',
'jpeg': 'jpg',
'jfif': 'jpg',
'x-png': 'png',
'vnd.microsoft.icon': 'ico',
'x-icon': 'ico',
'jxr': 'jxr',
'vnd.ms-photo': 'jxr',
'hdp': 'jxr',
'wdp': 'jxr',
'jpf': 'jp2',
'jpx': 'jp2',
'jpm': 'jp2',
'mj2': 'jp2',
'x-jp2-container': 'jp2',
'x-jp2-codestream': 'jp2',
'x-jpeg2000-image': 'jp2',
'heic': 'heif',
'x-ms-bmp': 'bmp',
'x-pict': 'pict',
'tif': 'tiff',
'x-tif': 'tiff',
'x-tiff': 'tiff',
'vnd.mozilla.apng': 'apng',
// identities
'apng': 'apng',
'jpg': 'jpg',
'jp2': 'jp2',
'png': 'png',
'gif': 'gif',
'ico': 'ico',
'webp': 'webp',
'avif': 'avif',
'tiff': 'tiff',
'flif': 'flif',
'heif': 'heif',
'jxl': 'jxl',
'avif-sequence': 'avif-sequence', // keep separate from single frames...
'heic-sequence': 'heic-sequence',
'bmp': 'bmp',
'pict': 'pict'
};

function normalizeSubtype( subtype ) {
if ( subtypeMap[ subtype ] ) {
return subtypeMap[ subtype ];
}
return 'unknown'; // switch between:
// `subtype`
// to see everything, check if there's anything else worth capturing
// `'unknown'`
// to make results manageable
}

// if it's a data url, take the mime type from there, done.
if ( url &&
typeof url === "string" ) {
const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ );
if ( match && match[ 1 ] ) {
return normalizeSubtype( match[ 1 ] );
}
}

// if we get a content-type header, use it!
if ( contentType &&
typeof contentType === "string" ) {
const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ );
if ( match && match[ 1 ] ) {
return normalizeSubtype( match[ 1 ] );
}
}

// otherwise fall back to extension in the URL
if ( url &&
typeof url === "string" ) {
const splitOnSlashes = url.split("/");
if ( splitOnSlashes.length > 1 ) {
const afterLastSlash = splitOnSlashes[ splitOnSlashes.length - 1 ],
splitOnDots = afterLastSlash.split(".");
if ( splitOnDots.length > 1 ) {
return normalizeSubtype(
splitOnDots[ splitOnDots.length - 1 ]
.toLowerCase()
.replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ) // strip query params
);
}
}
}

// otherwise throw up our hands
return 'unknown';
}

const parsed = JSON.parse( responsiveImagesJsonString );
if ( parsed && parsed.map ) {
const dataRegEx = new RegExp('^data');
return parsed.map( d => ({
imgURL: d.url,
approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ),
approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ),
byteSize: Math.floor( d.byteSize || 0 ),
bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ),
isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1,
isDataURL: dataRegEx.test(d.url),
resourceFormat: pithyType({ contentType: d.mimeType, url: d.url })
}) );
}
''';

WITH imgs AS (
SELECT
_TABLE_SUFFIX AS client,
url AS pageURL,
imgURL,
approximateResourceWidth,
approximateResourceHeight,
byteSize,
bitsPerPixel,
isPixel,
isDataURL,
(approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels,
(approximateResourceWidth / approximateResourceHeight) AS aspectRatio,
resourceFormat
FROM
`httparchive.pages.2024_06_01_*`,
UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')))
),

percentiles AS (
SELECT
client,
APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles,
APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles,
APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles,
APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles,
APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles,
APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles,
COUNT(0) AS imgCount
FROM
imgs
WHERE
approximateResourceWidth > 1 AND
approximateResourceHeight > 1
GROUP BY
client
)

SELECT
percentile,
client,
imgCount,
resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth,
resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight,
aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio,
megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels,
byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize,
bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel
FROM
percentiles,
UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile
ORDER BY
imgCount DESC,
percentile
Loading