Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
5 changes: 5 additions & 0 deletions sql/mozfun/map/mode_last_struct_retain_nulls/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
description: Given an array of structs, return the most frequently occurring full struct;
break ties by the latest occurrence (mode-last). Equality is on the entire struct, not per-field.
Nulls are retained.
friendly_name: Map Mode Last Struct Retain Nulls
223 changes: 223 additions & 0 deletions sql/mozfun/map/mode_last_struct_retain_nulls/udf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
/*
Return the most frequent STRUCT from an array, breaking ties by latest occurrence
(i.e., mode_last_retain_nulls over whole structs). Use to keep related fields aggregated together.
See also: `map.mode_last`, which determines each value using `stats.mode_last`.
*/
CREATE OR REPLACE FUNCTION map.mode_last_struct_retain_nulls(entries ANY TYPE) AS (
(
SELECT AS STRUCT
s.*
FROM
(
SELECT
s,
COUNT(*) AS freq,
MAX(pos) AS last_pos
FROM
UNNEST(entries) AS s
WITH OFFSET pos
GROUP BY
s
)
ORDER BY
freq DESC,
last_pos DESC
LIMIT
1
)
);

-- Tests
SELECT
-- 1) Most frequent wins (Berlin appears twice)
assert.struct_equals(
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
map.mode_last_struct_retain_nulls(
[
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT(
'Munich' AS city,
'BY' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
)
]
)
),
-- 2) Tie -> latest wins (Berlin x2, Munich x2, last element is Munich)
assert.struct_equals(
STRUCT(
'Munich' AS city,
'BY' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
map.mode_last_struct_retain_nulls(
[
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT(
'Munich' AS city,
'BY' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT(
'Munich' AS city,
'BY' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
) -- latest among the tied
]
)
),
-- 3) FULL-struct equality: different subdivision2 means different value
assert.struct_equals(
STRUCT('Berlin' AS city, 'BE' AS subdivision1, 'A' AS subdivision2, 'DE' AS country),
map.mode_last_struct_retain_nulls(
[
STRUCT('Berlin' AS city, 'BE' AS subdivision1, 'A' AS subdivision2, 'DE' AS country),
STRUCT('Berlin' AS city, 'BE' AS subdivision1, 'B' AS subdivision2, 'DE' AS country),
STRUCT('Berlin' AS city, 'BE' AS subdivision1, 'A' AS subdivision2, 'DE' AS country)
]
)
),
-- 4) Single element returns itself
assert.struct_equals(
STRUCT(
'Cologne' AS city,
'NW' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
map.mode_last_struct_retain_nulls(
[
STRUCT(
'Cologne' AS city,
'NW' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
)
]
)
),
-- 5) Tie between NULL struct and a non-NULL struct; latest wins -> expect Berlin
assert.struct_equals(
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
map.mode_last_struct_retain_nulls(
[
STRUCT(
CAST(NULL AS STRING) AS city,
CAST(NULL AS STRING) AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
CAST(NULL AS STRING) AS country
),
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT(
CAST(NULL AS STRING) AS city,
CAST(NULL AS STRING) AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
CAST(NULL AS STRING) AS country
),
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
) -- latest among the tied
]
)
),
-- 6) NULL struct occurs most frequently -> expect NULL
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense for this udf but for #7974, wouldn't we want to consider Berlin as the first/last seen city in this case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This UDF would first be applied to stable tables to produce one row per client per day (mirroring baseline_clients_daily). We should retain NULLs at this stage: clients in cities with populations <15k or in locations MaxMind can’t map should remain NULL. If we drop NULLs here, we’d misrepresent a client’s true location and only capture them when they travel to a resolvable city. Downstream, the city_seen table can then keep only the non-NULL city values after this step. I hope that makes sense?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure that matches what I was thinking so I might be misunderstanding where this is going to be used. I'll see if it makes sense when I look at where the udf is used

assert.struct_equals(
STRUCT(
CAST(NULL AS STRING) AS city,
CAST(NULL AS STRING) AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
CAST(NULL AS STRING) AS country
),
map.mode_last_struct_retain_nulls(
[
STRUCT(
CAST(NULL AS STRING) AS city,
CAST(NULL AS STRING) AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
CAST(NULL AS STRING) AS country
),
STRUCT(
'Berlin' AS city,
'BE' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT(
CAST(NULL AS STRING) AS city,
CAST(NULL AS STRING) AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
CAST(NULL AS STRING) AS country
)
]
)
),
-- 7) City is NULL but other fields present; that exact struct is most frequent -> expect that struct (with city = NULL)
assert.struct_equals(
STRUCT(
CAST(NULL AS STRING) AS city,
'BY' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
map.mode_last_struct_retain_nulls(
[
STRUCT(
CAST(NULL AS STRING) AS city,
'BY' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
),
STRUCT('Berlin' AS city, 'BE' AS subdivision1, NULL AS subdivision2, 'DE' AS country),
STRUCT(
CAST(NULL AS STRING) AS city,
'BY' AS subdivision1,
CAST(NULL AS STRING) AS subdivision2,
'DE' AS country
)
]
)
);