Skip to content

Commit 9fd0733

Browse files
committed
Added mode last retain nulls
1 parent 5bde840 commit 9fd0733

File tree

7 files changed

+1016
-61
lines changed

7 files changed

+1016
-61
lines changed

sql_generators/baseline_clients_city_seen_v1/templates/metadata.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ scheduling:
3232
bigquery:
3333
time_partitioning:
3434
type: day
35-
field: last_seen_geo_date
35+
field: first_seen_city_date
3636
require_partition_filter: false
3737
expiration_days: null
3838
clustering:

sql_generators/baseline_clients_city_seen_v1/templates/query.sql

Lines changed: 89 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ WITH
1414
mozfun.glean.parse_datetime(ping_info.end_time) AS parsed_end_time,
1515
`moz-fx-data-shared-prod.udf.glean_timespan_seconds`( metrics.timespan.glean_baseline_duration ) AS duration,
1616
metadata.geo.city,
17-
metadata.geo.subdivision1 AS geo_subdivision1,
18-
metadata.geo.subdivision2 AS geo_subdivision2,
17+
metadata.geo.subdivision1 AS subdivision1,
18+
metadata.geo.subdivision2 AS subdivision2,
19+
metadata.geo.country AS country,
1920
FROM
2021
`{{ project_id }}.{{ app_id }}_stable.baseline_v1`
2122
WHERE
@@ -56,9 +57,15 @@ WITH
5657
client_id,
5758
sample_id,
5859
ROW_NUMBER() OVER w1_unframed AS _n,
59-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(city) OVER w1) AS city,
60-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(geo_subdivision1) OVER w1) AS geo_subdivision1,
61-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(geo_subdivision2) OVER w1) AS geo_subdivision2,
60+
`moz-fx-data-shared-prod.udf.mode_last_retain_nulls`(
61+
ARRAY_AGG(STRUCT(
62+
city AS city,
63+
subdivision1 AS subdivision1,
64+
subdivision2 AS subdivision2,
65+
country AS country
66+
)
67+
) OVER w1
68+
) AS geo,
6269
FROM
6370
with_date_offsets_{{ app_id }}
6471
LEFT JOIN
@@ -91,15 +98,16 @@ WITH
9198
FROM
9299
windowed_{{ app_id }} AS cd
93100
WHERE
94-
_n = 1 ),
101+
_n = 1 AND geo.city IS NOT NULL),
95102
clients_city_first_seen_{{ app_id }} AS (
96103
SELECT
97104
client_id,
98105
sample_id,
99-
submission_date AS first_seen_geo_date,
100-
city AS first_seen_geo_city,
101-
geo_subdivision1 AS first_seen_geo_subdivision1,
102-
geo_subdivision2 AS first_seen_geo_subdivision2,
106+
submission_date AS first_seen_city_date,
107+
geo.city AS first_seen_city,
108+
geo.subdivision1 AS first_seen_subdivision1,
109+
geo.subdivision2 AS first_seen_subdivision2,
110+
geo.country AS first_seen_country,
103111
FROM
104112
clients_daily_{{ app_id }}
105113
QUALIFY
@@ -108,10 +116,11 @@ clients_city_first_seen_{{ app_id }} AS (
108116
SELECT
109117
client_id,
110118
sample_id,
111-
submission_date AS last_seen_geo_date,
112-
city AS last_seen_geo_city,
113-
geo_subdivision1 AS last_seen_geo_subdivision1,
114-
geo_subdivision2 AS last_seen_geo_subdivision2,
119+
submission_date AS last_seen_city_date,
120+
geo.city AS last_seen_city,
121+
geo.subdivision1 AS last_seen_subdivision1,
122+
geo.subdivision2 AS last_seen_subdivision2,
123+
geo.country AS last_seen_country,
115124
FROM
116125
clients_daily_{{ app_id }}
117126
QUALIFY
@@ -122,20 +131,21 @@ SELECT
122131
"{{ app_id }}" AS app_id,
123132
COALESCE(cfs.client_id, cls.client_id) AS client_id,
124133
COALESCE(cfs.sample_id, cls.sample_id) AS sample_id,
125-
first_seen_geo_date,
126-
first_seen_geo_city,
127-
first_seen_geo_subdivision1,
128-
first_seen_geo_subdivision2,
129-
last_seen_geo_date,
130-
last_seen_geo_city,
131-
last_seen_geo_subdivision1,
132-
last_seen_geo_subdivision2,
134+
first_seen_city_date,
135+
first_seen_city,
136+
first_seen_subdivision1,
137+
first_seen_subdivision2,
138+
first_seen_country,
139+
last_seen_city_date,
140+
last_seen_city,
141+
last_seen_subdivision1,
142+
last_seen_subdivision2,
143+
last_seen_country,
133144
FROM
134145
clients_city_first_seen_{{ app_id }} cfs
135146
FULL OUTER JOIN
136147
clients_city_last_seen_{{ app_id }} cls
137-
ON
138-
cfs.client_id = cls.client_id
148+
ON cfs.client_id = cls.client_id
139149
AND cfs.sample_id = cls.sample_id
140150
{{ "UNION ALL" if not loop.last }}
141151
{% endfor -%}
@@ -157,14 +167,16 @@ _current_windowed_{{ app_id }} AS (
157167
client_info.client_id AS client_id,
158168
sample_id,
159169
ROW_NUMBER() OVER w1_unframed AS _n,
160-
@submission_date AS first_seen_geo_date,
161-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(metadata.geo.city) OVER w1) AS first_seen_geo_city,
162-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(metadata.geo.subdivision1) OVER w1) AS first_seen_geo_subdivision1,
163-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(metadata.geo.subdivision2) OVER w1) AS first_seen_geo_subdivision2,
164-
@submission_date AS last_seen_geo_date,
165-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(metadata.geo.city) OVER w1) AS last_seen_geo_city,
166-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(metadata.geo.subdivision1) OVER w1) AS last_seen_geo_subdivision1,
167-
`moz-fx-data-shared-prod.udf.mode_last`(ARRAY_AGG(metadata.geo.subdivision2) OVER w1) AS last_seen_geo_subdivision2,
170+
submission_date,
171+
`moz-fx-data-shared-prod.udf.mode_last_retain_nulls`(
172+
ARRAY_AGG(STRUCT(
173+
city AS city,
174+
subdivision1 AS subdivision1,
175+
subdivision2 AS subdivision2,
176+
country AS country
177+
)
178+
) OVER w1
179+
) AS geo,
168180
FROM
169181
`moz-fx-data-shared-prod.{{ app_id }}_live.baseline_v1`
170182
WHERE
@@ -189,40 +201,73 @@ _current_windowed_{{ app_id }} AS (
189201
_current_{{ app_id }} AS (
190202
SELECT
191203
cw.* EXCEPT (_n),
204+
submission_date AS first_seen_city_date,
205+
geo.city AS first_seen_city,
206+
geo.subdivision1 AS first_seen_subdivision1,
207+
geo.subdivision2 AS first_seen_subdivision2,
208+
geo.country AS first_seen_country,
209+
submission_date AS last_seen_city_date,
210+
geo.city AS last_seen_city,
211+
geo.subdivision1 AS last_seen_subdivision1,
212+
geo.subdivision2 AS last_seen_subdivision2,
213+
geo.country AS last_seen_country
192214
FROM
193215
_current_windowed_{{ app_id }} AS cw
194216
WHERE
195-
_n = 1){{ "," if not loop.last }}
217+
_n = 1
218+
AND geo.city IS NOT NULL){{ "," if not loop.last }}
196219
{% endfor -%}
197220
{% for app_id in app_id_list -%}
198221
SELECT
199222
app_id,
200223
client_id,
201224
sample_id,
202225
IF
203-
(_p.client_id IS NULL, _c.first_seen_geo_date, _p.first_seen_geo_date) AS first_seen_geo_date,
226+
(_p.client_id IS NULL,
227+
_c.first_seen_city_date,
228+
_p.first_seen_city_date
229+
) AS first_seen_city_date,
204230
IF
205-
(_p.client_id IS NULL, _c.first_seen_geo_city, _p.first_seen_geo_city) AS first_seen_geo_city,
231+
(_p.client_id IS NULL,
232+
_c.first_seen_city,
233+
_p.first_seen_city
234+
) AS first_seen_city,
206235
IF
207-
(_p.client_id IS NULL, _c.first_seen_geo_subdivision1, _p.first_seen_geo_subdivision1) AS first_seen_geo_subdivision1,
236+
(_p.client_id IS NULL,
237+
_c.first_seen_subdivision1,
238+
_p.first_seen_subdivision1
239+
) AS first_seen_subdivision1,
208240
IF
209-
(_p.client_id IS NULL, _c.first_seen_geo_subdivision2, _p.first_seen_geo_subdivision2) AS first_seen_geo_subdivision2,
241+
(_p.client_id IS NULL,
242+
_c.first_seen_subdivision2,
243+
_p.first_seen_subdivision2
244+
) AS first_seen_subdivision2,
210245
IF
211-
(_p.last_seen_geo_date < _c.last_seen_geo_date, _c.last_seen_geo_date, _p.last_seen_geo_date) AS last_seen_geo_date,
246+
(_p.last_seen_city_date < _c.last_seen_city_date,
247+
_c.last_seen_city_date,
248+
_p.last_seen_city_date
249+
) AS last_seen_city_date,
212250
IF
213-
(_p.last_seen_geo_date < _c.last_seen_geo_date, _c.last_seen_geo_city, _p.last_seen_geo_city) AS last_seen_geo_city,
251+
(_p.last_seen_city_date < _c.last_seen_city_date,
252+
_c.last_seen_city,
253+
_p.last_seen_city
254+
) AS last_seen_city,
214255
IF
215-
(_p.last_seen_geo_date < _c.last_seen_geo_date, _c.last_seen_geo_subdivision1, _p.last_seen_geo_subdivision1) AS last_seen_geo_subdivision1,
256+
(_p.last_seen_city_date < _c.last_seen_city_date,
257+
_c.last_seen_subdivision1,
258+
_p.last_seen_subdivision1
259+
) AS last_seen_subdivision1,
216260
IF
217-
(_p.last_seen_geo_date < _c.last_seen_geo_date, _c.last_seen_geo_subdivision2, _p.last_seen_geo_subdivision2) AS last_seen_geo_subdivision2,
261+
(_p.last_seen_city_date < _c.last_seen_city_date,
262+
_c.last_seen_subdivision2,
263+
_p.last_seen_subdivision2
264+
) AS last_seen_subdivision2,
218265
FROM
219266
_current_{{ app_id }} AS _c
220267
FULL JOIN
221268
_previous_{{ app_id }} AS _p
222269
USING
223-
(client_id,
224-
sample_id,
225-
app_id)
270+
(client_id, sample_id, app_id)
226271
{{ "UNION ALL" if not loop.last }}
227272
{% endfor -%}
228273
{% raw %}

sql_generators/baseline_clients_city_seen_v1/templates/schema.yaml

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,35 +11,43 @@ fields:
1111
type: INTEGER
1212
mode: NULLABLE
1313
description: A number, 0-99, that samples by client_id and allows filtering data for analysis.
14-
- name: first_seen_geo_date
14+
- name: first_seen_city_date
1515
type: DATE
1616
mode: NULLABLE
17-
description: Date when the first seen geo fields were captured.
18-
- name: first_seen_geo_city
17+
description: Date when the first seen city was captured.
18+
- name: first_seen_city
1919
type: STRING
2020
mode: NULLABLE
21-
description: City captured on first_seen_geo_date.
22-
- name: first_seen_geo_subdivision1
21+
description: City captured on first_seen_city_date.
22+
- name: first_seen_subdivision1
2323
type: STRING
2424
mode: NULLABLE
25-
description: Major country subdivision, typically a state, province, or county captured on first_seen_geo_date.
26-
- name: first_seen_geo_subdivision2
25+
description: Major country subdivision, typically a state, province, or county captured on first_seen_city_date.
26+
- name: first_seen_subdivision2
2727
type: STRING
2828
mode: NULLABLE
29-
description: Second major country subdivision; not applicable for most countries captured on first_seen_geo_date.
30-
- name: last_seen_geo_date
29+
description: Second major country subdivision; not applicable for most countries captured on first_seen_city_date.
30+
- name: first_seen_country
31+
type: STRING
32+
mode: NULLABLE
33+
description: An ISO 3166-1 alpha-2 country code captured on first_seen_city_date.
34+
- name: last_seen_city_date
3135
type: DATE
3236
mode: NULLABLE
33-
description: Date when the last seen geo fields were captured.
34-
- name: last_seen_geo_city
37+
description: Date when the last seen city was captured.
38+
- name: last_seen_city
39+
type: STRING
40+
mode: NULLABLE
41+
description: City captured on last_seen_city_date.
42+
- name: last_seen_subdivision1
3543
type: STRING
3644
mode: NULLABLE
37-
description: City captured on last_seen_geo_city.
38-
- name: last_seen_geo_subdivision1
45+
description: Major country subdivision, typically a state, province, or county captured on last_seen_city_date.
46+
- name: last_seen_subdivision2
3947
type: STRING
4048
mode: NULLABLE
41-
description: Major country subdivision, typically a state, province, or county captured on last_seen_geo_date.
42-
- name: last_seen_geo_subdivision2
49+
description: Second major country subdivision; not applicable for most countries captured on last_seen_city_date.
50+
- name: last_seen_country
4351
type: STRING
4452
mode: NULLABLE
45-
description: Second major country subdivision; not applicable for most countries captured on last_seen_geo_date.
53+
description: An ISO 3166-1 alpha-2 country code captured on last_seen_city_date.

0 commit comments

Comments
 (0)