Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories.

base repository: all-of-us/curation
base: 9d2a0db00cc472c7dcd1b9585ae55726c214aea2
..
head repository: all-of-us/curation
compare: 259d9323a4089c211dac54595b97cd353539036a
Showing with 2,395 additions and 1,926 deletions.
  1. +0 −126 data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py
  2. +55 −0 data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
  3. +28 −4 data_steward/analytics/cdr_ops/combined.py
  4. +20 −39 data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py
  5. +2 −3 data_steward/analytics/cdr_ops/curation_dashboard.py
  6. +8 −32 data_steward/analytics/cdr_ops/ehr_union_qc.py
  7. +65 −3 data_steward/analytics/cdr_ops/fitbit_qc.py
  8. +5 −2 data_steward/analytics/cdr_ops/notebook_utils.py
  9. +4 −2 data_steward/analytics/cdr_ops/participant_validation_qc.py
  10. +12 −14 data_steward/analytics/cdr_ops/raw_fitbit_qc.py
  11. +54 −10 data_steward/analytics/cdr_ops/raw_rdr_export_qc.py
  12. +159 −49 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_base_qa_report1.py
  13. +7 −29 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py
  14. +10 −12 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report3_col_suppression.py
  15. +145 −140 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py
  16. +0 −831 data_steward/analytics/cdr_ops/vocabulary/single_survey_mapping_validation.py
  17. +193 −76 data_steward/analytics/cdr_ops/vocabulary/validate_proposed_mapping_single_survey.py
  18. +4 −0 data_steward/bq_utils.py
  19. +7 −0 data_steward/cdr_cleaner/clean_cdr.py
  20. +37 −23 data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts.py
  21. +2 −2 data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup.py
  22. +44 −37 data_steward/cdr_cleaner/cleaning_rules/create_expected_ct_list.py
  23. +2 −1 data_steward/cdr_cleaner/cleaning_rules/create_person_ext_table.py
  24. +24 −8 data_steward/cdr_cleaner/cleaning_rules/date_unshift_cope_responses.py
  25. +175 −0 data_steward/cdr_cleaner/cleaning_rules/deid/generalize_indian_health_services.py
  26. +163 −0 data_steward/cdr_cleaner/cleaning_rules/drop_row_duplicates.py
  27. +2 −4 data_steward/cdr_cleaner/cleaning_rules/ensure_date_datetime_consistency.py
  28. +22 −14 data_steward/cdr_cleaner/cleaning_rules/generate_wear_study_table.py
  29. +1 −1 data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables.py
  30. +117 −0 data_steward/cdr_cleaner/cleaning_rules/replace_freetext_notes.py
  31. +1 −1 data_steward/cdr_cleaner/cleaning_rules/repopulate_person_post_deid.py
  32. +3 −0 data_steward/cdr_cleaner/cleaning_rules/temporal_consistency.py
  33. +4 −2 data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data.py
  34. +10 −1 data_steward/constants/utils/bq.py
  35. +32 −2 data_steward/gcloud/bq/__init__.py
  36. +3 −2 data_steward/resources.py
  37. +28 −13 data_steward/tools/add_hpo.py
  38. +17 −36 data_steward/tools/create_combined_backup_dataset.py
  39. +9 −101 data_steward/tools/create_combined_dataset.py
  40. +192 −0 data_steward/tools/generate_unioned_ehr_dataset.py
  41. +0 −163 data_steward/tools/generate_unioned_ehr_dataset.sh
  42. +133 −0 data_steward/tools/pipeline_utils.py
  43. +9 −9 data_steward/tools/run_deid.py
  44. +17 −0 data_steward/validation/participants/snapshot_validation_dataset.py
  45. +9 −3 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/bigquery_tests_base.py
  46. +4 −2 ...ation_tests/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts_test.py
  47. +25 −10 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/create_expected_ct_list_test.py
  48. +55 −8 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/date_unshift_cope_responses_test.py
  49. +105 −0 ...tion_tests/data_steward/cdr_cleaner/cleaning_rules/deid/generalize_indian_health_services_test.py
  50. +120 −0 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/drop_row_duplicates_test.py
  51. +43 −4 ...ntegration_tests/data_steward/cdr_cleaner/cleaning_rules/ensure_date_datetime_consistency_test.py
  52. +8 −6 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/generate_wear_study_table_test.py
  53. +131 −0 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/replace_freetext_notes_test.py
  54. +1 −1 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/repopulate_person_post_deid_test.py
  55. +10 −12 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py
  56. +3 −4 tests/integration_tests/data_steward/tools/create_combined_backup_dataset_test.py
  57. +0 −25 tests/unit_tests/data_steward/bq_utils_test.py
  58. +4 −6 tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/generate_ext_tables_test.py
  59. +1 −2 tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables_test.py
  60. +8 −12 tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py
  61. +24 −31 tests/unit_tests/data_steward/tools/add_hpo_test.py
  62. +19 −8 tests/unit_tests/data_steward/tools/run_deid_test.py

126 changes: 0 additions & 126 deletions data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py
This file was deleted.

55 changes: 55 additions & 0 deletions data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
@@ -1313,4 +1313,59 @@
query = tpl.render(new_rdr=new_rdr, raw_rdr=raw_rdr, project_id=project_id)
execute(client, query)

# # Check to catch duplicates in observation
# If the check fails, find extra information on each issue type below.
# * "multiple rows with the same obs_id": there should not be any rows that share an observation_id. A similar check exists in the raw RDR QC notebook and should have caught occurrences of this issue in the raw export. If this check fails, an RDR cleaning rule is most likely the culprit.
# * "duplicates on whole row - excluding obs_id": these are rows with identical data in every field except observation_id. The cleaning rule 'drop_row_duplicates' was created to remove duplicates of this type (a sketch of the pattern it applies follows this check).

# +
tpl = JINJA_ENV.from_string('''
WITH whole_row_dups AS (
SELECT
COUNT(*) as n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY -- all fields except observation_id --
person_id, observation_concept_id, observation_date, observation_datetime, observation_type_concept_id,
value_as_number, value_as_string, value_as_concept_id, qualifier_concept_id, unit_concept_id, provider_id,
visit_occurrence_id, visit_detail_id, observation_source_value, observation_source_concept_id, unit_source_value,
qualifier_source_value, value_source_concept_id, value_source_value, questionnaire_response_id
HAVING n > 1)
, observation_id_dups AS (
SELECT
observation_id,
COUNT(observation_id) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY observation_id
HAVING n>1)
SELECT
"multiple rows with the same obs_id" as issue,
COUNT(*) AS n
FROM
observation_id_dups
UNION ALL
SELECT
"duplicates on whole row - excluding obs_id" as issue,
COUNT(*) AS n
FROM
whole_row_dups
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)
df = execute(client, query)

is_success = sum(df['n']) == 0
success_msg = 'No issue found.'
failure_msg = 'Duplicates found. See check description.'

render_message(df,
success_msg,
failure_msg,
is_success=is_success)
# -
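For context, a rule like `drop_row_duplicates` typically keeps one row per group of whole-row duplicates. A minimal sketch of that pattern in this notebook's idiom (illustrative only; the PARTITION BY list is abbreviated here, and this is not the actual implementation in cleaning_rules/drop_row_duplicates.py):

tpl = JINJA_ENV.from_string('''
SELECT * EXCEPT(rn)
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY -- all fields except observation_id, abbreviated --
                person_id, observation_concept_id, observation_date,
                observation_datetime, value_as_string, value_source_value
            ORDER BY observation_id) AS rn
    FROM `{{project_id}}.{{new_rdr}}.observation`)
WHERE rn = 1 -- keep one row per duplicate group --
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)

Keeping the smallest observation_id per group is one reasonable tie-break; the real rule may order differently.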



32 changes: 28 additions & 4 deletions data_steward/analytics/cdr_ops/combined.py
@@ -122,18 +122,18 @@
SELECT
"{{table_name}}" AS table_name
,"{{date_field}}" AS date_field
,t.{{date_field}} AS date_value
,DATE(t.{{date_field}}) AS date_value
,p.birth_datetime AS birth_datetime
FROM `{{dataset_id}}.{{table_name}}` t
JOIN `{{dataset_id}}.person` p
USING (person_id)
WHERE
(
-- age <= 0y --
t.{{date_field}} < DATE(p.birth_datetime)
DATE(t.{{date_field}}) < DATE(p.birth_datetime)
-- age >= 150y --
OR pipeline_tables.calculate_age(t.{{date_field}}, EXTRACT(DATE FROM p.birth_datetime)) >= 150
OR pipeline_tables.calculate_age(DATE(t.{{date_field}}), EXTRACT(DATE FROM p.birth_datetime)) >= 150
)
AND
p.birth_datetime IS NOT NULL
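
The added DATE() casts matter because `pipeline_tables.calculate_age` is a persistent BigQuery UDF that takes DATE arguments, so DATETIME fields must be cast before the call. Its definition is not part of this diff; a plausible equivalent, shown for illustration only:

CREATE OR REPLACE FUNCTION pipeline_tables.calculate_age(as_of DATE, dob DATE)
RETURNS INT64 AS (
    -- completed years: the raw year difference, minus one if the
    -- birthday has not yet occurred in the as_of year --
    DATE_DIFF(as_of, dob, YEAR) - IF(
        EXTRACT(MONTH FROM as_of) < EXTRACT(MONTH FROM dob)
        OR (EXTRACT(MONTH FROM as_of) = EXTRACT(MONTH FROM dob)
            AND EXTRACT(DAY FROM as_of) < EXTRACT(DAY FROM dob)), 1, 0)
);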
@@ -256,7 +256,7 @@
|| 'USING (' || table_name ||'_id) '
|| 'LEFT JOIN consented c '
|| ' USING (person_id)'
|| 'WHERE m.src_hpo_id <> "rdr" AND c.person_id IS NULL)'
|| 'WHERE m.src_hpo_id NOT IN (\\"ce\\", \\"vibrent\\", \\"healthpro\\") AND c.person_id IS NULL)'
, ' UNION ALL ')
FROM `{{DATASET_ID}}.INFORMATION_SCHEMA.COLUMNS` c
JOIN `{{DATASET_ID}}.__TABLES__` t
@@ -274,6 +274,30 @@
execute(client, query)
# -

# ## Verify Note text data

# +
query = f'''
SELECT 'note_text' AS field, note_text AS field_value, COUNT(note_text) AS row_count,
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_text
UNION ALL
SELECT 'note_title' AS field, note_title AS field_value, COUNT(note_title) AS row_count,
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_title
UNION ALL
SELECT 'note_source_value' AS field, note_source_value AS field_value, COUNT(note_source_value) AS row_count,
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_source_value
'''

execute(client, query)
# -
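
This verification pairs with the new replace_freetext_notes cleaning rule: after that rule runs, each of these fields should collapse to a small, fixed set of values, so the query should return only a handful of rows. A sketch of the kind of statement such a rule issues (the placeholder strings are hypothetical, not taken from the actual rule):

query = f'''
UPDATE `{PROJECT_ID}.{DATASET_ID}.note`
SET note_text = 'NO_TEXT',   -- hypothetical placeholder --
    note_title = 'NO_TITLE'  -- hypothetical placeholder --
WHERE TRUE
'''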

# ## Date and datetime fields should have the same date
# The date represented by associated `_date` and `_datetime` fields of the same
# row should be the same. If there are any discrepancies, there may be a bug in the
59 changes: 20 additions & 39 deletions data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py
@@ -61,6 +61,7 @@
project_id = ""
rt_dataset = ""
ct_dataset = ""
combined_dataset = ""
deid_sandbox = ""
earliest_ehr_date = ""
cut_off_date = ""
@@ -1015,7 +1016,7 @@ def my_sql(table_name, column_name):
FROM `{{project_id}}.{{ct_dataset}}.concept` c
JOIN `{{project_id}}.{{ct_dataset}}.{{table_name}}` ON (concept_id={{column_name}})
WHERE standard_concept !='S'
WHERE standard_concept not in ('S', 'C')
AND {{column_name}} !=0
""")
q = query.render(project_id=project_id,
@@ -1059,17 +1060,19 @@ def my_sql(table_name, column_name):
ignore_index=True)

# # Query 13: observation concept ids (4013886, 4135376, 4271761) that have dates equal to birth dates should be set to the CDR cutoff date
#
# Note: the CT person table does not contain exact birth dates, and some observations with exact date matches might not exist in RT; therefore, the combined dataset is used in this check.

# +

query = JINJA_ENV.from_string("""
WITH rows_having_brith_date as (
SELECT distinct observation_id
FROM
`{{project_id}}.{{rt_dataset}}.observation` ob
JOIN {{project_id}}.{{rt_dataset}}.person p USING (person_id)
SELECT distinct observation_id
FROM
`{{project_id}}.{{combined_dataset}}.observation` ob
JOIN `{{project_id}}.{{combined_dataset}}.person` p USING (person_id)
WHERE observation_concept_id in (4013886, 4135376, 4271761)
AND observation_date=DATE(p.birth_datetime)
)
@@ -1089,14 +1092,12 @@ def my_sql(table_name, column_name):
""")

q = query.render(project_id=project_id,
rt_dataset=rt_dataset,
ct_dataset=ct_dataset,
combined_dataset=combined_dataset,
cut_off_date=cut_off_date)
df1 = execute(client, q)
df1.shape
# -

df1
# -
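
The remediation this check verifies happens upstream, in a cleaning rule that shifts the matching dates to the CDR cutoff. An illustrative core statement (a sketch only, not the actual rule; the dataset it targets is an assumption):

query = JINJA_ENV.from_string("""
UPDATE `{{project_id}}.{{combined_dataset}}.observation` ob
SET observation_date = DATE('{{cut_off_date}}')
FROM `{{project_id}}.{{combined_dataset}}.person` p
WHERE ob.person_id = p.person_id
AND ob.observation_concept_id IN (4013886, 4135376, 4271761)
AND ob.observation_date = DATE(p.birth_datetime)
""")

A real rule would also keep observation_datetime consistent with the shifted date.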

if df1.iloc[:, 3].sum() == 0:
df = df.append(
@@ -1119,17 +1120,19 @@ def my_sql(table_name, column_name):

#
# # Query 14: all other observation concept ids with dates similar to birth dates, other than the 3 above, should be removed
#
# Related to CR year_of_birth_records_suppression

# +
query = JINJA_ENV.from_string("""
WITH rows_having_brith_date as (
WITH rows_having_brith_date as (
SELECT observation_id
FROM {{project_id}}.{{rt_dataset}}.observation ob
JOIN {{project_id}}.{{rt_dataset}}.person p USING (person_id)
WHERE observation_concept_id NOT IN (4013886, 4135376, 4271761)
AND observation_date=DATE(p.birth_datetime)
FROM {{project_id}}.{{ct_dataset}}.observation ob
JOIN {{project_id}}.{{ct_dataset}}.person p USING (person_id)
WHERE (observation_concept_id NOT IN (4013886, 4135376, 4271761) OR observation_concept_id IS NULL)
AND ABS(EXTRACT(YEAR FROM observation_date)- p.year_of_birth) < 2
)
SELECT
@@ -1146,13 +1149,11 @@ def my_sql(table_name, column_name):
""")

q = query.render(project_id=project_id,
rt_dataset=rt_dataset,
ct_dataset=ct_dataset)
df1 = execute(client, q)
df1.shape
# -

df1
# -

if df1.iloc[:, 3].sum() == 0:
df = df.append(
@@ -1299,7 +1300,6 @@ def query_template(table_era):
# **If check fails:**<br>
# * The issue `participant with multiple records` means that those participants have multiple rows in the wear_study table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table. <br>
# * The issue `not in person table` means that participants exist in the wear_study table that aren't in the person table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.<br>
# * The issue `no primary consent` means that participants exist in the wear_study table that do not have proper primary consent. Investigate the issue. It is possible that there is another way to determine primary consent. <br>

# +
query = JINJA_ENV.from_string("""
@@ -1328,26 +1328,6 @@ def query_template(table_era):
SELECT person_id
FROM `{{project_id}}.{{ct_dataset}}.person` o
)
UNION ALL
SELECT
'no primary consent' as issue,
COUNT(person_id) as bad_rows
FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
WHERE person_id not in ( -- aou consenting participants --
SELECT cte.person_id
FROM latest_primary_consent_records cte
LEFT JOIN ( -- any positive primary consent --
SELECT *
FROM `{{project_id}}.{{ct_dataset}}.observation`
WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent')
AND value_as_concept_id = 45877994) o
ON cte.person_id = o.person_id
AND cte.latest_date = o.observation_date
WHERE o.person_id IS NOT NULL
)
""")
q = query.render(project_id=project_id, ct_dataset=ct_dataset)
df1 = execute(client, q)
@@ -1370,6 +1350,7 @@ def query_template(table_era):
ignore_index=True)
# -


df1

# +
@@ -1529,4 +1510,4 @@ def highlight_cells(val):
return f'background-color: {color}'


df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})
df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})