Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories.

base repository: all-of-us/curation
base: 9d2a0db00cc472c7dcd1b9585ae55726c214aea2
..
head repository: all-of-us/curation
compare: 259d9323a4089c211dac54595b97cd353539036a
Showing with 2,395 additions and 1,926 deletions.
  1. +0 −126 data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py
  2. +55 −0 data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
  3. +28 −4 data_steward/analytics/cdr_ops/combined.py
  4. +20 −39 data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py
  5. +2 −3 data_steward/analytics/cdr_ops/curation_dashboard.py
  6. +8 −32 data_steward/analytics/cdr_ops/ehr_union_qc.py
  7. +65 −3 data_steward/analytics/cdr_ops/fitbit_qc.py
  8. +5 −2 data_steward/analytics/cdr_ops/notebook_utils.py
  9. +4 −2 data_steward/analytics/cdr_ops/participant_validation_qc.py
  10. +12 −14 data_steward/analytics/cdr_ops/raw_fitbit_qc.py
  11. +54 −10 data_steward/analytics/cdr_ops/raw_rdr_export_qc.py
  12. +159 −49 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_base_qa_report1.py
  13. +7 −29 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py
  14. +10 −12 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report3_col_suppression.py
  15. +145 −140 data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py
  16. +0 −831 data_steward/analytics/cdr_ops/vocabulary/single_survey_mapping_validation.py
  17. +193 −76 data_steward/analytics/cdr_ops/vocabulary/validate_proposed_mapping_single_survey.py
  18. +4 −0 data_steward/bq_utils.py
  19. +7 −0 data_steward/cdr_cleaner/clean_cdr.py
  20. +37 −23 data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts.py
  21. +2 −2 data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup.py
  22. +44 −37 data_steward/cdr_cleaner/cleaning_rules/create_expected_ct_list.py
  23. +2 −1 data_steward/cdr_cleaner/cleaning_rules/create_person_ext_table.py
  24. +24 −8 data_steward/cdr_cleaner/cleaning_rules/date_unshift_cope_responses.py
  25. +175 −0 data_steward/cdr_cleaner/cleaning_rules/deid/generalize_indian_health_services.py
  26. +163 −0 data_steward/cdr_cleaner/cleaning_rules/drop_row_duplicates.py
  27. +2 −4 data_steward/cdr_cleaner/cleaning_rules/ensure_date_datetime_consistency.py
  28. +22 −14 data_steward/cdr_cleaner/cleaning_rules/generate_wear_study_table.py
  29. +1 −1 data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables.py
  30. +117 −0 data_steward/cdr_cleaner/cleaning_rules/replace_freetext_notes.py
  31. +1 −1 data_steward/cdr_cleaner/cleaning_rules/repopulate_person_post_deid.py
  32. +3 −0 data_steward/cdr_cleaner/cleaning_rules/temporal_consistency.py
  33. +4 −2 data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data.py
  34. +10 −1 data_steward/constants/utils/bq.py
  35. +32 −2 data_steward/gcloud/bq/__init__.py
  36. +3 −2 data_steward/resources.py
  37. +28 −13 data_steward/tools/add_hpo.py
  38. +17 −36 data_steward/tools/create_combined_backup_dataset.py
  39. +9 −101 data_steward/tools/create_combined_dataset.py
  40. +192 −0 data_steward/tools/generate_unioned_ehr_dataset.py
  41. +0 −163 data_steward/tools/generate_unioned_ehr_dataset.sh
  42. +133 −0 data_steward/tools/pipeline_utils.py
  43. +9 −9 data_steward/tools/run_deid.py
  44. +17 −0 data_steward/validation/participants/snapshot_validation_dataset.py
  45. +9 −3 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/bigquery_tests_base.py
  46. +4 −2 ...ation_tests/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts_test.py
  47. +25 −10 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/create_expected_ct_list_test.py
  48. +55 −8 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/date_unshift_cope_responses_test.py
  49. +105 −0 ...tion_tests/data_steward/cdr_cleaner/cleaning_rules/deid/generalize_indian_health_services_test.py
  50. +120 −0 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/drop_row_duplicates_test.py
  51. +43 −4 ...ntegration_tests/data_steward/cdr_cleaner/cleaning_rules/ensure_date_datetime_consistency_test.py
  52. +8 −6 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/generate_wear_study_table_test.py
  53. +131 −0 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/replace_freetext_notes_test.py
  54. +1 −1 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/repopulate_person_post_deid_test.py
  55. +10 −12 tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py
  56. +3 −4 tests/integration_tests/data_steward/tools/create_combined_backup_dataset_test.py
  57. +0 −25 tests/unit_tests/data_steward/bq_utils_test.py
  58. +4 −6 tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/generate_ext_tables_test.py
  59. +1 −2 tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables_test.py
  60. +8 −12 tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py
  61. +24 −31 tests/unit_tests/data_steward/tools/add_hpo_test.py
  62. +19 −8 tests/unit_tests/data_steward/tools/run_deid_test.py

126 changes: 0 additions & 126 deletions data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py
This file was deleted.

55 changes: 55 additions & 0 deletions data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
@@ -1313,4 +1313,59 @@
query = tpl.render(new_rdr=new_rdr, raw_rdr=raw_rdr, project_id=project_id)
execute(client, query)

# # Check to catch duplicates in observation
# If the check fails, find extra information on each issue type below.
# * "multiple rows with the same obs_id": there should not be any rows that share an observation_id. A similar check exists in the raw RDR QC notebook and should have caught occurrences of this issue in the raw export. If this check fails, an RDR cleaning rule is most likely the culprit.
# * "duplicates on whole row - excluding obs_id": these are rows with identical data in every field except observation_id. The cleaning rule 'drop_row_duplicates' was created to remove duplicates of this type (a sketch of the pattern it applies follows this check).

# +
tpl = JINJA_ENV.from_string('''
WITH whole_row_dups AS (
SELECT
COUNT(*) as n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY -- all fields except observation_id --
person_id, observation_concept_id, observation_date, observation_datetime, observation_type_concept_id,
value_as_number, value_as_string, value_as_concept_id, qualifier_concept_id, unit_concept_id, provider_id,
visit_occurrence_id, visit_detail_id, observation_source_value, observation_source_concept_id, unit_source_value,
qualifier_source_value, value_source_concept_id, value_source_value, questionnaire_response_id
HAVING n > 1)
, observation_id_dups AS (
SELECT
observation_id,
COUNT(observation_id) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY observation_id
HAVING n>1)
SELECT
"multiple rows with the same obs_id" as issue,
COUNT(*) AS n
FROM
observation_id_dups
UNION ALL
SELECT
"duplicates on whole row - excluding obs_id" as issue,
COUNT(*) AS n
FROM
whole_row_dups
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)
df = execute(client, query)

is_success = sum(df['n']) == 0
success_msg = 'No issue found.'
failure_msg = 'Duplicates found. See check description.'

render_message(df,
success_msg,
failure_msg,
is_success=is_success)
# -
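For context, a rule like `drop_row_duplicates` typically keeps one row per group of whole-row duplicates. A minimal sketch of that pattern in this notebook's idiom (illustrative only; the PARTITION BY list is abbreviated here, and this is not the actual implementation in cleaning_rules/drop_row_duplicates.py):

tpl = JINJA_ENV.from_string('''
SELECT * EXCEPT(rn)
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY -- all fields except observation_id, abbreviated --
                person_id, observation_concept_id, observation_date,
                observation_datetime, value_as_string, value_source_value
            ORDER BY observation_id) AS rn
    FROM `{{project_id}}.{{new_rdr}}.observation`)
WHERE rn = 1 -- keep one row per duplicate group --
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)

Keeping the smallest observation_id per group is one reasonable tie-break; the real rule may order differently.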



32 changes: 28 additions & 4 deletions data_steward/analytics/cdr_ops/combined.py
@@ -122,18 +122,18 @@
SELECT
"{{table_name}}" AS table_name
,"{{date_field}}" AS date_field
,t.{{date_field}} AS date_value
,DATE(t.{{date_field}}) AS date_value
,p.birth_datetime AS birth_datetime
FROM `{{dataset_id}}.{{table_name}}` t
JOIN `{{dataset_id}}.person` p
USING (person_id)
WHERE
(
-- age <= 0y --
t.{{date_field}} < DATE(p.birth_datetime)
DATE(t.{{date_field}}) < DATE(p.birth_datetime)
-- age >= 150y --
OR pipeline_tables.calculate_age(t.{{date_field}}, EXTRACT(DATE FROM p.birth_datetime)) >= 150
OR pipeline_tables.calculate_age(DATE(t.{{date_field}}), EXTRACT(DATE FROM p.birth_datetime)) >= 150
)
AND
p.birth_datetime IS NOT NULL
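
The added DATE() casts matter because `pipeline_tables.calculate_age` is a persistent BigQuery UDF that takes DATE arguments, so DATETIME fields must be cast before the call. Its definition is not part of this diff; a plausible equivalent, shown for illustration only:

CREATE OR REPLACE FUNCTION pipeline_tables.calculate_age(as_of DATE, dob DATE)
RETURNS INT64 AS (
    -- completed years: the raw year difference, minus one if the
    -- birthday has not yet occurred in the as_of year --
    DATE_DIFF(as_of, dob, YEAR) - IF(
        EXTRACT(MONTH FROM as_of) < EXTRACT(MONTH FROM dob)
        OR (EXTRACT(MONTH FROM as_of) = EXTRACT(MONTH FROM dob)
            AND EXTRACT(DAY FROM as_of) < EXTRACT(DAY FROM dob)), 1, 0)
);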
@@ -256,7 +256,7 @@
|| 'USING (' || table_name ||'_id) '
|| 'LEFT JOIN consented c '
|| ' USING (person_id)'
|| 'WHERE m.src_hpo_id <> "rdr" AND c.person_id IS NULL)'
|| 'WHERE m.src_hpo_id NOT IN (\\"ce\\", \\"vibrent\\", \\"healthpro\\") AND c.person_id IS NULL)'
, ' UNION ALL ')
FROM `{{DATASET_ID}}.INFORMATION_SCHEMA.COLUMNS` c
JOIN `{{DATASET_ID}}.__TABLES__` t
@@ -274,6 +274,30 @@
execute(client, query)
# -

# ## Verify Note text data

# +
query = f'''
SELECT 'note_text' AS field, note_text AS field_value, COUNT(note_text) AS row_count,
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_text
UNION ALL
SELECT 'note_title' AS field, note_title AS field_value, COUNT(note_title) AS row_count,
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_title
UNION ALL
SELECT 'note_source_value' AS field, note_source_value AS field_value, COUNT(note_source_value) AS row_count,
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_source_value
'''

execute(client, query)
# -
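
This verification pairs with the new replace_freetext_notes cleaning rule: after that rule runs, each of these fields should collapse to a small, fixed set of values, so the query should return only a handful of rows. A sketch of the kind of statement such a rule issues (the placeholder strings are hypothetical, not taken from the actual rule):

query = f'''
UPDATE `{PROJECT_ID}.{DATASET_ID}.note`
SET note_text = 'NO_TEXT',   -- hypothetical placeholder --
    note_title = 'NO_TITLE'  -- hypothetical placeholder --
WHERE TRUE
'''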

# ## Date and datetime fields should have the same date
# The date represented by associated `_date` and `_datetime` fields of the same
# row should be the same. If there are any discrepancies, there may be a bug in the
59 changes: 20 additions & 39 deletions data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py
@@ -61,6 +61,7 @@
project_id = ""
rt_dataset = ""
ct_dataset = ""
combined_dataset = ""
deid_sandbox = ""
earliest_ehr_date = ""
cut_off_date = ""
@@ -1015,7 +1016,7 @@ def my_sql(table_name, column_name):
FROM `{{project_id}}.{{ct_dataset}}.concept` c
JOIN `{{project_id}}.{{ct_dataset}}.{{table_name}}` ON (concept_id={{column_name}})
WHERE standard_concept !='S'
WHERE standard_concept not in ('S', 'C')
AND {{column_name}} !=0
""")
q = query.render(project_id=project_id,
@@ -1059,17 +1060,19 @@ def my_sql(table_name, column_name):
ignore_index=True)

# # Query 13: observation concept ids (4013886, 4135376, 4271761) that have dates equal to birth dates should be set to the CDR cutoff date
#
# Note: the CT person table does not contain exact birth dates, and some observations with exact date matches might not exist in RT; therefore, the combined dataset is used in this check.

# +

query = JINJA_ENV.from_string("""
WITH rows_having_brith_date as (
SELECT distinct observation_id
FROM
`{{project_id}}.{{rt_dataset}}.observation` ob
JOIN {{project_id}}.{{rt_dataset}}.person p USING (person_id)
SELECT distinct observation_id
FROM
`{{project_id}}.{{combined_dataset}}.observation` ob
JOIN `{{project_id}}.{{combined_dataset}}.person` p USING (person_id)
WHERE observation_concept_id in (4013886, 4135376, 4271761)
AND observation_date=DATE(p.birth_datetime)
)
@@ -1089,14 +1092,12 @@ def my_sql(table_name, column_name):
""")

q = query.render(project_id=project_id,
rt_dataset=rt_dataset,
ct_dataset=ct_dataset,
combined_dataset=combined_dataset,
cut_off_date=cut_off_date)
df1 = execute(client, q)
df1.shape
# -

df1
# -
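
The remediation this check verifies happens upstream, in a cleaning rule that shifts the matching dates to the CDR cutoff. An illustrative core statement (a sketch only, not the actual rule; the dataset it targets is an assumption):

query = JINJA_ENV.from_string("""
UPDATE `{{project_id}}.{{combined_dataset}}.observation` ob
SET observation_date = DATE('{{cut_off_date}}')
FROM `{{project_id}}.{{combined_dataset}}.person` p
WHERE ob.person_id = p.person_id
AND ob.observation_concept_id IN (4013886, 4135376, 4271761)
AND ob.observation_date = DATE(p.birth_datetime)
""")

A real rule would also keep observation_datetime consistent with the shifted date.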

if df1.iloc[:, 3].sum() == 0:
df = df.append(
@@ -1119,17 +1120,19 @@ def my_sql(table_name, column_name):

#
# # Query 14: all other observation concept ids with dates similar to birth dates, other than the 3 above, should be removed
#
# Related to CR year_of_birth_records_suppression

# +
query = JINJA_ENV.from_string("""
WITH rows_having_brith_date as (
WITH rows_having_brith_date as (
SELECT observation_id
FROM {{project_id}}.{{rt_dataset}}.observation ob
JOIN {{project_id}}.{{rt_dataset}}.person p USING (person_id)
WHERE observation_concept_id NOT IN (4013886, 4135376, 4271761)
AND observation_date=DATE(p.birth_datetime)
FROM {{project_id}}.{{ct_dataset}}.observation ob
JOIN {{project_id}}.{{ct_dataset}}.person p USING (person_id)
WHERE (observation_concept_id NOT IN (4013886, 4135376, 4271761) OR observation_concept_id IS NULL)
AND ABS(EXTRACT(YEAR FROM observation_date)- p.year_of_birth) < 2
)
SELECT
@@ -1146,13 +1149,11 @@ def my_sql(table_name, column_name):
""")

q = query.render(project_id=project_id,
rt_dataset=rt_dataset,
ct_dataset=ct_dataset)
df1 = execute(client, q)
df1.shape
# -

df1
# -

if df1.iloc[:, 3].sum() == 0:
df = df.append(
@@ -1299,7 +1300,6 @@ def query_template(table_era):
# **If check fails:**<br>
# * The issue `participant with multiple records` means that those participants have multiple rows in the wear_study table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table. <br>
# * The issue `not in person table` means that participants exist in the wear_study table that aren't in the person table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.<br>
# * The issue `no primary consent` means that participants exist in the wear_study table that do not have proper primary consent. Investigate the issue. It is possible that there is another way to determine primary consent. <br>

# +
query = JINJA_ENV.from_string("""
@@ -1328,26 +1328,6 @@ def query_template(table_era):
SELECT person_id
FROM `{{project_id}}.{{ct_dataset}}.person` o
)
UNION ALL
SELECT
'no primary consent' as issue,
COUNT(person_id) as bad_rows
FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
WHERE person_id not in ( -- aou consenting participants --
SELECT cte.person_id
FROM latest_primary_consent_records cte
LEFT JOIN ( -- any positive primary consent --
SELECT *
FROM `{{project_id}}.{{ct_dataset}}.observation`
WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent')
AND value_as_concept_id = 45877994) o
ON cte.person_id = o.person_id
AND cte.latest_date = o.observation_date
WHERE o.person_id IS NOT NULL
)
""")
q = query.render(project_id=project_id, ct_dataset=ct_dataset)
df1 = execute(client, q)
@@ -1370,6 +1350,7 @@ def query_template(table_era):
ignore_index=True)
# -


df1

# +
@@ -1529,4 +1510,4 @@ def highlight_cells(val):
return f'background-color: {color}'


df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})
df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})