Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DC-3866] Create CR to suppress note fields #1878

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions data_steward/cdr_cleaner/clean_cdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from cdr_cleaner.cleaning_rules.drop_multiple_measurements import DropMultipleMeasurements
from cdr_cleaner.cleaning_rules.drop_participants_without_any_basics import DropParticipantsWithoutAnyBasics
from cdr_cleaner.cleaning_rules.clean_survey_conduct_recurring_surveys import CleanSurveyConductRecurringSurveys
from cdr_cleaner.cleaning_rules.suppress_note_fields import SuppressNoteFields
from cdr_cleaner.cleaning_rules.update_survey_source_concept_id import UpdateSurveySourceConceptId
from cdr_cleaner.cleaning_rules.drop_unverified_survey_data import DropUnverifiedSurveyData
from cdr_cleaner.cleaning_rules.drug_refills_days_supply import DrugRefillsDaysSupply
Expand Down Expand Up @@ -175,6 +176,7 @@
(RemoveRecordsWithWrongDate,),
(RemoveInvalidProcedureSourceRecords,),
(CalculatePrimaryDeathRecord,),
(SuppressNoteFields,),
(CleanMappingExtTables,), # should be one of the last cleaning rules run
]

Expand Down
134 changes: 134 additions & 0 deletions data_steward/cdr_cleaner/cleaning_rules/suppress_note_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# coding=utf-8
"""
Removes Note fields that need to be suppressed.
Applied at the combined stage before combined release

Jira issues = DC-3866
"""
# Python imports
import logging

# Project imports
from gcloud.bq import BigQueryClient
import constants.cdr_cleaner.clean_cdr as cdr_consts
from cdr_cleaner.cleaning_rules.base_cleaning_rule import BaseCleaningRule, query_spec_list
from common import JINJA_ENV, NOTE

LOGGER = logging.getLogger(__name__)

JIRA_ISSUE_NUMBERS = ['DC3866']

# Snapshot the entire note table into the sandbox before suppression so the
# original field values can be recovered if needed.
SANDBOX_NOTE_TABLE = JINJA_ENV.from_string("""
CREATE OR REPLACE TABLE `{{project_id}}.{{sandbox_dataset_id}}.{{sandbox_table}}` AS (
SELECT * FROM `{{project_id}}.{{dataset_id}}.note`
)
""")

# Suppress note fields in every row: blank the free-text columns and NULL the
# identifying columns.
# Fix: BigQuery DML is `UPDATE <table> SET ...` -- the original
# `UPDATE TABLE <table>` is a syntax error and the query would never run.
SUPPRESS_NOTE_QUERY = JINJA_ENV.from_string("""
UPDATE `{{project_id}}.{{dataset_id}}.note`
SET note_title = "",
note_text = "",
provider_id = NULL,
note_source_value = NULL
WHERE TRUE
""")


class SuppressNoteFields(BaseCleaningRule):
    """
    Suppress free-text and identifying fields in the note table.

    Sandboxes the full note table, then blanks note_title/note_text and
    NULLs provider_id/note_source_value for every row. Applied at the
    combined stage before the combined release.
    """

    def __init__(self,
                 project_id,
                 dataset_id,
                 sandbox_dataset_id,
                 table_namer=None):
        """
        Initialize the class with proper information.

        Set the issue numbers, description and affected datasets. As other tickets may affect
        this SQL, append them to the list of Jira Issues.
        DO NOT REMOVE ORIGINAL JIRA ISSUE NUMBERS!

        :param project_id: identifies the BigQuery project
        :param dataset_id: identifies the dataset containing the note table
        :param sandbox_dataset_id: identifies the dataset used to sandbox records
        :param table_namer: optional prefix applied to sandbox table names
        """
        desc = 'Nulls string fields in the note table.'

        super().__init__(issue_numbers=JIRA_ISSUE_NUMBERS,
                         description=desc,
                         affected_datasets=[cdr_consts.COMBINED],
                         affected_tables=[NOTE],
                         project_id=project_id,
                         dataset_id=dataset_id,
                         sandbox_dataset_id=sandbox_dataset_id,
                         table_namer=table_namer)

    def get_sandbox_tablenames(self) -> list:
        """
        Return the sandbox table name for each affected table (here, only note).
        """
        return [self.sandbox_table_for(table) for table in self.affected_tables]

    def setup_rule(self, client: BigQueryClient, *args, **keyword_args) -> None:
        """
        Run any required steps before the rule's queries execute.

        This rule requires no setup; sandboxing is handled by the first
        query returned from get_query_specs.
        """
        pass

    def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
        """
        Return a list of dictionary query specifications.

        The first query sandboxes the note table; the second suppresses the
        note fields in place.

        :return: A list of dictionaries. Each dictionary contains a single query
            and a specification for how to execute that query. The specifications
            are optional but the query is required.
        """
        queries_list = []

        # Sandbox the note table first so suppressed values are preserved.
        sandbox_query = dict()
        sandbox_query[cdr_consts.QUERY] = SANDBOX_NOTE_TABLE.render(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            sandbox_dataset_id=self.sandbox_dataset_id,
            sandbox_table=self.get_sandbox_tablenames()[0])
        queries_list.append(sandbox_query)

        # Then blank/NULL the suppressed fields for every row.
        suppress_query = dict()
        suppress_query[cdr_consts.QUERY] = SUPPRESS_NOTE_QUERY.render(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            sandbox_dataset_id=self.sandbox_dataset_id,
            sandbox_table=self.get_sandbox_tablenames()[0])
        queries_list.append(suppress_query)

        return queries_list

    def setup_validation(self, client: BigQueryClient) -> None:
        """
        Run required steps for validation setup.

        Not implemented for this rule.
        """
        pass

    def validate_rule(self, client: BigQueryClient) -> None:
        """
        Validates the cleaning rule which deletes or updates the data from the tables.

        Not implemented for this rule.
        """
        pass


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()
    rules = [(SuppressNoteFields,)]

    if ARGS.list_queries:
        # Dry-run mode: log the rendered queries without executing them.
        clean_engine.add_console_logging()
        for query in clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 rules):
            LOGGER.info(query)
    else:
        # Execute the rule against the target dataset.
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id, rules)