diff --git a/app/workflows/anonymize_case_data/README.md b/app/workflows/anonymize_case_data/README.md
index 24ae30f7..d65afb9e 100644
--- a/app/workflows/anonymize_case_data/README.md
+++ b/app/workflows/anonymize_case_data/README.md
@@ -120,9 +120,9 @@ The checkbox `Suppress boolean False / binary 0` is also selected by default. Th
 
 Rename `age` to `age_range` and `quarter` to `period`.
 
-#### Evaluating synthesizability
+#### Evaluating anonymizability
 
-The `Synthesizability summary` gives an initial indication of how easy it will be to generate high-accurary synthetic data given the number of attribute combinations in the final sensitive dataset. The smaller each of these numbers, the better:
+The `Anonymizability summary` gives an initial indication of how easy it will be to generate high-accuracy anonymous data given the number of attribute combinations in the final sensitive dataset. The smaller each of these numbers, the better:
 
 - `Number of selected columns`: The number of columns after all data transformations
 - `Number of distinct attribute values`: The number of distinct attribute values across selected columns
@@ -132,7 +132,7 @@ The `Synthesizability summary` gives an initial indication of how easy it will b
 - `Typical combinations per record`: The number of possible combinations in the typical number of attribute values(2^typical_values)
 - `Excess combinations ratio`: Theoretical combinations per record / Typical combinations per record
 
-The last metric, `Excess combinations ratio`, is the main one to pay attention to in terms of synthesizability. As a rule of thumb, try to keep this ratio at or below `5`. The general idea here is that datasets should have sufficient records to support all possible combinations of attribute values, given the number of distinct attribute values in each column. Not all combinations will be present in most cases – data records tend to cluster around certain attribute patterns – so it is ok for this value to be greater than `1`. How far it can actually go and still yield high-accuracy synthetic data depends on how many of these possible attribute combinations are actually observed in the data.
+The last metric, `Excess combinations ratio`, is the main one to pay attention to in terms of anonymizability. As a rule of thumb, try to keep this ratio at or below `5`. The general idea here is that datasets should have sufficient records to support all possible combinations of attribute values, given the number of distinct attribute values in each column. Not all combinations will be present in most cases – data records tend to cluster around certain attribute patterns – so it is ok for this value to be greater than `1`. How far it can actually go and still yield high-accuracy anonymous data depends on how many of these possible attribute combinations are actually observed in the data.
 
 ### Generating anonymous data
diff --git a/app/workflows/anonymize_case_data/workflow.py b/app/workflows/anonymize_case_data/workflow.py
index f94f0d87..babb410a 100644
--- a/app/workflows/anonymize_case_data/workflow.py
+++ b/app/workflows/anonymize_case_data/workflow.py
@@ -65,35 +65,43 @@ def create(sv: ds_variables.SessionVariables, workflow: None):
         syn_stats = acd.analyze_synthesizability(sv.anonymize_sensitive_df.value)
-        st.markdown("### Synthesizability summary")
+        st.markdown("### Anonymizability summary")
         st.markdown(
             f"Number of selected columns: **{syn_stats.num_cols}**",
-            help="This is the number of columns you selected for processing. The more columns you select, the harder it will be to synthesize data.",
+            help="This is the number of columns you selected for processing. The more columns you select, the harder it will be to anonymize data.",
         )
         st.markdown(
             f"Number of distinct attribute values: **{syn_stats.overall_att_count}**",
-            help="This is the total number of distinct attribute values across all selected columns. The more distinct values, the harder it will be to synthesize data.",
+            help="This is the total number of distinct attribute values across all selected columns. The more distinct values, the harder it will be to anonymize data.",
         )
         st.markdown(
             f"Theoretical attribute combinations: **{syn_stats.possible_combinations}**",
-            help="This is the total number of possible attribute combinations across all selected columns. The higher this number, the harder it will be to synthesize data.",
+            help="This is the total number of possible attribute combinations across all selected columns. The higher this number, the harder it will be to anonymize data.",
         )
         st.markdown(
             f"Theoretical combinations per record: **{syn_stats.possible_combinations_per_row}**",
-            help="This is the mean number of possible attribute combinations per sensitive case record. The higher this number, the harder it will be to synthesize data.",
+            help="This is the mean number of possible attribute combinations per sensitive case record. The higher this number, the harder it will be to anonymize data.",
         )
         st.markdown(
             f"Typical values per record: **{round(syn_stats.mean_vals_per_record, 1)}**",
-            help="This is the mean number of actual attribute values per sensitive case record. The higher this number, the harder it will be to synthesize data.",
+            help="This is the mean number of actual attribute values per sensitive case record. The higher this number, the harder it will be to anonymize data.",
         )
         st.markdown(
             f"Typical combinations per record: **{round(syn_stats.max_combinations_per_record, 1)}**",
             help="This is the number of attribute combinations in a record with the typical number of values.",
         )
         st.markdown(
-            f"Excess combinations ratio: **{round(syn_stats.excess_combinations_ratio, 1)}**",
-            help="This is the ratio of theoretical combinations per record to the typical combinations per record. The higher this number, the harder it will be to synthesize data. **Rule of thumb**: Aim for a ratio of **5** or lower.",
+            f"**Excess combinations ratio: {round(syn_stats.excess_combinations_ratio, 1)}**",
+            help="This is the ratio of theoretical combinations per record to the typical combinations per record. The higher this number, the harder it will be to anonymize data. **Rule of thumb**: Aim for a ratio of **5** or lower.",
         )
+        if syn_stats.excess_combinations_ratio <= 5:
+            st.success(
+                "This dataset is likely to be anonymizable. You can proceed to anonymize the data."
+            )
+        else:
+            st.warning(
+                "This dataset may be difficult to anonymize. You may need to reduce the number of columns or attribute values to proceed."
+            )
 
     with generate_tab:
         if len(sv.anonymize_sensitive_df.value) == 0:
@@ -112,7 +120,6 @@ def create(sv: ds_variables.SessionVariables, workflow: None):
             )
         with b2:
             if st.button("Anonymize data"):
-                print("Anonymizing data...")
                 sv.anonymize_epsilon.value = epsilon
                 df = sv.anonymize_sensitive_df.value
                 with st.spinner("Anonymizing data..."):
@@ -123,7 +130,10 @@ def create(sv: ds_variables.SessionVariables, workflow: None):
                 sv.anonymize_synthetic_df.value = acd.synthetic_df
                 sv.anonymize_aggregate_df.value = acd.aggregate_df
                 sv.anonymize_delta.value = f"{acd.delta:.2e}"
-
+                if epsilon > 12:
+                    st.warning(
+                        "Epsilon is above the recommended threshold of 12"
+                    )
         st.markdown(
             "#### Analyze data",
             help="Tables show three evaluation metrics for each **Length** of attribute combination up to 4, plus an **Overall** average.\n\n- **Count +/- Error** is the average number of records for the combination length +/- the average absolute error in the number of records.\n- **Suppressed %** is the percentage of the total attribute counts that were suppressed, i.e., present in the Sensitive data but not the Aggregate/Synthetic data.\n- **Fabricated %** is the percentage of the total attribute counts that were fabricated, i.e., present in the Aggregate/Synthetic data but not the Sensitive data.\n\nPercentages are calculated with respect to attribute counts in the Sensitive data.\n\n**Rule of thumb**: For the Synthetic data, aim to keep the Overall Error below the Overall Count, Suppressed % below 10%, and Fabricated % below 1%.",
@@ -139,12 +149,24 @@ def create(sv: ds_variables.SessionVariables, workflow: None):
                 hide_index=True,
                 use_container_width=False,
             )
+            error_str = acd.aggregate_error_report[acd.aggregate_error_report["Length"] == "Overall"]["Count +/- Error"].values[0]
+            mean_count, mean_error = error_str.split(" +/- ")
+            if float(mean_error) <= float(mean_count):
+                st.success("Error <= Count on average: data quality is good")
+            else:
+                st.warning("Error > Count on average: simplify sensitive data to improve")
             st.markdown("###### Synthetic data quality")
             st.dataframe(
                 acd.synthetic_error_report,
                 hide_index=True,
                 use_container_width=False,
             )
+            error_str = acd.synthetic_error_report[acd.synthetic_error_report["Length"] == "Overall"]["Count +/- Error"].values[0]
+            mean_count, mean_error = error_str.split(" +/- ")
+            if float(mean_error) <= float(mean_count):
+                st.success("Error <= Count on average: data quality is good")
+            else:
+                st.warning("Error > Count on average: simplify sensitive data to improve")
             st.warning(
                 "**Caution**: These tables should only be used to evaluate the quality of data for release. Sharing them compromises privacy."
             )
diff --git a/intelligence_toolkit/anonymize_case_data/api.py b/intelligence_toolkit/anonymize_case_data/api.py
index 763e9b1a..f1f70016 100644
--- a/intelligence_toolkit/anonymize_case_data/api.py
+++ b/intelligence_toolkit/anonymize_case_data/api.py
@@ -2,7 +2,6 @@
 # Licensed under the MIT license. See LICENSE file in the project.
 
 import math
-
 import pandas as pd
 import plotly.graph_objects as go
 from pacsynth import (
@@ -41,7 +40,9 @@ def analyze_synthesizability(self, df: pd.DataFrame) -> SynthesizabilityStatisti
             distinct_values = [
                 x for x in df[col].astype(str).unique() if x not in ["", "nan"]
             ]
-            distinct_counts.append(len(distinct_values))
+            num = len(distinct_values)
+            if num > 0:
+                distinct_counts.append(num)
         distinct_counts.sort()
         overall_att_count = sum(distinct_counts)
         possible_combinations = math.prod(distinct_counts)
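The `Anonymizability summary` metrics renamed in this patch can be sanity-checked outside the app. Below is a minimal Python sketch, assuming a pandas DataFrame of sensitive case records; `anonymizability_summary` is a hypothetical helper, and treating "theoretical combinations per record" as the total divided by the record count is an assumption. The authoritative logic is the `analyze_synthesizability` method in `intelligence_toolkit/anonymize_case_data/api.py`.

```python
import math

import pandas as pd


def anonymizability_summary(df: pd.DataFrame) -> float:
    """Hypothetical sketch of the README metrics; not the library implementation."""
    # Distinct non-empty values per selected column, mirroring the api.py hunk
    # above (including its new guard that skips all-empty columns).
    distinct_counts = []
    for col in df.columns:
        values = [x for x in df[col].astype(str).unique() if x not in ["", "nan"]]
        if len(values) > 0:
            distinct_counts.append(len(values))
    # Theoretical attribute combinations across all selected columns.
    possible_combinations = math.prod(distinct_counts)
    # Assumption: "theoretical combinations per record" spreads this total
    # over the available records.
    possible_per_record = possible_combinations / len(df)
    # Typical values per record: mean count of non-empty attributes in a row.
    mean_vals_per_record = (
        df.astype(str)
        .apply(lambda row: sum(v not in ["", "nan"] for v in row), axis=1)
        .mean()
    )
    # Typical combinations per record: 2^typical_values, per the README.
    typical_combinations = 2**mean_vals_per_record
    # Excess combinations ratio: theoretical per record / typical per record.
    return possible_per_record / typical_combinations
```

By the README's rule of thumb, a returned ratio at or below `5` suggests the dataset is likely to be anonymizable.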
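The Overall-row quality check added to `workflow.py` is duplicated for the aggregate and synthetic reports. If it were ever factored out, the shared helper might look like the sketch below; only the `Length`, `Overall`, and `Count +/- Error` names come from the diff, and the function name is illustrative.

```python
import pandas as pd
import streamlit as st


def report_overall_quality(error_report: pd.DataFrame) -> None:
    """Hypothetical helper mirroring the duplicated check in workflow.py."""
    # The Overall row's "Count +/- Error" cell holds a string such as "12.3 +/- 4.5".
    overall = error_report[error_report["Length"] == "Overall"]
    mean_count, mean_error = overall["Count +/- Error"].values[0].split(" +/- ")
    if float(mean_error) <= float(mean_count):
        st.success("Error <= Count on average: data quality is good")
    else:
        st.warning("Error > Count on average: simplify sensitive data to improve")
```

The two added blocks would then reduce to `report_overall_quality(acd.aggregate_error_report)` and `report_overall_quality(acd.synthetic_error_report)`.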