From 335a16c6521bc3370bc745365f407fc07f7b21d6 Mon Sep 17 00:00:00 2001 From: Malinda Date: Tue, 28 Jun 2022 19:27:18 -0700 Subject: [PATCH 1/3] use np.sum to improve performance and make the code succinct --- tensorflow_transform/info_theory.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tensorflow_transform/info_theory.py b/tensorflow_transform/info_theory.py index f536d105..4372276e 100644 --- a/tensorflow_transform/info_theory.py +++ b/tensorflow_transform/info_theory.py @@ -52,12 +52,10 @@ def calculate_partial_expected_mutual_information(n, x_i, y_j): if x_i == 0 or y_j == 0: return 0 coefficient = (-log2(x_i) - log2(y_j) + log2(n)) - sum_probability = 0.0 - partial_result = 0.0 - for n_j, p_j in _hypergeometric_pmf(n, x_i, y_j): - if n_j != 0: - partial_result += n_j * (coefficient + log2(n_j)) * p_j - sum_probability += p_j + hyp_geo_pmf = _hypergeometric_pmf(n, x_i, y_j) + sum_probability = np.sum([p_j for n_j,p_j in hyp_geo_pmf]) + partial_result = np.sum([n_j * (coefficient + log2(n_j)) * p_j for n_j, p_j in hyp_geo_pmf if n_j != 0]) + # The values of p_j should sum to 1, but given approximate calculations for # log2(x) and exp2(x) with large x, the full pmf might not sum to exactly 1. # We correct for this by dividing by the sum of the probabilities. 
From 2ed9a22ecd9abe694749a1c134ffeca5086a52fd Mon Sep 17 00:00:00 2001 From: Malinda Date: Tue, 28 Jun 2022 19:27:52 -0700 Subject: [PATCH 2/3] add import --- tensorflow_transform/info_theory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_transform/info_theory.py b/tensorflow_transform/info_theory.py index 4372276e..2c35aa1f 100644 --- a/tensorflow_transform/info_theory.py +++ b/tensorflow_transform/info_theory.py @@ -14,7 +14,7 @@ """Utilities for information-theoretic preprocessing algorithms.""" import math - +import numpy as np # math.log2 was added in Python 3.3 log2 = getattr(math, 'log2', lambda x: math.log(x, 2)) From d97c906b9f9ea637295cbce048a88ea13515a084 Mon Sep 17 00:00:00 2001 From: Malinda Date: Tue, 28 Jun 2022 19:29:26 -0700 Subject: [PATCH 3/3] Add space for consistency --- tensorflow_transform/info_theory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_transform/info_theory.py b/tensorflow_transform/info_theory.py index 2c35aa1f..e0497c5b 100644 --- a/tensorflow_transform/info_theory.py +++ b/tensorflow_transform/info_theory.py @@ -53,7 +53,7 @@ def calculate_partial_expected_mutual_information(n, x_i, y_j): return 0 coefficient = (-log2(x_i) - log2(y_j) + log2(n)) hyp_geo_pmf = _hypergeometric_pmf(n, x_i, y_j) - sum_probability = np.sum([p_j for n_j,p_j in hyp_geo_pmf]) + sum_probability = np.sum([p_j for n_j, p_j in hyp_geo_pmf]) partial_result = np.sum([n_j * (coefficient + log2(n_j)) * p_j for n_j, p_j in hyp_geo_pmf if n_j != 0]) # The values of p_j should sum to 1, but given approximate calculations for