|
4 | 4 | from .. import _dataframe as sbd
|
5 | 5 | from . import _plotting, _sample_table, _utils
|
6 | 6 |
|
7 |
| -_HIGH_CARDINALITY_THRESHOLD = 10 |
8 | 7 | _SUBSAMPLE_SIZE = 3000
|
9 | 8 | _N_TOP_ASSOCIATIONS = 20
|
10 | 9 |
|
@@ -130,6 +129,15 @@ def _summarize_column(
|
130 | 129 | if summary["null_count"] == dataframe_summary["n_rows"]:
|
131 | 130 | summary["plot_names"] = []
|
132 | 131 | return summary
|
| 132 | + try: |
| 133 | + summary["n_unique"] = sbd.n_unique(column) |
| 134 | + summary["unique_proportion"] = summary["n_unique"] / max( |
| 135 | + 1, dataframe_summary["n_rows"] |
| 136 | + ) |
| 137 | + except Exception: |
| 138 | + # for some dtypes n_unique can fail, e.g. with a TypeError for |
| 139 | + # non-hashable types in pandas. |
| 140 | + pass |
133 | 141 | _add_value_counts(
|
134 | 142 | summary, column, dataframe_summary=dataframe_summary, with_plots=with_plots
|
135 | 143 | )
|
@@ -160,15 +168,17 @@ def _add_nulls_summary(summary, column, dataframe_summary):
|
160 | 168 |
|
161 | 169 | def _add_value_counts(summary, column, *, dataframe_summary, with_plots):
|
162 | 170 | if sbd.is_numeric(column) or sbd.is_any_date(column):
|
163 |
| - summary["high_cardinality"] = True |
164 | 171 | return
|
165 | 172 | n_unique, value_counts = _utils.top_k_value_counts(column, k=10)
|
166 | 173 | # if the column contains all nulls, _add_value_counts does not get called
|
167 | 174 | assert n_unique > 0
|
168 | 175 |
|
| 176 | + # value_counts may be able to find the number of unique values in cases |
| 177 | + # where n_unique() fails (e.g. non-hashable column content in pandas), so we |
| 178 | + # update n_unique and unique_proportion |
169 | 179 | summary["n_unique"] = n_unique
|
170 | 180 | summary["unique_proportion"] = n_unique / max(1, dataframe_summary["n_rows"])
|
171 |
| - summary["high_cardinality"] = n_unique >= _HIGH_CARDINALITY_THRESHOLD |
| 181 | + |
172 | 182 | summary["value_counts"] = value_counts
|
173 | 183 | summary["most_frequent_values"] = [v for v, _ in value_counts]
|
174 | 184 |
|
|
0 commit comments