Skip to content

Commit d0658c6

Browse files
authored
compute n_unique for all columns in tablereport (#1154)
* compute n_unique for all columns in tablereport * changelog
1 parent 0f73984 commit d0658c6

File tree

4 files changed

+18
-6
lines changed

4 files changed

+18
-6
lines changed

CHANGES.rst

+3
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ Minor changes
7979
* Added a `DropColumnIfNull` transformer that drops columns that contain only null
8080
values. :pr:`1115` by :user:`Riccardo Cappuzzo <riccardocappuzzo>`
8181

82+
* The :class:`TableReport` now also reports the number of unique values for
83+
numeric columns. :pr:`1154` by :user:`Jérôme Dockès <jeromedockes>`.
84+
8285
Bug fixes
8386
---------
8487

skrub/_reporting/_summarize.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from .. import _dataframe as sbd
55
from . import _plotting, _sample_table, _utils
66

7-
_HIGH_CARDINALITY_THRESHOLD = 10
87
_SUBSAMPLE_SIZE = 3000
98
_N_TOP_ASSOCIATIONS = 20
109

@@ -130,6 +129,15 @@ def _summarize_column(
130129
if summary["null_count"] == dataframe_summary["n_rows"]:
131130
summary["plot_names"] = []
132131
return summary
132+
try:
133+
summary["n_unique"] = sbd.n_unique(column)
134+
summary["unique_proportion"] = summary["n_unique"] / max(
135+
1, dataframe_summary["n_rows"]
136+
)
137+
except Exception:
138+
# for some dtypes n_unique can fail, e.g. with a TypeError for
139+
# non-hashable types in pandas.
140+
pass
133141
_add_value_counts(
134142
summary, column, dataframe_summary=dataframe_summary, with_plots=with_plots
135143
)
@@ -160,15 +168,17 @@ def _add_nulls_summary(summary, column, dataframe_summary):
160168

161169
def _add_value_counts(summary, column, *, dataframe_summary, with_plots):
162170
if sbd.is_numeric(column) or sbd.is_any_date(column):
163-
summary["high_cardinality"] = True
164171
return
165172
n_unique, value_counts = _utils.top_k_value_counts(column, k=10)
166173
# if the column contains all nulls, _add_value_counts does not get called
167174
assert n_unique > 0
168175

176+
# value_counts may be able to find the number of unique values in cases
177+
# where n_unique() fails (e.g. non-hashable column content in pandas), so we
178+
# update n_unique and unique_proportion
169179
summary["n_unique"] = n_unique
170180
summary["unique_proportion"] = n_unique / max(1, dataframe_summary["n_rows"])
171-
summary["high_cardinality"] = n_unique >= _HIGH_CARDINALITY_THRESHOLD
181+
172182
summary["value_counts"] = value_counts
173183
summary["most_frequent_values"] = [v for v, _ in value_counts]
174184

skrub/_reporting/js_tests/cypress/e2e/summary-statistics.cy.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@ describe('test sorting the summary stats columns', () => {
1818
cy.get('@table').find('tbody tr').first().should('have.attr',
1919
'data-column-name', 'gender');
2020
cy.get('@table').find('tbody tr').last().should('have.attr',
21-
'data-column-name', 'year_first_hired');
21+
'data-column-name', 'date_first_hired');
2222
cy.get('@unique').parent().find('button').first().next()
2323
.click();
2424
cy.get('@table').find('tbody tr').first().should('have.attr',
2525
'data-column-name', 'date_first_hired');
2626
cy.get('@table').find('tbody tr').last().should('have.attr',
27-
'data-column-name', 'year_first_hired');
27+
'data-column-name', 'assignment_category');
2828
});
2929
});

skrub/_reporting/tests/test_summarize.py

-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ def test_summarize(monkeypatch, df_module, air_quality, order_by, with_plots):
5050
assert c == {
5151
"idx": 0,
5252
"dtype": "string",
53-
"high_cardinality": False,
5453
"n_unique": 2,
5554
"name": "city",
5655
"null_count": 0,

0 commit comments

Comments
 (0)