diff --git a/datar/base/__init__.py b/datar/base/__init__.py index 7f075ddf..359d073d 100644 --- a/datar/base/__init__.py +++ b/datar/base/__init__.py @@ -67,6 +67,7 @@ rank, outer, ) +from .glimpse import glimpse from .logical import ( FALSE, TRUE, diff --git a/datar/base/glimpse.py b/datar/base/glimpse.py new file mode 100644 index 00000000..6d37e74c --- /dev/null +++ b/datar/base/glimpse.py @@ -0,0 +1,183 @@ +"""Provides glimpse""" +import textwrap +import html +from functools import singledispatch +from shutil import get_terminal_size + +from pipda import register_verb + +from ..core.tibble import TibbleGrouped, TibbleRowwise +from ..core.backends.pandas import DataFrame +from ..core.backends.pandas.core.groupby import SeriesGroupBy + + +@singledispatch +def formatter(x): + """Formatter passed to glimpse to format a single element of a dataframe.""" + return str(x) + + +@formatter.register(DataFrame) +def _dataframe_formatter(x): + """Format a dataframe element.""" + return f"" + + +@formatter.register(str) +def _str_formatter(x): + """Format a string""" + return repr(x) + + +def _is_notebook() -> bool: # pragma: no cover + """Check if the current environment is notebook""" + try: + from IPython import get_ipython + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + else: + return False # Other type (?) 
+ except (ImportError, NameError): + return False # Probably standard Python interpreter + + +class Glimpse: + """Glimpse class + + Args: + x: The data to be glimpsed + width: The width of the output + formatter: The formatter to use to format data elements + """ + def __init__(self, x, width, formatter) -> None: + self.x = x + self.width = width or get_terminal_size((100, 20)).columns + self.formatter = formatter + self.colwidths = (0, 0) + + def __repr__(self) -> str: + return f"" + + def __str__(self) -> str: + self._calculate_output_widths() + return "\n".join( + ( + "\n".join(self._general()), + "\n".join(self._variables()), + ) + ) + + def _repr_html_(self): + out = [] + for gen in self._general(): + out.append(f"
{gen}
") + out.append("") + out.extend(self._variables(fmt="html")) + out.append("
") + return "\n".join(out) + + def _general(self): + if isinstance(self.x, TibbleGrouped): + groups = ", ".join((str(name) for name in self.x.group_vars)) + group_title = ( + "Rowwise" if isinstance(self.x, TibbleRowwise) else "Groups" + ) + return ( + f"Rows: {self.x.shape[0]}", + f"Columns: {self.x.shape[1]}", + f"{group_title}: {groups} " + f"[{self.x._datar['grouped'].grouper.ngroups}]", + ) + + return ( + f"Rows: {self.x.shape[0]}", + f"Columns: {self.x.shape[1]}", + ) + + def _calculate_output_widths(self): + colname_width = max(len(str(colname)) for colname in self.x.columns) + dtype_width = max(len(str(dtype)) for dtype in self.x.dtypes) + 2 + self.colwidths = (colname_width, dtype_width) + + def _variables(self, fmt="str"): + for col in self.x: + yield self._format_variable( + col, + self.x[col].dtype, + self.x[col].obj.values + if isinstance(self.x[col], SeriesGroupBy) + else self.x[col].values, + fmt=fmt, + ) + + def _format_variable(self, col, dtype, data, fmt="str"): + if fmt == "str": + return self._format_variable_str(col, dtype, data) + + return self._format_variable_html(col, dtype, data) + + def _format_data(self, data): + """Format the data for the glimpse view + + Formatting 10 elements in a batch in case of a long dataframe. + Since we don't need to format all the data, but only the first few + till the line (terminal width or provided width) overflows. 
+ """ + out = "" + placeholder = "…" + i = 0 + chunk_size = 10 + while not out.endswith(placeholder) and i < data.size: + if out: + out += ", " + out += ", ".join( + self.formatter(d) for d in data[i:i + chunk_size] + ) + i += chunk_size + out = textwrap.shorten( + out, + break_long_words=True, + break_on_hyphens=True, + width=self.width - 4 - sum(self.colwidths), + placeholder=placeholder, + ) + return out + + def _format_variable_str(self, col, dtype, data): + name_col = col.ljust(self.colwidths[0]) + dtype_col = f'<{dtype}>'.ljust(self.colwidths[1]) + data_col = self._format_data(data) + return f". {name_col} {dtype_col} {data_col}" + + def _format_variable_html(self, col, dtype, data): + name_col = f". {col}" + dtype_col = f"<{dtype}>" + data_col = html.escape(self._format_data(data)) + return ( + f"{name_col}" + f"{dtype_col}" + f"{data_col}" + ) + + def show(self): + """Show the glimpse view""" + if _is_notebook(): # pragma: no cover + from IPython.display import display, HTML + display(HTML(self._repr_html_())) + else: + print(self.__str__()) + + +@register_verb(DataFrame) +def glimpse(x, width=None, formatter=formatter): + """Get a glimpse of your data + + Args: + x: An object to glimpse at. + width: Width of output, defaults to the width of the console. + formatter: A single-dispatch function to format a single element. 
+ """ + Glimpse(x, width=width, formatter=formatter).show() diff --git a/docs/notebooks/nest.ipynb b/docs/notebooks/nest.ipynb index 4348be36..e6e778fb 100644 --- a/docs/notebooks/nest.ipynb +++ b/docs/notebooks/nest.ipynb @@ -377,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.142333Z", @@ -831,7 +831,7 @@ "49 5.0 3.3 1.4 0.2" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -843,7 +843,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.186712Z", @@ -911,7 +911,7 @@ "2 virginica " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -923,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.207533Z", @@ -996,7 +996,7 @@ "2 virginica " ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1007,7 +1007,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.265853Z", @@ -1080,7 +1080,7 @@ "2 virginica " ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1091,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.333173Z", @@ -1257,7 +1257,7 @@ "[TibbleGrouped: fish (n=19)]" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1268,7 +1268,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": 
"2021-07-16T22:28:27.401063Z", @@ -1343,7 +1343,7 @@ "[TibbleGrouped: cyl (n=3)]" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1363,7 +1363,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.451681Z", @@ -1443,7 +1443,7 @@ "3 3 3 2" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1462,7 +1462,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.535876Z", @@ -1549,7 +1549,7 @@ "4 3 3.0 2.0" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1560,7 +1560,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.550880Z", @@ -1633,7 +1633,7 @@ "2 c 3 22" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1649,7 +1649,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2021-07-16T22:28:27.614822Z", @@ -1736,7 +1736,7 @@ "4 c 3 22" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/reference-maps/base.md b/docs/reference-maps/base.md index 6ebf5f8b..56e4f5d9 100644 --- a/docs/reference-maps/base.md +++ b/docs/reference-maps/base.md @@ -285,6 +285,7 @@ See [here](../stats) for APIs ported from `r-stats` and [here](../utils) for API |API|Description|Notebook example| |---|---|---:| +|[`glimpse()`][166]|Get a glimpse of your data|| |[`cut()`][113]|Convert Numeric to Factor|[:material-notebook:][163]| |[`diff()`][164]|Returns suitably lagged and iterated 
differences.|[:material-notebook:][163]| |[`identity()`][114]|Identity Function|[:material-notebook:][163]| @@ -464,3 +465,4 @@ See [here](../stats) for APIs ported from `r-stats` and [here](../utils) for API [163]: ../../notebooks/base-funs [164]: ../../api/datar.base.funs/#datar.base.funs.diff [165]: ../../api/datar.base.funs/#datar.base.funs.outer +[166]: ../../api/datar.base.glimpse/#datar.base.glimpse.glimpse diff --git a/tests/base/test_glimpse.py b/tests/base/test_glimpse.py new file mode 100644 index 00000000..14339a0d --- /dev/null +++ b/tests/base/test_glimpse.py @@ -0,0 +1,43 @@ +import pytest + +from datar.base.glimpse import Glimpse, formatter +from datar.all import ( + f, + group_by, + glimpse, + tibble, + nest, +) + + +def test_glimpse_str_df(capsys): + df = tibble(x=f[:10], y=[str(i) for i in range(10)]) + glimpse(df) + out = capsys.readouterr().out + assert "Rows: 10" in out + assert "Columns: 2" in out + assert "0, 1, 2" in out + + +def test_glimpse_str_nest_df(capsys): + df = tibble(x=f[:10], y=f[10:20]) >> nest(data=~f.x) + glimpse(df) + out = capsys.readouterr().out + assert "Rows: 10" in out + assert "Columns: 2" in out + assert ", " in out + + +def test_glimpse_str_gf(capsys): + df = tibble(x=f[:10], y=[str(i) for i in range(10)]) >> group_by(f.y) + glimpse(df) + assert "Groups: y [10]" in capsys.readouterr().out + + +def test_glimpse_html_df(): + df = tibble(x=f[:20], y=[str(i) for i in range(20)]) + g = Glimpse(df, 100, formatter) + assert repr(g).startswith("" in out