@property
def type(self) -> "Type":
    """
    Returns the data type of this column.

    The return annotation is quoted so that it is not evaluated while the
    class body is being executed.

    NOTE(review): no import of ``Type`` and no assignment of ``self._type``
    is visible in this hunk -- confirm that this module brings ``Type`` into
    scope and that the constructor sets ``_type``.

    Returns:
        Type: the value stored on the column as ``_type``
    """
    return self._type
- - | They might be modified concurrently by other users of the database system. You might + - | They might be modified concurrently by other users of the database system. You might | need to use :meth:`~table.Table.refresh()` to sync the updates if the data becomes stale. - In the database world, a `Table` is similar to a **materialized view** in a database system in that - - They both result from a possibly complex query. - They both hold data, as oppose to views. - | The data can become stale due to concurrent modification. And the :meth:`~table.Table.refresh()` method @@ -56,7 +54,12 @@ from greenplumpython.group import TableGroupingSets from greenplumpython.order import OrderedTable from greenplumpython.row import Row +import json +try: + import pandas # type: ignore +except ImportError: # pragma: NO COVER + pandas = None class Table: """ @@ -143,38 +146,24 @@ def __getitem__(self, _): """ Returns - a :class:`~expr.Column` of the current Table if key is string - .. code-block:: python - id_col = tab["id"] - - a new :class:`Table` from the current Table per the type of key: - - if key is a list, then SELECT a subset of columns, a.k.a. targets; - .. code-block:: python - id_table = tab[["id"]] - - if key is an :class:`~expr.Expr`, then SELECT a subset of rows per the value of the Expr; - .. code-block:: python - id_cond_table = tab[lambda t: t["id"] == 0] - - if key is a slice, then SELECT a portion of consecutive rows - .. 
code-block:: python - slice_table = tab[2:5] - """ return self._getitem(_) def __repr__(self): """ :meta private: - Return a string representation for a table """ repr_string: str = "" @@ -204,7 +193,7 @@ def __repr__(self): if isinstance(c, list): repr_string += ("| {:{}} |").format("{}".format(c), width[idx]) # type: ignore else: - repr_string += ("| {:{}} |").format(c, width[idx]) + repr_string += ("| {:{}} |").format(c if c is not None else "", width[idx]) repr_string += "\n" return repr_string @@ -219,9 +208,13 @@ def _repr_html_(self): ) repr_html_str += "\t\n" for row in self: - repr_html_str += "\t\n" content = [row[c] for c in row] - repr_html_str += ("\t\t{:}\n" * len(list(row))).format(*content) + repr_html_str += "\t\n" + for c in content: + if isinstance(c, list): + repr_html_str += ("\t\t{:}\n").format("{}".format(c)) # type: ignore + else: + repr_html_str += ("\t\t{:}\n").format(c if c is not None else "") repr_html_str += "\t\n" repr_html_str += "" return repr_html_str @@ -230,10 +223,8 @@ def _repr_html_(self): def where(self, predicate: Callable[["Table"], "Expr"]) -> "Table": """ Returns the :class:`Table` filtered by Expression. - Args: predicate: :class:`~expr.Expr` : where condition statement - Returns: Table : Table filtered according **expr** passed in argument """ @@ -260,16 +251,12 @@ def apply( Table: resulted Table Example: .. code-block:: python - rows = [(i,) for i in range(-10, 0)] series = gp.values(rows, db=db, column_names=["id"]) abs = gp.function("abs", db=db) result = series.apply(lambda t: abs(t["id"])) - If we want to give constant as attribute, it is also easy to use. Suppose *label* function takes a str and a int: - .. code-block:: python - result = series.apply(lambda t: label("label", t["id"])) """ # We need to support calling functions with constant args or even no @@ -285,23 +272,18 @@ def assign(self, **new_columns: Callable[["Table"], Any]) -> "Table": """ Assigns new columns to the current :class:`Table`. 
Existing columns cannot be reassigned. - Args: new_columns: a :class:`dict` whose keys are column names and values are :class:`Callable`s returning column data when applied to the current :class:`Table`. - Returns: Table: New table including the new assigned columns - Example: .. code-block:: python - rows = [(i,) for i in range(-10, 0)] series = gp.to_table(rows, db=db, column_names=["id"]) abs = gp.function("abs") results = series.assign(abs=lambda nums: abs(nums["id"])) - """ if len(new_columns) == 0: @@ -333,19 +315,15 @@ def order_by( ) -> OrderedTable: """ Returns :class:`Table` order by the given arguments. - Args: column_name: name of column to order the table by ascending: Optional[Bool]: Define ascending of order, True = ASC / False = DESC nulls_first: Optional[bool]: Define if nulls will be ordered first or last, True = First / False = Last operator: Optional[str]: Define order by using operator. **Can't combine with ascending.** - Returns: OrderedTable : Table ordered by the given arguments - Example: .. code-block:: Python - t.order_by("id") """ # State transition diagram: @@ -373,7 +351,6 @@ def join( ) -> "Table": """ Joins the current :class:`Table` with another :class:`Table`. - Args: other: :class:`Table` to join with how: How the two tables are joined. The value can be one of @@ -383,7 +360,6 @@ def join( - `"FULL"`: full outer join, or - `"CROSS"`: cross join, i.e. the Cartesian product The default value `""` is equivalent to "INNER". - cond: :class:`Callable` lambda function as the join condition using: a list of column names that exist in both tables to join on. `cond` and `using` cannot be used together. @@ -394,7 +370,6 @@ def join( can be used as a key to indicate all columns. other_columns: Same as `self_columns`, but for the `other` table. 
- Note: When using `"*"` as key in `self_columns` or `other_columns`, please ensure that there will not be more than one column with the @@ -438,28 +413,24 @@ def bind(t: Table, columns: Union[Dict[str, Optional[str]], Set[str]]) -> List[s inner_join = partialmethod(join, how="INNER") """ Inner joins the current :class:`Table` with another :class:`Table`. - Equivalent to calling :meth:`Table.join` with `how="INNER"`. """ left_join = partialmethod(join, how="LEFT") """ Left-outer joins the current :class:`Table` with another :class:`Table`. - Equivalent to calling :meth:`Table.join` with `how="LEFT"`. """ right_join = partialmethod(join, how="RIGHT") """ Right-outer joins the current :class:`Table` with another :class:`Table`. - Equivalent to calling :meth:`Table.join` with `how="RIGHT"`. """ full_join = partialmethod(join, how="FULL") """ Full-outer joins the current :class:`Table` with another :class:`Table`. - Equivalent to calling :meth:`Table.join` with argutment `how="FULL"`. """ @@ -467,7 +438,6 @@ def bind(t: Table, columns: Union[Dict[str, Optional[str]], Set[str]]) -> List[s """ Cross joins the current :class:`Table` with another :class:`Table`, i.e. the Cartesian product. - Equivalent to calling :meth:`Table.join` with `how="CROSS"`. """ @@ -475,7 +445,6 @@ def bind(t: Table, columns: Union[Dict[str, Optional[str]], Set[str]]) -> List[s def name(self) -> str: """ Returns name of :class:`Table` - Returns: str: Table name """ @@ -485,7 +454,6 @@ def name(self) -> str: def db(self) -> Optional[db.Database]: """ Returns :class:`~db.Database` associated with :class:`Table` - Returns: Optional[Database]: database associated with table """ @@ -495,7 +463,6 @@ def db(self) -> Optional[db.Database]: def columns(self) -> Optional[Iterable[Column]]: """ Returns its :class:`~expr.Column` name of :class:`Table`, has results only for selected table and joined table with targets. 
- Returns: Optional[Iterable[str]]: None or List of its columns names of table """ @@ -585,7 +552,6 @@ def validate_data(json_pairs: List[tuple[str, Any]]): def refresh(self) -> "Table": """ Refresh self._contents - Returns: self """ @@ -602,10 +568,8 @@ def _fetch(self, is_all: bool = True) -> Iterable[Tuple[Any]]: Fetch rows of this :class:`Table`. - if is_all is True, fetch all rows at once - otherwise, open a CURSOR and FETCH one row at a time - Args: is_all: bool: Define if fetch all rows at once - Returns: Iterable[Tuple[Any]]: results of query received from database """ @@ -622,12 +586,10 @@ def _fetch(self, is_all: bool = True) -> Iterable[Tuple[Any]]: def save_as(self, table_name: str, temp: bool = False, column_names: List[str] = []) -> "Table": """ Save the table to database as a real Greenplum Table - Args: table_name : str temp : bool : if table is temporary column_names : List : list of column names - Returns: Table : table saved in database """ @@ -668,10 +630,8 @@ def save_as(self, table_name: str, temp: bool = False, column_names: List[str] = def explain(self, format: str = "TEXT") -> Iterable[Tuple[str]]: """ Explained the table's query - Args: format: str: format of explain - Returns: Iterable[Tuple[str]]: EXPLAIN query answer """ @@ -684,10 +644,8 @@ def group_by(self, *column_names: str) -> TableGroupingSets: """ Group the current :class:`~table.Table` by columns specified by `column_names`. - Args: column_names: one or more column names of the table - Returns: TableGroupingSets: a list of grouping sets. Each group is identified by a different set of values of the columns in the arguments. @@ -701,10 +659,8 @@ def group_by(self, *column_names: str) -> TableGroupingSets: def distinct_on(self, *column_names: str) -> "Table": """ Deduplicate the current :class:`Table` with respect to the given columns. - Args: column_names: name of column of the current :class:`Table`. 
# pandas-style API added to greenplumpython/table.py by this patch.
# ``describe`` .. ``shape`` are methods of :class:`Table`; ``from_dataframe``
# is a module-level function; ``_checked_row_count`` is a private helper.


def _checked_row_count(n):
    """Return *n* as a non-negative ``int`` or raise :class:`ValueError`."""
    count = int(n)
    # ``count != n`` rejects floats with a fractional part and numeric strings,
    # so only a genuine integer is ever interpolated into SQL text.
    if count != n or count < 0:
        raise ValueError(f"n must be a non-negative integer, got {n!r}")
    return count


def describe(self, include=None):
    """
    Summarises columns of the current :class:`Table`.

    For every selected column the aggregates ``count``, ``sum``, ``avg``,
    ``min`` and ``max`` are evaluated, one aggregate table per
    (function, column) pair.

    Args:
        include: a single column name, an iterable of column names, or
            ``None`` to use the column names of the first row.

    Returns:
        Table: one row per aggregate. The first column, ``__summary__``,
        holds the aggregate name; the remaining columns hold the results.
        A result is ``None`` when evaluating the aggregate raised or when it
        came back as a string.

    Raises:
        ValueError: if `include` is ``None`` and the table has no rows, since
            the column names are taken from the first row.
    """
    if include is None:
        first_row = next(iter(self), None)
        if first_row is None:
            raise ValueError("cannot infer the columns to describe from an empty table")
        column_names = list(first_row.column_names())
    elif isinstance(include, str):
        column_names = [include]
    else:
        column_names = list(include)

    import greenplumpython.builtin.function as F

    summary_functions = [
        ("count", F.count),
        ("sum", F.sum),
        ("avg", F.avg),
        ("min", F.min),
        ("max", F.max),
    ]

    def summarize(function_obj, column_name):
        # Best effort: an aggregate that cannot be evaluated for this column
        # is reported as None instead of aborting the whole summary.
        try:
            value = list(
                self.group_by().assign(function=lambda _: function_obj(self[column_name]))
            )[0]["function"]
        except Exception:
            return None
        # String results are deliberately dropped from the summary.
        return None if isinstance(value, str) else value

    summary_rows = [
        (function_name, *(summarize(function_obj, name) for name in column_names))
        for function_name, function_obj in summary_functions
    ]

    rows_string = ",".join(
        "(" + ",".join(serialize(datum) for datum in row) + ")" for row in summary_rows
    )
    columns_string = f"({','.join(['__summary__'] + column_names)})"
    return Table(
        f"SELECT * FROM (VALUES {rows_string}) AS vals {columns_string}", parents=[self]
    )


def to_dataframe(self) -> "pandas.DataFrame":
    """
    Fetches the rows of the current :class:`Table` into a pandas DataFrame.

    Every fetched row is expected to carry its data as a JSON document under
    the key ``"to_json"``; each document becomes one DataFrame row.

    Returns:
        pandas.DataFrame: the fetched data; empty if no rows were fetched.

    Raises:
        ImportError: if pandas could not be imported.
    """
    if pandas is None:
        raise ImportError("pandas is required to convert a Table to a DataFrame")
    # Materialise while iterating: the fetched result is only known to be iterable.
    rows = [json.loads(dict(row)["to_json"]) for row in self._fetch()]
    return pandas.DataFrame(rows)


def head(self, n: int = 5) -> "Table":
    """
    Returns a :class:`Table` limited to `n` rows of the current one.

    The query has no ``ORDER BY``, so which rows are returned is decided by
    the database.

    Args:
        n: non-negative number of rows to keep

    Returns:
        Table: table selecting at most `n` rows

    Raises:
        ValueError: if `n` is not a non-negative integer
    """
    count = _checked_row_count(n)
    return Table(
        f"SELECT * FROM {self.name} LIMIT {count}",
        parents=[self],
    )


def tail(self, n: int = 5) -> "Table":
    """
    Returns a :class:`Table` that skips all but the last `n` rows.

    The offset is computed inside the database and clamped at zero, so asking
    for more rows than exist returns the whole table and no rows are fetched
    to the client just to count them. The query has no ``ORDER BY``, so
    "last" follows whatever order the database produces.

    Args:
        n: non-negative number of trailing rows to keep

    Returns:
        Table: table selecting at most `n` rows

    Raises:
        ValueError: if `n` is not a non-negative integer
    """
    count = _checked_row_count(n)
    return Table(
        f"SELECT * FROM {self.name} "
        f"OFFSET GREATEST((SELECT COUNT(*) FROM {self.name}) - {count}, 0)",
        parents=[self],
    )


def size(self) -> int:
    """
    Returns the number of rows obtained by iterating the :class:`Table`.

    Returns:
        int: row count
    """
    return len(list(self))


def shape(self) -> tuple[int, int]:
    """
    Returns the dimensions of the :class:`Table`.

    The table is iterated once; the column count is taken from the first row.

    Returns:
        tuple[int, int]: ``(rows, columns)``, ``(0, 0)`` for a table without rows
    """
    rows = list(self)
    n_columns = len(rows[0].column_names()) if rows else 0
    return (len(rows), n_columns)


def from_dataframe(df: "pandas.DataFrame", db: "db.Database") -> "Table":
    """
    Returns a :class:`Table` holding the values of a pandas DataFrame.

    The index is reset first, so the DataFrame's index is sent along as an
    additional leading column. The annotations are quoted so that defining
    this function does not require pandas to be importable.

    Args:
        df: DataFrame providing the rows and column names
        db: :class:`~db.Database`: database which will be associated with table

    Returns:
        Table: table generated with the DataFrame's values
    """
    frame = df.reset_index().to_dict("split")
    rows = [tuple(row) for row in frame["data"]]
    return to_table(rows=rows, db=db, column_names=frame["columns"])
def test_from_pandas(db: gp.Database):
    """A DataFrame converted with ``from_dataframe`` keeps its rows and values."""
    frame = pd.DataFrame(
        {
            "a": [1, 2, 3, 1],
            "b": [2, 2, 2, 2],
            "c": ["A", "B", "C", "B"],
        }
    )
    t = gp.from_dataframe(frame, db)
    assert len(list(t)) == 4
    # Each check re-reads the first row, one column at a time.
    for column, expected in (("a", 1), ("b", 2), ("c", "A")):
        assert next(iter(t))[column] == expected
@pytest.fixture
def table(db: gp.Database):
    """Three-row table with the columns categorical / numeric / text."""
    column_names = ["categorical", "numeric", "text"]
    rows1 = [
        ("a", 0, "b"),
        ("c", 60, "d"),
        ("e", 13, "f"),
    ]
    return gp.to_table(rows1, db=db, column_names=column_names)


def test_shape(db: gp.Database, table: gp.Table):
    """The full fixture table is reported as 3 rows by 3 columns."""
    expected = (3, 3)
    assert table.shape() == expected


def test_shape_slice(db: gp.Database, table: gp.Table):
    """Slicing down to one row keeps all three columns."""
    first_row_only = table[:1]
    assert first_row_only.shape() == (1, 3)
@pytest.fixture
def table(db: gp.Database):
    """Three-row table with the columns categorical / numeric / text."""
    column_names = ["categorical", "numeric", "text"]
    rows1 = [
        ("a", 0, "b"),
        ("c", 60, "d"),
        ("e", 13, "f"),
    ]
    return gp.to_table(rows1, db=db, column_names=column_names)


def test_tail(db: gp.Database, table: gp.Table):
    """``tail(1)`` yields exactly the last fixture row."""
    t = table.tail(1)
    assert len(list(t)) == 1
    # Each check re-reads the first (and only) row, one column at a time.
    for column, expected in (("categorical", "e"), ("numeric", 13), ("text", "f")):
        assert next(iter(t))[column] == expected


def test_tail_check_offset(db: gp.Database, table: gp.Table):
    """Asking for more rows than exist returns the whole table."""
    t = table.tail(4)
    row_count = len(list(t))
    assert row_count == 3