diff --git a/.travis.yml b/.travis.yml
index 4e4beb2f..1db80743 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,5 +10,10 @@ install:
   - pip install -r requirements.txt
   - pip install .
 # command to run tests
+services:
+  - postgresql
+env:
+  global:
+    - PGPORT=5432
 script:
   - make test-travis
diff --git a/Makefile b/Makefile
index 60cc9574..9aaf71da 100644
--- a/Makefile
+++ b/Makefile
@@ -4,11 +4,11 @@ NOTEBOOK_TESTS=$(addprefix examples/, examples-dplyr-funcs.ipynb case-iris-selec
 
 test:
 	py.test --nbval $(NOTEBOOK_TESTS)
-	py.test
+	pytest --dbs="sqlite,postgresql" siuba/tests
 
 test-travis:
 	py.test --nbval $(filter-out %postgres.ipynb, $(NOTEBOOK_TESTS))
-	py.test
+	pytest --dbs="sqlite,postgresql" siuba/tests
 
 examples/%.ipynb:
 	jupyter nbconvert --to notebook --inplace --execute $@
diff --git a/docker-compose.yml b/docker-compose.yml
index e7132b78..fe50f1f4 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,6 +6,6 @@ services:
     image: postgres
     restart: always
     environment:
-      POSTGRES_PASSWORD: example
+      POSTGRES_PASSWORD: ""
     ports:
       - 5433:5432
diff --git a/examples/examples-postgres.ipynb b/examples/examples-postgres.ipynb
index e7ea193e..bfb3e097 100644
--- a/examples/examples-postgres.ipynb
+++ b/examples/examples-postgres.ipynb
@@ -14,18 +14,10 @@
    "execution_count": 1,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/machow/.virtualenvs/siuba/lib/python3.6/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use \"pip install psycopg2-binary\" instead. For details see: .\n",
-      "  \"\"\")\n"
-     ]
-    },
     {
      "data": {
       "text/plain": [
-       ""
+       ""
      ]
     },
     "execution_count": 1,
@@ -38,7 +30,10 @@
    "from sqlalchemy import sql\n",
    "from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey\n",
    "from sqlalchemy import create_engine\n",
-    "engine = create_engine('postgresql://postgres:example@localhost:5433/postgres', echo=False)\n",
+    "import os\n",
+    "\n",
+    "port = os.environ.get(\"PGPORT\", \"5433\")\n",
+    "engine = create_engine('postgresql://postgres:@localhost:%s/postgres'%port, echo=False)\n",
    "\n",
    "\n",
    "metadata = MetaData()\n",
@@ -1009,7 +1004,7 @@
     "data": {
      "text/plain": [
       "█─'__call__'\n",
-      "├─\n",
+      "├─\n",
       "├─_\n",
       "└─█─''\n",
       "  └─█─'__call__'\n",
diff --git a/requirements.txt b/requirements.txt
index 79405b9c..cb3bd17b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,8 @@ pytz==2018.9
 six==1.12.0
 SQLAlchemy==1.2.17
 nbval==0.9.1
+# tests
+psycopg2==2.8.2
 # only used for iris dataset
 scikit-learn==0.20.2
 # used for docs
diff --git a/siuba/dply/verbs.py b/siuba/dply/verbs.py
index fe9706c9..60e6787a 100644
--- a/siuba/dply/verbs.py
+++ b/siuba/dply/verbs.py
@@ -621,6 +621,7 @@ def arrange(__data, *args):
 
 @singledispatch2(DataFrame)
 def distinct(__data, *args, _keep_all = False, **kwargs):
+    # using dict as ordered set
     cols = {simple_varname(x): True for x in args}
     if None in cols:
         raise Exception("positional arguments must be simple column, "
@@ -629,10 +630,14 @@ def distinct(__data, *args, _keep_all = False, **kwargs):
 
     # mutate kwargs
    cols.update(kwargs)
-    tmp_data = mutate(__data, **kwargs).drop_duplicates(cols)
+
+    # special case: use all variables when none are specified
+    if not len(cols): cols = __data.columns
+
+    tmp_data = mutate(__data, **kwargs).drop_duplicates(list(cols)).reset_index(drop = True)
 
     if not _keep_all:
-        return tmp_data[cols]
+        return tmp_data[list(cols)]
 
     return tmp_data
 
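
Side note: a minimal usage sketch of the revised pandas distinct() above, not part of the patch itself; it mirrors the new tests in siuba/tests/test_dply_verbs.py and test_sql_verbs_distinct.py and only assumes the public siuba imports used there.

# sketch only: illustrates the distinct() change above, not part of the patch
import pandas as pd
from siuba import _, distinct

df = pd.DataFrame({'x': [1, 1, 2], 'y': [1, 1, 2]})

# no columns given: fall back to all columns, like DataFrame.drop_duplicates()
distinct(df)        # same as df.drop_duplicates().reset_index(drop = True)

# one column given: de-duplicate on it and return only that column
distinct(df, _.y)   # same as df.drop_duplicates(['y'])[['y']].reset_index(drop = True)
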
diff --git a/siuba/sql/verbs.py b/siuba/sql/verbs.py
index 1a3fa3ff..722e380c 100644
--- a/siuba/sql/verbs.py
+++ b/siuba/sql/verbs.py
@@ -93,9 +93,6 @@ def lift_inner_cols(tbl):
 
     return sql.base.ImmutableColumnCollection(data, cols)
 
-def is_grouped_sel(select):
-    return False
-
 def has_windows(clause):
     windows = []
     append_win = lambda col: windows.append(col)
@@ -618,8 +615,8 @@ def _rename(__data, **kwargs):
 
 @distinct.register(LazyTbl)
 def _distinct(__data, *args, _keep_all = False, **kwargs):
-    if _keep_all:
-        raise NotImplementedError("Distinct in sql requires _keep_all = True")
+    if (args or kwargs) and _keep_all:
+        raise NotImplementedError("Distinct with variables specified in sql requires _keep_all = False")
 
     inner_sel = mutate(__data, **kwargs).last_op if kwargs else __data.last_op
 
@@ -633,6 +630,8 @@ def _distinct(__data, *args, _keep_all = False, **kwargs):
                 "e.g. _.colname or _['colname']"
                 )
 
+    if not cols: cols = list(inner_sel.columns.keys())
+
     sel_cols = lift_inner_cols(inner_sel)
     distinct_cols = [sel_cols[k] for k in cols]
 
diff --git a/siuba/tests/__init__.py b/siuba/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/siuba/tests/conftest.py b/siuba/tests/conftest.py
new file mode 100644
index 00000000..3fc39a01
--- /dev/null
+++ b/siuba/tests/conftest.py
@@ -0,0 +1,6 @@
+import pytest
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--dbs", action="store", default="sqlite", help="databases tested against (comma separated)"
+    )
diff --git a/siuba/tests/helpers.py b/siuba/tests/helpers.py
new file mode 100644
index 00000000..10782f25
--- /dev/null
+++ b/siuba/tests/helpers.py
@@ -0,0 +1,67 @@
+from sqlalchemy import create_engine, types
+from siuba.sql import LazyTbl, collect
+from pandas.testing import assert_frame_equal
+
+class DbConRegistry:
+    table_name_indx = 0
+
+    def __init__(self):
+        self.connections = {}
+
+    def register(self, name, engine):
+        self.connections[name] = engine
+
+    def remove(self, name):
+        con = self.connections[name]
+        con.close()
+        del self.connections[name]
+
+        return con
+
+    @classmethod
+    def unique_table_name(cls):
+        cls.table_name_indx += 1
+        return "siuba_{0:03d}".format(cls.table_name_indx)
+
+    def load_df(self, df):
+        out = []
+        for k, engine in self.connections.items():
+            lazy_tbl = copy_to_sql(df, self.unique_table_name(), engine)
+            out.append(lazy_tbl)
+        return out
+
+def assert_frame_sort_equal(a, b):
+    """Tests that DataFrames are equal, even if rows are in different order"""
+    sorted_a = a.sort_values(by = a.columns.tolist()).reset_index(drop = True)
+    sorted_b = b.sort_values(by = b.columns.tolist()).reset_index(drop = True)
+
+    assert_frame_equal(sorted_a, sorted_b)
+
+def assert_equal_query(tbls, lazy_query, target):
+    for tbl in tbls:
+        out = collect(lazy_query(tbl))
+        assert_frame_sort_equal(out, target)
+
+
+PREFIX_TO_TYPE = {
+        # for datetime, need to convert to pandas datetime column
+        #"dt": types.DateTime,
+        "int": types.Integer,
+        "float": types.Float,
+        "str": types.String
+        }
+
+def auto_types(df):
+    dtype = {}
+    for k in df.columns:
+        pref, *_ = k.split('_')
+        if pref in PREFIX_TO_TYPE:
+            dtype[k] = PREFIX_TO_TYPE[pref]
+    return dtype
+
+
+def copy_to_sql(df, name, engine):
+    df.to_sql(name, engine, dtype = auto_types(df), index = False, if_exists = "replace")
+    return LazyTbl(engine, name)
+
+
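
Another hedged sketch, not part of the patch: how the helpers above are meant to be wired together, following the pattern used by siuba/tests/test_sql_verbs_distinct.py below (register one engine per backend, copy a DataFrame into each, then compare a lazy query against the pandas answer).

# sketch only: assumes an in-memory sqlite engine; the data and names below are illustrative
import pandas as pd
from sqlalchemy import create_engine
from siuba import _, distinct
from siuba.tests.helpers import DbConRegistry, assert_equal_query

data = pd.DataFrame({"x": [1, 1, 2], "y": [1, 2, 2]})

dbs = DbConRegistry()
dbs.register("sqlite", create_engine("sqlite:///:memory:"))

# copy the DataFrame into every registered backend; returns one LazyTbl per engine
tbls = dbs.load_df(data)

# run the lazy query on each backend and compare to the pandas result, ignoring row order
assert_equal_query(
    tbls,
    distinct(_.y),
    data.drop_duplicates(['y'])[['y']].reset_index(drop = True)
    )
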
diff --git a/siuba/tests/test_dply_verbs.py b/siuba/tests/test_dply_verbs.py
index f720e187..0320f40a 100644
--- a/siuba/tests/test_dply_verbs.py
+++ b/siuba/tests/test_dply_verbs.py
@@ -76,6 +76,15 @@ def test_varlist_multi_slice_negate(df1):
     assert out.columns.tolist() == ["language", "stars", "x"]
 
 
+# Distinct --------------------------------------------------------------------
+
+from siuba.dply.verbs import distinct
+
+def test_distinct_no_args():
+    df = pd.DataFrame({'x': [1,1,2], 'y': [1,1,2]})
+    assert_frame_equal(distinct(df), df.drop_duplicates().reset_index(drop = True))
+
+
 # Nest ------------------------------------------------------------------------
 
 from siuba.dply.verbs import nest, unnest
diff --git a/siuba/tests/test_sql_verbs_distinct.py b/siuba/tests/test_sql_verbs_distinct.py
new file mode 100644
index 00000000..f02d3586
--- /dev/null
+++ b/siuba/tests/test_sql_verbs_distinct.py
@@ -0,0 +1,79 @@
+"""
+Note: this test file was heavily influenced by its dbplyr counterpart.
+
+https://github.com/tidyverse/dbplyr/blob/master/tests/testthat/test-verb-distinct.R
+"""
+
+from siuba.sql import LazyTbl, collect
+from siuba import _, distinct
+import pandas as pd
+import os
+
+import pytest
+from sqlalchemy import create_engine
+
+from .helpers import assert_equal_query, DbConRegistry
+
+DATA = pd.DataFrame({
+    "x": [1,1,1,1],
+    "y": [1,1,2,2],
+    "z": [1,2,1,2]
+    })
+
+@pytest.fixture(scope = "module")
+def dbs(request):
+    dialects = set(request.config.getoption("--dbs").split(","))
+    dbs = DbConRegistry()
+
+    if "sqlite" in dialects:
+        dbs.register("sqlite", create_engine("sqlite:///:memory:"))
+    if "postgresql" in dialects:
+        port = os.environ.get("PGPORT", "5433")
+        dbs.register("postgresql", create_engine('postgresql://postgres:@localhost:%s/postgres'%port))
+
+
+    yield dbs
+
+    # cleanup
+    for engine in dbs.connections.values():
+        engine.dispose()
+
+@pytest.fixture(scope = "module")
+def dfs(dbs):
+    yield dbs.load_df(DATA)
+
+def test_distinct_no_args(dfs):
+    assert_equal_query(dfs, distinct(), DATA.drop_duplicates())
+    assert_equal_query(dfs, distinct(), distinct(DATA))
+
+def test_distinct_one_arg(dfs):
+    assert_equal_query(
+        dfs,
+        distinct(_.y),
+        DATA.drop_duplicates(['y'])[['y']].reset_index(drop = True)
+        )
+
+    assert_equal_query(dfs, distinct(_.y), distinct(DATA, _.y))
+
+def test_distinct_keep_all_not_impl(dfs):
+    # TODO: should just mock LazyTbl
+    for tbl in dfs:
+        with pytest.raises(NotImplementedError):
+            distinct(tbl, _.y, _keep_all = True) >> collect()
+
+
+@pytest.mark.xfail
+def test_distinct_via_group_by(dfs):
+    # NotImplemented
+    assert False
+
+def test_distinct_kwargs(dfs):
+    dst = DATA.drop_duplicates(['y', 'x']) \
+        .rename(columns = {'x': 'a'}) \
+        .reset_index(drop = True)[['y', 'a']]
+
+    assert_equal_query(dfs, distinct(_.y, a = _.x), dst)
+
+
+
+
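
Finally, a hedged sketch, not part of the patch, of the revised SQL-side distinct() rules from siuba/sql/verbs.py: with no columns it now de-duplicates on every column of the lazy table, while naming columns together with _keep_all = True still raises NotImplementedError. The table name used here is arbitrary.

# sketch only: 'distinct_demo' is a made-up table name for illustration
import pandas as pd
from sqlalchemy import create_engine
from siuba import _, distinct
from siuba.sql import collect
from siuba.tests.helpers import copy_to_sql

engine = create_engine("sqlite:///:memory:")
tbl = copy_to_sql(pd.DataFrame({"x": [1, 1], "y": [1, 2]}), "distinct_demo", engine)

collect(distinct(tbl))         # no columns: DISTINCT over both x and y (new special case)
collect(distinct(tbl, _.y))    # SELECT DISTINCT on y only

try:
    distinct(tbl, _.y, _keep_all = True)
except NotImplementedError:
    pass                       # still unsupported when columns are specified with _keep_all
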