From 44e46c94aec369f624729c33ac6426adf5ae07d2 Mon Sep 17 00:00:00 2001 From: Michael Chow Date: Fri, 2 Aug 2019 11:35:38 -0400 Subject: [PATCH] docs: commit gather, add developer section, sql translators --- docs/api_tidy/02_gather.Rmd | 48 ++++ docs/conf.py | 2 +- docs/developer/index.rst | 8 + docs/developer/sql-translators.ipynb | 373 +++++++++++++++++++++++++++ docs/index.rst | 1 + docs/intro.Rmd | 2 +- docs/intro_sql_basic.ipynb | 4 +- docs/intro_sql_interm.ipynb | 2 +- 8 files changed, 435 insertions(+), 5 deletions(-) create mode 100644 docs/api_tidy/02_gather.Rmd create mode 100644 docs/developer/index.rst create mode 100644 docs/developer/sql-translators.ipynb diff --git a/docs/api_tidy/02_gather.Rmd b/docs/api_tidy/02_gather.Rmd new file mode 100644 index 00000000..26837c0a --- /dev/null +++ b/docs/api_tidy/02_gather.Rmd @@ -0,0 +1,48 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{python nbsphinx=hidden} +import pandas as pd +pd.set_option("display.max_rows", 20) +``` + +## Gather + +```{python} +from siuba import _, nest, unnest, group_by, gather +from siuba.data import mtcars +``` + +```{python} +costs = pd.DataFrame({ + 'id': [1,2], + 'price_x': [.1, .2], + 'price_y': [.4, .5], + 'price_z': [.7, .8] +}) + +costs +``` + +```{python} +costs >> gather('measure', 'value', _.price_x, _.price_y, _.price_z) +``` + +```{python} +costs >> gather('measure', 'value', _["price_x":"price_z"]) +``` + +```{python} +costs >> gather('measure', 'value', -_.id) +``` diff --git a/docs/conf.py b/docs/conf.py index 76b62f09..a39e53f6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,7 +10,7 @@ ] # Exclude build directory and Jupyter backup files: -exclude_patterns = ['_build', '**.ipynb_checkpoints'] +exclude_patterns = ['_build', '**.ipynb_checkpoints', '**.swp'] # Default language for 
syntax highlighting in reST and Markdown cells highlight_language = 'none' diff --git a/docs/developer/index.rst b/docs/developer/index.rst new file mode 100644 index 00000000..0b23cb1d --- /dev/null +++ b/docs/developer/index.rst @@ -0,0 +1,8 @@ +Developers +========== + +.. toctree:: + :maxdepth: 2 + + sql-translators.ipynb + diff --git a/docs/developer/sql-translators.ipynb b/docs/developer/sql-translators.ipynb new file mode 100644 index 00000000..238a6b4f --- /dev/null +++ b/docs/developer/sql-translators.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SQL translators" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this vignette is to walk through how expressions like `_.id.mean()` are converted into SQL.\n", + "\n", + "This process involves 3 parts\n", + "\n", + "1. SQL translation functions, e.g. taking column \"id\" and producing the SQL \"ROUND(id)\".\n", + "2. SQL translation from a symbolic call\n", + " - Converting method calls like `_.id.round(2)` to `round(_.id, 2)`\n", + " - Looking up SQL translators (e.g. for \"mean\" function call)\n", + "3. Handling SQL partitions, like in OVER clauses\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using sqlalchemy select statement for convenience\n", + "\n", + "Throughout this vignette, we'll use a select statement object from sqlalchemy,\n", + "so we can conveniently access its columns as needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import sql\n", + "col_names = ['id', 'x', 'y']\n", + "sel = sql.select([sql.column(x) for x in col_names])\n", + "\n", + "print(sel)\n", + "print(type(sel.columns))\n", + "print(sel.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Translator functions\n", + "\n", + "A SQL translator function takes...\n", + "\n", + "* a first argument that is a sqlalchemy Column\n", + "* (optional) additional arguments for the translation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### A simple translator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f_simple_round = lambda col, n: sql.func.round(col, n)\n", + "\n", + "sql_expr = f_simple_round(sel.columns.x, 2)\n", + "\n", + "print(sql_expr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The function above is essentially what most translator functions are.\n", + "\n", + "For example, here is the round function defined for postgresql.\n", + "One key difference is that it casts the column to a numeric beforehand." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from siuba.sql.dialects.postgresql import funcs\n", + "\n", + "f_round = funcs['scalar']['round']\n", + "sql_expr = f_round(sel.columns.x, 2)\n", + "\n", + "print(sql_expr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handling windows with custom Over clauses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f_win_mean = funcs['window']['mean']\n", + "\n", + "sql_over_expr = f_win_mean(sel.columns.x)\n", + "\n", + "print(type(sql_over_expr))\n", + "print(sql_over_expr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that this window expression has an empty over clause. This clause needs to be able to include any variables we've grouped the data by.\n", + "\n", + "Siuba handles this by implementing a `set_over` method on these custom sqlalchemy Over clauses, which takes grouping and ordering variables as arguments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_by_clause = sql.elements.ClauseList(sel.columns.x, sel.columns.y)\n", + "print(sql_over_expr.set_over(group_by_clause))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Call shaping\n", + "\n", + "The section above discusses how SQL translators are functions that take a sqlalchemy column, and return a SQL expression. However, when using siuba we often have expressions like...\n", + "\n", + "```\n", + "mutate(data, x = _.y.round(2))\n", + "```\n", + "\n", + "In this case, before we can even use a SQL translator, we need to...\n", + "\n", + "* find the name and arguments of the method being called\n", + "* find the column it is being called on\n", + "\n", + "This is done by using the `CallTreeLocal` class to analyze the tree of operations for each expression." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from siuba.siu import Lazy, CallTreeLocal, Call, strip_symbolic\n", + "from siuba import _\n", + "\n", + "_.y.round(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example of translation with CallTreeLocal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from siuba.sql.dialects.postgresql import funcs\n", + "\n", + "local_funcs = {**funcs['scalar'], **funcs['window']}\n", + "\n", + "call_shaper = CallTreeLocal(\n", + " local_funcs,\n", + " rm_attr = ('str', 'dt'),\n", + " call_sub_attr = ('dt',)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "symbol = _.id.mean()\n", + "call = strip_symbolic(symbol)\n", + "print(call)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "func_call = call_shaper.enter(call)\n", + "print(func_call(sel.columns))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the same result as when we called the SQL translator for `mean` manually!\n", + "In that section we also showed that we can set group information, so that it takes \n", + "an average within each group.\n", + "\n", + "In this case it's easy to set group information to the Over clause.\n", + "However, an additional challenge is when it's part of a larger expression..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "call2 = strip_symbolic(_.id.mean() + 1)\n", + "func_call2 = call_shaper.enter(call2)\n", + "\n", + "func_call2(sel.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Handling partitions\n", + "\n", + "While the first section showed how siuba's custom Over clauses can add grouping info to a translation, it is missing one key detail: expressions that generate Over clauses, like `_.id.mean()`, can be part of larger expressions. For example `_.id.mean() + 1`.\n", + "\n", + "In this case, if we look at the call tree for that expression, the top operation is the addition..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "_.id.mean() + 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How can we create the appropriate expression...\n", + "\n", + "```\n", + "avg(some_col) OVER (PARTITION BY x, y) + 1\n", + "```\n", + "\n", + "when the piece that needs grouping info is not easily accessible? The answer is by using a tree visitor, which steps down every black rectangle in the call tree shown above, from top to bottom.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Full example\n", + "\n", + "Below, we copy the code from the call shaping section..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from siuba.sql.verbs import track_call_windows\n", + "from siuba import _\n", + "from siuba.sql.dialects.postgresql import funcs\n", + "\n", + "local_funcs = {**funcs['scalar'], **funcs['window']}\n", + "\n", + "call_shaper = CallTreeLocal(\n", + " local_funcs,\n", + " rm_attr = ('str', 'dt'),\n", + " call_sub_attr = ('dt',)\n", + " )\n", + "\n", + "symbol3 = _.id.mean() + 1\n", + "call3 = strip_symbolic(symbol3)\n", + "func_call3 = call_shaper.enter(call3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we pass the shaped call..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "col, windows = track_call_windows(\n", + " func_call3,\n", + " sel.columns,\n", + " group_by = ['x', 'y'],\n", + " order_by = []\n", + " )\n", + "\n", + "print(col)\n", + "print(windows)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/index.rst b/docs/index.rst index 79d5fdf8..ebfc931c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,6 +5,7 @@ intro.Rmd intro_sql_basic.ipynb intro_sql_interm.ipynb + developer/index.rst .. 
toctree:: :maxdepth: 2 diff --git a/docs/intro.Rmd b/docs/intro.Rmd index 6f12a01a..1bde1e26 100644 --- a/docs/intro.Rmd +++ b/docs/intro.Rmd @@ -137,7 +137,7 @@ from sqlalchemy import create_engine from siuba.sql import LazyTbl # # copy in to sqlite -engine = create_engine("sqlite:///:memory") +engine = create_engine("sqlite:///:memory:") mtcars.to_sql("mtcars", engine, if_exists = "replace") # connect with siuba diff --git a/docs/intro_sql_basic.ipynb b/docs/intro_sql_basic.ipynb index 4ccc149d..cbe6548e 100644 --- a/docs/intro_sql_basic.ipynb +++ b/docs/intro_sql_basic.ipynb @@ -20,7 +20,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Using to query SQL (intro)" + "# Querying SQL (intro)" ] }, { @@ -320,7 +320,7 @@ "from siuba.sql import LazyTbl\n", "\n", "# copy in to sqlite\n", - "engine = create_engine(\"sqlite:///:memory\")\n", + "engine = create_engine(\"sqlite:///:memory:\")\n", "mtcars.to_sql(\"mtcars\", engine, if_exists = \"replace\")\n", "\n", "# connect with siuba\n", diff --git a/docs/intro_sql_interm.ipynb b/docs/intro_sql_interm.ipynb index 7d4265bb..4ffa2226 100644 --- a/docs/intro_sql_interm.ipynb +++ b/docs/intro_sql_interm.ipynb @@ -22,7 +22,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Using to query SQL (advanced)\n", + "# Querying SQL (advanced)\n", "\n", "**NOTE: THIS DOC IS CURRENTLY IN OUTLINE FORM**\n", "\n",