From 44e46c94aec369f624729c33ac6426adf5ae07d2 Mon Sep 17 00:00:00 2001
From: Michael Chow <machow@princeton.edu>
Date: Fri, 2 Aug 2019 11:35:38 -0400
Subject: [PATCH] docs: commit gather, add developer section, sql translators

---
 docs/api_tidy/02_gather.Rmd          |  48 ++++
 docs/conf.py                         |   2 +-
 docs/developer/index.rst             |   8 +
 docs/developer/sql-translators.ipynb | 373 +++++++++++++++++++++++++++
 docs/index.rst                       |   1 +
 docs/intro.Rmd                       |   2 +-
 docs/intro_sql_basic.ipynb           |   4 +-
 docs/intro_sql_interm.ipynb          |   2 +-
 8 files changed, 435 insertions(+), 5 deletions(-)
 create mode 100644 docs/api_tidy/02_gather.Rmd
 create mode 100644 docs/developer/index.rst
 create mode 100644 docs/developer/sql-translators.ipynb

diff --git a/docs/api_tidy/02_gather.Rmd b/docs/api_tidy/02_gather.Rmd
new file mode 100644
index 00000000..26837c0a
--- /dev/null
+++ b/docs/api_tidy/02_gather.Rmd
@@ -0,0 +1,48 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .Rmd
+      format_name: rmarkdown
+      format_version: '1.1'
+      jupytext_version: 1.1.1
+  kernelspec:
+    display_name: Python 3
+    language: python
+    name: python3
+---
+
+```{python nbsphinx=hidden}
+import pandas as pd
+pd.set_option("display.max_rows", 20)
+```
+
+## Gather
+
+```{python}
+from siuba import _, nest, unnest, group_by, gather
+from siuba.data import mtcars
+```
+
+```{python}
+costs = pd.DataFrame({
+    'id': [1,2],
+    'price_x': [.1, .2],
+    'price_y': [.4, .5],
+    'price_z': [.7, .8]
+})
+
+costs
+```
+
+```{python}
+costs >> gather('measure', 'value', _.price_x, _.price_y, _.price_z)
+```
+
+```{python}
+costs >> gather('measure', 'value', _["price_x":"price_z"])
+```
+
+```{python}
+costs >> gather('measure', 'value', -_.id)
+```
diff --git a/docs/conf.py b/docs/conf.py
index 76b62f09..a39e53f6 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -10,7 +10,7 @@
 ]
 
 # Exclude build directory and Jupyter backup files:
-exclude_patterns = ['_build', '**.ipynb_checkpoints']
+exclude_patterns = ['_build', '**.ipynb_checkpoints', '**.swp']
 
 # Default language for syntax highlighting in reST and Markdown cells
 highlight_language = 'none'
diff --git a/docs/developer/index.rst b/docs/developer/index.rst
new file mode 100644
index 00000000..0b23cb1d
--- /dev/null
+++ b/docs/developer/index.rst
@@ -0,0 +1,8 @@
+Developers
+==========
+
+.. toctree::
+    :maxdepth: 2
+
+    sql-translators.ipynb
+
diff --git a/docs/developer/sql-translators.ipynb b/docs/developer/sql-translators.ipynb
new file mode 100644
index 00000000..238a6b4f
--- /dev/null
+++ b/docs/developer/sql-translators.ipynb
@@ -0,0 +1,373 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SQL translators"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The purpose of this vignette is to walk through how expressions like `_.id.mean()` are converted into SQL.\n",
+    "\n",
+    "This process involves 3 parts:\n",
+    "\n",
+    "1. SQL translation functions, e.g. taking column \"id\" and producing the SQL \"ROUND(id)\".\n",
+    "2. SQL translation from a symbolic call\n",
+    "    - Converting method calls like `_.id.round(2)` to `round(_.id, 2)`\n",
+    "    - Looking up SQL translators (e.g. for \"mean\" function call)\n",
+    "3. Handling SQL partitions, like in OVER clauses\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Using a sqlalchemy select statement for convenience\n",
+    "\n",
+    "Throughout this vignette, we'll use a select statement object from sqlalchemy,\n",
+    "so we can conveniently access its columns as needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sqlalchemy import sql\n",
+    "col_names = ['id', 'x', 'y']\n",
+    "sel = sql.select([sql.column(x) for x in col_names])\n",
+    "\n",
+    "print(sel)\n",
+    "print(type(sel.columns))\n",
+    "print(sel.columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Translator functions\n",
+    "\n",
+    "A SQL translator function takes...\n",
+    "\n",
+    "* a first argument that is a sqlalchemy Column\n",
+    "* (optional) additional arguments for the translation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### A simple translator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f_simple_round = lambda col, n: sql.func.round(col, n)\n",
+    "\n",
+    "sql_expr = f_simple_round(sel.columns.x, 2)\n",
+    "\n",
+    "print(sql_expr)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The function above is essentially what most translator functions are.\n",
+    "\n",
+    "For example, here is the round function defined for postgresql.\n",
+    "One key difference is that it casts the column to a numeric beforehand."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from siuba.sql.dialects.postgresql import funcs\n",
+    "\n",
+    "f_round = funcs['scalar']['round']\n",
+    "sql_expr = f_round(sel.columns.x, 2)\n",
+    "\n",
+    "print(sql_expr)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Handling windows with custom Over clauses"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f_win_mean = funcs['window']['mean']\n",
+    "\n",
+    "sql_over_expr = f_win_mean(sel.columns.x)\n",
+    "\n",
+    "print(type(sql_over_expr))\n",
+    "print(sql_over_expr)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notice that this window expression has an empty over clause. This clause needs to be able to include any variables we've grouped the data by.\n",
+    "\n",
+    "Siuba handles this by implementing a `set_over` method on these custom sqlalchemy Over clauses, which takes grouping and ordering variables as arguments."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "group_by_clause = sql.elements.ClauseList(sel.columns.x, sel.columns.y)\n",
+    "print(sql_over_expr.set_over(group_by_clause))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Call shaping\n",
+    "\n",
+    "The section above discusses how SQL translators are functions that take a sqlalchemy column, and return a SQL expression. However, when using siuba we often have expressions like...\n",
+    "\n",
+    "```\n",
+    "mutate(data, x = _.y.round(2))\n",
+    "```\n",
+    "\n",
+    "In this case, before we can even use a SQL translator, we need to...\n",
+    "\n",
+    "* find the name and arguments of the method being called\n",
+    "* find the column it is being called on\n",
+    "\n",
+    "This is done by using the `CallTreeLocal` class to analyze the tree of operations for each expression."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from siuba.siu import Lazy, CallTreeLocal, Call, strip_symbolic\n",
+    "from siuba import _\n",
+    "\n",
+    "_.y.round(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example of translation with CallTreeLocal"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from siuba.sql.dialects.postgresql import funcs\n",
+    "\n",
+    "local_funcs = {**funcs['scalar'], **funcs['window']}\n",
+    "\n",
+    "call_shaper = CallTreeLocal(\n",
+    "    local_funcs,\n",
+    "    rm_attr = ('str', 'dt'),\n",
+    "    call_sub_attr = ('dt',)\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "symbol = _.id.mean()\n",
+    "call = strip_symbolic(symbol)\n",
+    "print(call)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "func_call = call_shaper.enter(call)\n",
+    "print(func_call(sel.columns))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is the same result as when we called the SQL translator for `mean` manually!\n",
+    "In that section we also showed that we can set group information, so that it takes \n",
+    "an average within each group.\n",
+    "\n",
+    "In this case it's easy to set group information on the Over clause.\n",
+    "However, an additional challenge is when it's part of a larger expression..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "call2 = strip_symbolic(_.id.mean() + 1)\n",
+    "func_call2 = call_shaper.enter(call2)\n",
+    "\n",
+    "func_call2(sel.columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Handling partitions\n",
+    "\n",
+    "While the first section showed how siuba's custom Over clauses can add grouping info to a translation, it is missing one key detail: expressions that generate Over clauses, like `_.id.mean()`, can be part of larger expressions. For example `_.id.mean() + 1`.\n",
+    "\n",
+    "In this case, if we look at the call tree for that expression, the top operation is the addition..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "_.id.mean() + 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How can we create the appropriate expression...\n",
+    "\n",
+    "```\n",
+    "avg(some_col) OVER (PARTITION BY x, y) + 1\n",
+    "```\n",
+    "\n",
+    "when the piece that needs grouping info is not easily accessible? The answer is to use a tree visitor, which steps down every black rectangle in the call tree shown above, from top to bottom.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Full example\n",
+    "\n",
+    "Below, we copy the code from the call shaping section..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from siuba.sql.verbs import track_call_windows\n",
+    "from siuba import _\n",
+    "from siuba.sql.dialects.postgresql import funcs\n",
+    "\n",
+    "local_funcs = {**funcs['scalar'], **funcs['window']}\n",
+    "\n",
+    "call_shaper = CallTreeLocal(\n",
+    "    local_funcs,\n",
+    "    rm_attr = ('str', 'dt'),\n",
+    "    call_sub_attr = ('dt',)\n",
+    "    )\n",
+    "\n",
+    "symbol3 = _.id.mean() + 1\n",
+    "call3 = strip_symbolic(symbol3)\n",
+    "func_call3 = call_shaper.enter(call3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we pass the shaped call to `track_call_windows`, along with the select's columns and the variables to group and order by..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "col, windows = track_call_windows(\n",
+    "    func_call3,\n",
+    "    sel.columns,\n",
+    "    group_by = ['x', 'y'],\n",
+    "    order_by = []\n",
+    "    )\n",
+    "\n",
+    "print(col)\n",
+    "print(windows)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/index.rst b/docs/index.rst
index 79d5fdf8..ebfc931c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -5,6 +5,7 @@
     intro.Rmd
     intro_sql_basic.ipynb
     intro_sql_interm.ipynb
+    developer/index.rst
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/intro.Rmd b/docs/intro.Rmd
index 6f12a01a..1bde1e26 100644
--- a/docs/intro.Rmd
+++ b/docs/intro.Rmd
@@ -137,7 +137,7 @@ from sqlalchemy import create_engine
 from siuba.sql import LazyTbl
 
 # # copy in to sqlite
-engine = create_engine("sqlite:///:memory")
+engine = create_engine("sqlite:///:memory:")
 mtcars.to_sql("mtcars", engine, if_exists = "replace")
 
 # connect with siuba
diff --git a/docs/intro_sql_basic.ipynb b/docs/intro_sql_basic.ipynb
index 4ccc149d..cbe6548e 100644
--- a/docs/intro_sql_basic.ipynb
+++ b/docs/intro_sql_basic.ipynb
@@ -20,7 +20,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Using to query SQL (intro)"
+    "# Querying SQL (intro)"
    ]
   },
   {
@@ -320,7 +320,7 @@
     "from siuba.sql import LazyTbl\n",
     "\n",
     "# copy in to sqlite\n",
-    "engine = create_engine(\"sqlite:///:memory\")\n",
+    "engine = create_engine(\"sqlite:///:memory:\")\n",
     "mtcars.to_sql(\"mtcars\", engine, if_exists = \"replace\")\n",
     "\n",
     "# connect with siuba\n",
diff --git a/docs/intro_sql_interm.ipynb b/docs/intro_sql_interm.ipynb
index 7d4265bb..4ffa2226 100644
--- a/docs/intro_sql_interm.ipynb
+++ b/docs/intro_sql_interm.ipynb
@@ -22,7 +22,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Using to query SQL (advanced)\n",
+    "# Querying SQL (advanced)\n",
     "\n",
     "**NOTE: THIS DOC IS CURRENTLY IN OUTLINE FORM**\n",
     "\n",