diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dbe3eb1 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +notebooks/Rexhomes model notebook rubric.ipynb diff --git a/data/IrisDataSet.csv.gz b/data/IrisDataSet.csv.gz new file mode 100644 index 0000000..1b15ee6 Binary files /dev/null and b/data/IrisDataSet.csv.gz differ diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..09583f8 --- /dev/null +++ b/environment.yml @@ -0,0 +1,22 @@ +name: nb_rubric + +channels: + - defaults + - conda-forge + +dependencies: + - bokeh + - jupyter + - jupyter_contrib_nbextensions + - jupyter_nbextensions_configurator + - matplotlib + - pandas + - pip + - s3fs + - seaborn + - scikit-learn + - statsmodels + - xgboost + - pip: + - fancyimpute + - pandas-profiling diff --git a/images/MLflow-logo-final-white-TM.png b/images/MLflow-logo-final-white-TM.png new file mode 100644 index 0000000..1b6d0e5 Binary files /dev/null and b/images/MLflow-logo-final-white-TM.png differ diff --git a/images/austin-neill-emH2e5SBifE-unsplash.jpg b/images/austin-neill-emH2e5SBifE-unsplash.jpg new file mode 100644 index 0000000..f41f913 Binary files /dev/null and b/images/austin-neill-emH2e5SBifE-unsplash.jpg differ diff --git a/images/bentoml-readme-header.jpeg b/images/bentoml-readme-header.jpeg new file mode 100644 index 0000000..7b26ab7 Binary files /dev/null and b/images/bentoml-readme-header.jpeg differ diff --git a/images/casey-horner-y7jrFSlVZAQ-unsplash-cropped.jpg b/images/casey-horner-y7jrFSlVZAQ-unsplash-cropped.jpg new file mode 100644 index 0000000..5ad48bf Binary files /dev/null and b/images/casey-horner-y7jrFSlVZAQ-unsplash-cropped.jpg differ diff --git a/images/casey-horner-y7jrFSlVZAQ-unsplash.jpg b/images/casey-horner-y7jrFSlVZAQ-unsplash.jpg new file mode 100644 index 0000000..2b861c0 Binary files /dev/null and b/images/casey-horner-y7jrFSlVZAQ-unsplash.jpg differ diff --git a/images/guillaume-bolduc-uBe2mknURG4-unsplash.jpg 
b/images/guillaume-bolduc-uBe2mknURG4-unsplash.jpg new file mode 100644 index 0000000..7eb3cb6 Binary files /dev/null and b/images/guillaume-bolduc-uBe2mknURG4-unsplash.jpg differ diff --git a/images/mapr-lambda-architecture.png b/images/mapr-lambda-architecture.png new file mode 100644 index 0000000..134e457 Binary files /dev/null and b/images/mapr-lambda-architecture.png differ diff --git a/images/noaa-3duT-54VuK8-unsplash.jpg b/images/noaa-3duT-54VuK8-unsplash.jpg new file mode 100644 index 0000000..0bd4a76 Binary files /dev/null and b/images/noaa-3duT-54VuK8-unsplash.jpg differ diff --git a/images/provbook-example.png b/images/provbook-example.png new file mode 100644 index 0000000..ac3e573 Binary files /dev/null and b/images/provbook-example.png differ diff --git a/images/redis-ai.png b/images/redis-ai.png new file mode 100644 index 0000000..8d83a9a Binary files /dev/null and b/images/redis-ai.png differ diff --git a/images/richards-event-architecture.png b/images/richards-event-architecture.png new file mode 100644 index 0000000..697e46d Binary files /dev/null and b/images/richards-event-architecture.png differ diff --git a/images/sculley-et-al_hidden-tech-debt-ml.png b/images/sculley-et-al_hidden-tech-debt-ml.png new file mode 100644 index 0000000..03513c9 Binary files /dev/null and b/images/sculley-et-al_hidden-tech-debt-ml.png differ diff --git a/images/terrel-breakdown-of-on-node-monitors.png b/images/terrel-breakdown-of-on-node-monitors.png new file mode 100644 index 0000000..733e53d Binary files /dev/null and b/images/terrel-breakdown-of-on-node-monitors.png differ diff --git a/images/terrel-system-diagram.png b/images/terrel-system-diagram.png new file mode 100644 index 0000000..d1e8171 Binary files /dev/null and b/images/terrel-system-diagram.png differ diff --git a/images/whoisbenjamin-ApJp5Nk24a0-unsplash.jpg b/images/whoisbenjamin-ApJp5Nk24a0-unsplash.jpg new file mode 100644 index 0000000..aaea0a8 Binary files /dev/null and 
b/images/whoisbenjamin-ApJp5Nk24a0-unsplash.jpg differ diff --git a/notebooks/01_model_review.ipynb b/notebooks/01_model_review.ipynb index c887bce..6321fc0 100644 --- a/notebooks/01_model_review.ipynb +++ b/notebooks/01_model_review.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "metadata": { "slideshow": { "slide_type": "skip" @@ -10,15 +10,322 @@ }, "outputs": [ { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'pandas_profiling'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpandas_profiling\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mstatsmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pandas_profiling'" - ] + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " var force = true;\n", + "\n", + " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + " var JS_MIME_TYPE = 'application/javascript';\n", + " var HTML_MIME_TYPE = 'text/html';\n", + " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " var CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " var script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " var cell = handle.cell;\n", + "\n", + " var id = cell.output_area._bokeh_element_id;\n", + " var server_id = cell.output_area._bokeh_server_id;\n", + " // Clean up Bokeh references\n", + " if (id != null && id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " var id = msg.content.text.trim();\n", + " if (id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", + 
" cell.notebook.kernel.execute(cmd);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " var output_area = handle.output_area;\n", + " var output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " var bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " var script_attrs = bk_div.children[0].attributes;\n", + " for (var i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " var toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " var props = {data: data, metadata: 
metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " var events = require('base/js/events');\n", + " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + "\n", + " \n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " var NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"
    \\n\"+\n", + " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", + " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", + " \"
\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " var el = document.getElementById(\"1001\");\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error() {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (var i = 0; i < css_urls.length; i++) {\n", + " var url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = on_load;\n", + " element.onerror 
= on_error;\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.1.min.js\": \"qkRvDQVAIfzsJo40iRBbxt6sttt0hv4lh74DG7OK4MCHv4C5oohXYoHUM5W11uqS\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.1.min.js\": \"Sb7Mr06a9TNlet/GEBeKaf5xH3eb6AlCzwjtU82wNPyDrnfoiVl26qnvlKjmcAd+\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.1.min.js\": \"HaJ15vgfmcfRtB4c4YBOI4f1MUujukqInOWVqZJZZGK7Q+ivud0OKGSTn/Vm2iso\"};\n", + "\n", + " for (var i = 0; i < js_urls.length; i++) {\n", + " var url = js_urls[i];\n", + " var element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error;\n", + " element.async = false;\n", + " element.src = url;\n", + " if (url in hashes) {\n", + " element.crossOrigin = \"anonymous\";\n", + " element.integrity = \"sha384-\" + hashes[url];\n", + " }\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " \n", + " var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.1.min.js\"];\n", + " var css_urls = [];\n", + " \n", + "\n", + " var inline_js = [\n", + " function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + " function(Bokeh) {\n", + " \n", + " \n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " \n", + " if (root.Bokeh !== 
undefined || force === true) {\n", + " \n", + " for (var i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + " if (force === true) {\n", + " display_loaded();\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + "\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1001\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.1.min.js\": \"qkRvDQVAIfzsJo40iRBbxt6sttt0hv4lh74DG7OK4MCHv4C5oohXYoHUM5W11uqS\", 
\"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.1.min.js\": \"Sb7Mr06a9TNlet/GEBeKaf5xH3eb6AlCzwjtU82wNPyDrnfoiVl26qnvlKjmcAd+\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.1.min.js\": \"HaJ15vgfmcfRtB4c4YBOI4f1MUujukqInOWVqZJZZGK7Q+ivud0OKGSTn/Vm2iso\"};\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n if (url in hashes) {\n element.crossOrigin = \"anonymous\";\n element.integrity = \"sha384-\" + hashes[url];\n }\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.1.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, 
going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -70,7 +377,7 @@ "[Andy R. Terrel, PhD](https://www.linkedin.com/in/aterrel/) | Chief Data Scientist, [REX Inc.](https://rexhomes.com) | President, [NumFOCUS](https://numfocus.org) \n", "\n", "Contributions by: \n", - "[Andy Maloney](https://linkedin.com/in/andy-maloney-a43a34195) | [John Hanley](https:// linkedin.com/in/jhanley714) | REX Data Team" + "[Andy Maloney](https://linkedin.com/in/andy-maloney-a43a34195) | [John Hanley](https://linkedin.com/in/jhanley714) | REX Data Team" ] }, { @@ -605,6 +912,7 @@ "cell_type": "code", "execution_count": 8, "metadata": { + "collapsed": true, "slideshow": { "slide_type": "subslide" } @@ -645,6 +953,7 @@ "cell_type": "code", "execution_count": 9, "metadata": { + "collapsed": true, "slideshow": { "slide_type": "subslide" } @@ -685,11 +994,25 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Write down the meta-model\n", + "\n", + "**objective** – a sentence or two on what your model or analysis aims at.\n", + "\n", + "**KPIs** – List all key performance indicators. For example:\n", + "\n", + "- Model predicts X.\n", + "- Further actions that need to be taken to acquire data for the model.\n", + "- Out of sample predictions show that...\n", + "- Y is an outlier when these assumptions are made...\n", + "\n" + ] }, { "cell_type": "markdown", @@ -699,7 +1022,15 @@ } }, "source": [ - "### Defining your model KPIs" + "### Write down the meta-model\n", + "\n", + "**inputs**\n", + "- Tell us what the model reads, e.g. 
postgres_db_uri , or an S3 location like https://s3.console.aws.amazon.com/s3/buckets/bar/baz\n", + "- What is the provenance? Directly querying a System of Record? Or a subsequent journey? Can we reproduce it?\n", + "\n", + "**outputs** \n", + "- Where will inferences go? Stdout, file, S3, a table? \n", + "- Should we be worried about overwriting some frozen output?" ] }, { @@ -710,23 +1041,58 @@ } }, "source": [ - "### Finishing a checklist" + "### Write down the meta-model\n", + "\n", + "**assumptions**\n", + "\n", + "- Are assumptions explicitly written down in a ReadMe? \n", + "- The most common violations \n", + " - assuming independence over observations that aren't independent, or \n", + " - implicitly assuming some statistical distribution such as normality.\n", + "\n", + "- Do assumptions actually hold in practice, in the observed data? \n", + "\n", + "**benchmark** \n", + "\n", + "- What competing model are you comparing performance to? \n", + "- Is it the best available?\n", + "\n", + "**bias-variance tradeoff**\n", + "- What does the learning curve say about overfitting?\n", + "\n", + "**transformation** \n", + "- How is data transformed during ETL? \n", + "- Filtering? \n", + "- Missing attributes? \n", + "- Imputation?" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { - "slide_type": "slide" + "slide_type": "subslide" } }, "source": [ - "## ZOMG It's down!!!\n", + "### The Review Checklist" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### Summary\n", "\n", - "\n", + "Author will create a request file, and Reviewer will create a result file, which are permanently added to the repo.\n", "\n", + "#### Outcome\n", "\n", - "Photo by Casey Horner on Unsplash" + "At end of review, we’ll have learned whether Reviewer believes the model (believes it is useful), and finds it easy to use." 
] }, { @@ -737,7 +1103,11 @@ } }, "source": [ - "### Define healthchecks" + "#### Prerequisites\n", + "\n", + "- You have a model checked into source control. Good! \n", + "- You have an **Objective**, in the form of a README.md or similar file. It, too, is checked into the repo.\n", + "- You have observations. Since they are “big”, too big to conveniently feed to the elephant named Git, they can be found in S3, or perhaps in an RDBMS table. They are frozen, they shall not change in the next week or two. Consider putting them in an S3 object or DB table that has an iso8601 date as part of the name, e.g. foo-2020-04-14." ] }, { @@ -748,7 +1118,11 @@ } }, "source": [ - "### Use model servers" + "#### Prerequisites (continued)\n", + "\n", + "- You have predictions, model outputs that are stored somewhere. They are frozen, just like the input observations. Perhaps they are “small” and may conveniently be stored in a git repo. Or perhaps they are big and are more conveniently stored in S3 or table. If the table contains additional rows, be sure your ReadMe or review-request shows how to query just the rows that matter for this review.\n", + "- You think the model is mature enough for review. Consider running it through a brief code review beforehand. \n", + "- Consider doing a dry run, where Author pretends he is Reviewer and verifies the task is feasible." ] }, { @@ -759,7 +1133,112 @@ } }, "source": [ - "### Connect to alerting systems" + "#### Author process\n", + "\n", + "- Pick a reviewer. That’s a reviewer, a single reviewer, just one, as there will be some effort involved. Count not to two. We can invite more to the party later.\n", + "\n", + "- Create a new git feature branch (along with a jira ticket) for this review. Not for code development. 
Just for review.\n", + "\n", + "- Commit and push a file full of review-specific instructions to the reviewer, with a name like doc/2020-04-14-review-request.md, or in toplevel dir, or whatever is convenient for your repo. Invite the reviewer to tweak an aspect of training or prediction, something that won’t take days of compute time. (Consider testing it yourself! For “big” models, consider producing model plus toy_model, which processes small input in less than an hour. There are lots of things the reviewer might do – your job is to make it so easy that many of them are quickly accomplished.)\n", + "\n", + "- Push to Bitbucket and send your reviewer a PR in the usual way. Make no further edits. This branch is no longer yours – it belongs exclusively to the single reviewer you nominated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### Reviewer process\n", + "\n", + "- Checkout the feature / review branch.\n", + "- Create and commit an empty file with a name like doc/2020-04-16-review-result.md.\n", + "- Read the Objective, found in README.md or wherever the review-request explains it may be found. Add a sentence or two to your review-result file, describing whether the Objective is clear and seems relevant to stakeholders.\n", + "- Read the frozen model inputs, a few individual records plus stat summaries that Author helpfully made it easy to view. Write a sentence or two describing whether inputs seem to match reality, and match the Objective.\n", + "- Read the frozen model outputs. Write a sentence or two describing whether they seem to match reality, and match the Objective. For each of these three, a simple “makes sense, yes, I agree” will suffice." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### Reviewer process (continued)\n", + "\n", + "- Add a sentence or two describing transformation(s) done during ETL, and whether they seem reasonable. You should be starting with copy-n-paste of a sentence the Author helpfully put in the ReadMe.\n", + "- Read the model code, if you like, and append comments to the review-result file. This part is optional – code review should have been handled prior to model review, perhaps involving same reviewer. The code must respect frozen inputs and outputs, leaving them untouched. It must be able to send reviewer’s output to a new S3 object, table, or similar.\n", + "- Retrain the model from frozen inputs, or at least retrain toy_model. Timebox to one hour running time, and add a paragraph to review-result. Write “took too long” if model or toy_model get stuck at this stage.\n", + "- Reproduce the frozen model outputs. Timebox to one hour running time, and add a paragraph to review-result." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### Reviewer process (continued)\n", + "- Read the review-request instructions and tweak the model in the suggested way. Timebox to one hour, and add a paragraph to review-result.\n", + "- Tweak training or prediction in a way you find interesting. Timebox to one hour, and add a paragraph to review-result.\n", + "- Pick an example prediction error, or a summary description of how the error is distributed. Add a paragraph describing why the error makes sense, or does not.\n", + "- Revisit the Objective. Add a paragraph relating it to the model. Describe Next Steps, things the Reviewer feels offer opportunity for improvement, based on what we learned during review.\n", + "- Do final commit on review-result, push branch. Invite others / comment within the PR if you wish. 
Click Approve on the PR, and click Merge down to develop." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## ZOMG It's down!!!\n", + "\n", + "\n", + "\n", + "\n", + "Photo by Casey Horner on Unsplash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Define healthchecks and alerts\n", + "\n", + "\n", + "\n", + "Figure by Andy R. Terrel" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Use model servers\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "" ] }, { @@ -795,7 +1274,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/environment.yml b/notebooks/environment.yml new file mode 100644 index 0000000..09583f8 --- /dev/null +++ b/notebooks/environment.yml @@ -0,0 +1,22 @@ +name: nb_rubric + +channels: + - defaults + - conda-forge + +dependencies: + - bokeh + - jupyter + - jupyter_contrib_nbextensions + - jupyter_nbextensions_configurator + - matplotlib + - pandas + - pip + - s3fs + - seaborn + - scikit-learn + - statsmodels + - xgboost + - pip: + - fancyimpute + - pandas-profiling