diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb
index 45cfc9cd7..c02730811 100644
--- a/02_activities/assignments/assignment_1.ipynb
+++ b/02_activities/assignments/assignment_1.ipynb
@@ -30,13 +30,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Write your code below.\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()\n",
+    "\n",
+    "import dask.dataframe as dd\n",
+    "\n",
     "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -55,14 +59,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0, [])"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
    "import os\n",
    "from glob import glob\n",
    "\n",
-    "# Write your code below.\n",
+    "# os and glob are already imported above; search PRICE_DATA recursively.\n",
+    "PRICE_DATA = os.getenv(\"PRICE_DATA\")\n",
+    "\n",
+    "parquet_paths = glob(os.path.join(PRICE_DATA, \"**\", \"*.parquet\"), recursive=True)\n",
+    "print(len(parquet_paths), \"parquet files; first 5:\", parquet_paths[:5])\n",
+    "\n",
+    "all_files = glob(os.path.join(PRICE_DATA, \"**\", \"*.*\"), recursive=True)\n",
+    "len(all_files), all_files[:20]\n",
    "\n"
   ]
  },
@@ -88,32 +112,83 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PRICE_DATA = ../../05_src/data/prices/\n",
+      "Exists? False\n"
+     ]
+    }
+   ],
   "source": [
-    "# Write your code below.\n",
+    "import os\n",
+    "from dotenv import load_dotenv\n",
+    "from glob import glob\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "PRICE_DATA = os.getenv(\"PRICE_DATA\")\n",
+    "print(\"PRICE_DATA =\", PRICE_DATA)\n",
+    "\n",
+    "if PRICE_DATA is None:\n",
+    "    print(\"❌ PRICE_DATA is None (not set).\")\n",
+    "else:\n",
+    "    print(\"Exists?\", os.path.exists(PRICE_DATA))\n",
+    "    if os.path.exists(PRICE_DATA):\n",
+    "        # Show what’s inside the folder\n",
+    "        print(\"Top-level items:\", os.listdir(PRICE_DATA)[:20])\n",
+    "\n",
+    "        # Look for parquet files anywhere under PRICE_DATA\n",
+    "        parquet_paths = glob(os.path.join(PRICE_DATA, \"**\", \"*.parquet\"), recursive=True)\n",
+    "        print(\"Parquet files found:\", len(parquet_paths))\n",
+    "        print(\"Example parquet paths:\", parquet_paths[:5])\n",
     "\n"
   ]
  },
  {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
   "metadata": {},
+   "outputs": [],
   "source": [
-    "+ Convert the Dask data frame to a pandas data frame. \n",
-    "+ Add a new feature containing the moving average of `returns` using a window of 10 days. There are several ways to solve this task, a simple one uses `.rolling(10).mean()`.\n",
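+    "# `dd_feat` must exist before it can be converted; a minimal sketch, assuming\n",
+    "# each parquet file holds one ticker's prices with `Ticker` and `Close` columns:\n",
+    "dd_px = dd.read_parquet(parquet_paths)\n",
+    "\n",
+    "# pct_change within each partition assumes one ticker per partition/file.\n",
+    "dd_feat = dd_px.map_partitions(\n",
+    "    lambda df: df.assign(returns=df[\"Close\"].pct_change())\n",
+    ")\n",
+    "\n",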
+    "df_feat = dd_feat.compute()\n",
     "\n",
-    "(3 pt)"
+    "df_feat[\"returns_ma_10\"] = (\n",
+    "    df_feat.groupby(\"Ticker\")[\"returns\"]\n",
+    "    .rolling(10)\n",
+    "    .mean()\n",
+    "    .reset_index(level=0, drop=True)\n",
+    ")\n",
+    "\n",
+    "df_feat.head(15)\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 25,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# Write your code below.\n",
-    "\n"
+    "+ Convert the Dask data frame to a pandas data frame. \n",
+    "+ Add a new feature containing the moving average of `returns` using a window of 10 days. There are several ways to solve this task, a simple one uses `.rolling(10).mean()`.\n",
+    "\n",
+    "(3 pt)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -128,6 +203,36 @@
    "(1 pt)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Comment on moving average calculation approach\n",
+    "\n",
+    "comment = \"\"\"\n",
+    "1) Was it necessary to convert to pandas to calculate the moving average return?\n",
+    "   No, it was not necessary. Dask supports rolling-window operations through\n",
+    "   its .rolling() API (with map_partitions for per-ticker windows), so the\n",
+    "   moving average could have been computed directly on the Dask dataframe\n",
+    "   without converting to pandas.\n",
+    "\n",
+    "2) Would it have been better to do it in Dask? Why?\n",
+    "   It depends on the dataset size:\n",
+    "   - For large datasets that don't fit in memory: YES, Dask is better because\n",
+    "     it maintains parallel processing and out-of-core computation.\n",
+    "   - For smaller datasets that fit comfortably in memory: pandas is acceptable\n",
+    "     and simpler to use.\n",
+    "\n",
+    "   In this case, since we converted to pandas anyway (via .compute()), the\n",
+    "   operation was already materialized in memory, so using pandas for rolling\n",
+    "   is fine. 
However, if memory is constrained, computing the rolling mean in \n", + " Dask first (before converting to pandas) would have been more efficient.\n", + "\"\"\"\n", + "\n", + "print(comment)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -165,7 +270,7 @@ ], "metadata": { "kernelspec": { - "display_name": "env", + "display_name": "production-env", "language": "python", "name": "python3" }, @@ -179,7 +284,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.11.14" } }, "nbformat": 4, diff --git a/02_activities/assignments/assignment_2.ipynb b/02_activities/assignments/assignment_2.ipynb index 29d661c57..303a24bd2 100644 --- a/02_activities/assignments/assignment_2.ipynb +++ b/02_activities/assignments/assignment_2.ipynb @@ -97,11 +97,45 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../../05_src/data/fires/forestfires.csv'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 11\u001b[39m\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msklearn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmodel_selection\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[32m 6\u001b[39m columns = [\n\u001b[32m 7\u001b[39m \u001b[33m'\u001b[39m\u001b[33mcoord_x\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mcoord_y\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmonth\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mday\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mffmc\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mdmc\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mdc\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33misi\u001b[39m\u001b[33m'\u001b[39m,\n\u001b[32m 8\u001b[39m \u001b[33m'\u001b[39m\u001b[33mtemp\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mrh\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mwind\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mrain\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33marea\u001b[39m\u001b[33m'\u001b[39m\n\u001b[32m 9\u001b[39m ]\n\u001b[32m---> \u001b[39m\u001b[32m11\u001b[39m fires_dt = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m../../05_src/data/fires/forestfires.csv\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheader\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnames\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 13\u001b[39m X = fires_dt.drop(columns=[\u001b[33m\"\u001b[39m\u001b[33marea\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m 14\u001b[39m y = fires_dt[\u001b[33m\"\u001b[39m\u001b[33marea\u001b[39m\u001b[33m\"\u001b[39m]\n", + "\u001b[36mFile 
\u001b[39m\u001b[32mc:\\Users\\fatim\\dsi\\production\\production-env\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[39m, in \u001b[36mread_csv\u001b[39m\u001b[34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[39m\n\u001b[32m 1013\u001b[39m kwds_defaults = _refine_defaults_read(\n\u001b[32m 1014\u001b[39m dialect,\n\u001b[32m 1015\u001b[39m delimiter,\n\u001b[32m (...)\u001b[39m\u001b[32m 1022\u001b[39m dtype_backend=dtype_backend,\n\u001b[32m 1023\u001b[39m )\n\u001b[32m 1024\u001b[39m kwds.update(kwds_defaults)\n\u001b[32m-> \u001b[39m\u001b[32m1026\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\fatim\\dsi\\production\\production-env\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[39m, in \u001b[36m_read\u001b[39m\u001b[34m(filepath_or_buffer, kwds)\u001b[39m\n\u001b[32m 617\u001b[39m _validate_names(kwds.get(\u001b[33m\"\u001b[39m\u001b[33mnames\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[32m 619\u001b[39m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m620\u001b[39m parser = \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 622\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\fatim\\dsi\\production\\production-env\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[39m, in \u001b[36mTextFileReader.__init__\u001b[39m\u001b[34m(self, f, engine, **kwds)\u001b[39m\n\u001b[32m 1617\u001b[39m \u001b[38;5;28mself\u001b[39m.options[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m] = kwds[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 1619\u001b[39m \u001b[38;5;28mself\u001b[39m.handles: IOHandles | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1620\u001b[39m \u001b[38;5;28mself\u001b[39m._engine = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\fatim\\dsi\\production\\production-env\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1880\u001b[39m, in \u001b[36mTextFileReader._make_engine\u001b[39m\u001b[34m(self, f, 
engine)\u001b[39m\n\u001b[32m 1878\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[32m 1879\u001b[39m mode += \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m1880\u001b[39m \u001b[38;5;28mself\u001b[39m.handles = \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1881\u001b[39m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1882\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1883\u001b[39m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1884\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcompression\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1885\u001b[39m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmemory_map\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1886\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1887\u001b[39m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding_errors\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstrict\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1888\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstorage_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1889\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1890\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m.handles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1891\u001b[39m f = \u001b[38;5;28mself\u001b[39m.handles.handle\n", + "\u001b[36mFile 
\u001b[39m\u001b[32mc:\\Users\\fatim\\dsi\\production\\production-env\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[39m, in \u001b[36mget_handle\u001b[39m\u001b[34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[39m\n\u001b[32m 868\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[32m 869\u001b[39m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[32m 870\u001b[39m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[32m 871\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m ioargs.encoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs.mode:\n\u001b[32m 872\u001b[39m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m873\u001b[39m handle = \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m 874\u001b[39m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 875\u001b[39m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 876\u001b[39m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 877\u001b[39m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 878\u001b[39m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 879\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 880\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 881\u001b[39m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[32m 882\u001b[39m handle = \u001b[38;5;28mopen\u001b[39m(handle, ioargs.mode)\n", + "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: '../../05_src/data/fires/forestfires.csv'" + ] + } + ], "source": [ - "# Load the libraries as required." 
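+    "# This cell expects ../../05_src/data/fires/forestfires.csv; the\n",
+    "# FileNotFoundError recorded in this cell's output means the dataset\n",
+    "# had not been downloaded to 05_src/data/fires/ in this environment.\n",
+    "\n",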
+ "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "columns = [\n", + " 'coord_x', 'coord_y', 'month', 'day', 'ffmc', 'dmc', 'dc', 'isi',\n", + " 'temp', 'rh', 'wind', 'rain', 'area'\n", + "]\n", + "\n", + "fires_dt = pd.read_csv('../../05_src/data/fires/forestfires.csv', header=0, names=columns)\n", + "\n", + "X = fires_dt.drop(columns=[\"area\"])\n", + "y = fires_dt[\"area\"]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n" ] }, { @@ -180,10 +214,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "num_cols = ['coord_x','coord_y','ffmc','dmc','dc','isi','temp','rh','wind','rain']\n", + "cat_cols = ['month','day']\n", + "\n", + "# Preproc 1: scale numeric, OHE categorical\n", + "preproc1 = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", Pipeline([(\"scaler\", StandardScaler())]), num_cols),\n", + " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", + " ],\n", + " remainder=\"drop\"\n", + ")" + ] }, { "cell_type": "markdown", @@ -199,10 +249,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Preproc 2: scale + non-linear transform (Yeo-Johnson works with zeros/negatives)\n", + "preproc2 = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", Pipeline([\n", + " (\"power\", PowerTransformer(method=\"yeo-johnson\")),\n", + " (\"scaler\", StandardScaler()),\n", + " ]), num_cols),\n", + " (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n", + " ],\n", + " remainder=\"drop\"\n", + ")" + ] }, { "cell_type": "markdown", @@ -245,11 +307,21 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "illegal target for annotation (380116875.py, line 2)", + "output_type": "error", + "traceback": [ + " \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[31m \u001b[39m\u001b[31m\"A_preproc1_ridge\": Pipeline([(\"preprocessing\", preproc1), (\"regressor\", baseline)]),\u001b[39m\n ^\n\u001b[31mSyntaxError\u001b[39m\u001b[31m:\u001b[39m illegal target for annotation\n" + ] + } + ], "source": [ - "# Pipeline C = preproc1 + advanced model\n" + "# Pipeline C = preproc1 + advanced model\n", + "\"A_preproc1_ridge\": Pipeline([(\"preprocessing\", preproc1), (\"regressor\", baseline)])," ] }, { @@ -260,7 +332,7 @@ "source": [ "# Pipeline D = preproc2 + advanced model\n", "\n", - " " + " \"D_preproc2_rf\": Pipeline([(\"preprocessing\", preproc2), (\"regressor\", advanced)])," ] }, { @@ -276,10 +348,70 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'X_train' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + 
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 42\u001b[39m\n\u001b[32m 34\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m name, pipe \u001b[38;5;129;01min\u001b[39;00m pipelines.items():\n\u001b[32m 35\u001b[39m gs = GridSearchCV(\n\u001b[32m 36\u001b[39m pipe,\n\u001b[32m 37\u001b[39m param_grids[name],\n\u001b[32m (...)\u001b[39m\u001b[32m 40\u001b[39m n_jobs=-\u001b[32m1\u001b[39m\n\u001b[32m 41\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m42\u001b[39m gs.fit(\u001b[43mX_train\u001b[49m, y_train)\n\u001b[32m 43\u001b[39m results[name] = gs\n\u001b[32m 45\u001b[39m \u001b[38;5;66;03m# Compare\u001b[39;00m\n", + "\u001b[31mNameError\u001b[39m: name 'X_train' is not defined" + ] + } + ], + "source": [ + "from sklearn.linear_model import Ridge\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "# Baseline regressor\n", + "baseline = Ridge(random_state=42)\n", + "\n", + "# Advanced regressor (tree-based => fast SHAP)\n", + "advanced = RandomForestRegressor(random_state=42, n_jobs=-1)\n", + "\n", + "pipelines = {\n", + " \"A_preproc1_ridge\": Pipeline([(\"preprocessing\", preproc1), (\"regressor\", baseline)]),\n", + " \"B_preproc2_ridge\": Pipeline([(\"preprocessing\", preproc2), (\"regressor\", baseline)]),\n", + " \"C_preproc1_rf\": Pipeline([(\"preprocessing\", preproc1), (\"regressor\", advanced)]),\n", + " \"D_preproc2_rf\": Pipeline([(\"preprocessing\", preproc2), (\"regressor\", advanced)]),\n", + "}\n", + "\n", + "param_grids = {\n", + " \"A_preproc1_ridge\": {\"regressor__alpha\": [0.1, 1.0, 10.0, 100.0]},\n", + " \"B_preproc2_ridge\": {\"regressor__alpha\": [0.1, 1.0, 10.0, 100.0]},\n", + " \"C_preproc1_rf\": {\n", + " \"regressor__n_estimators\": [200, 400],\n", + " \"regressor__max_depth\": [None, 10],\n", + " }, # 4 combos\n", + " \"D_preproc2_rf\": {\n", + " \"regressor__n_estimators\": [200, 400],\n", + " \"regressor__max_depth\": [None, 10],\n", + " }, # 4 combos\n", + "}\n", + "\n", + "scoring = \"neg_root_mean_squared_error\"\n", + "results = {}\n", + "\n", + "for name, pipe in pipelines.items():\n", + " gs = GridSearchCV(\n", + " pipe,\n", + " param_grids[name],\n", + " scoring=scoring,\n", + " cv=5,\n", + " n_jobs=-1\n", + " )\n", + " gs.fit(X_train, y_train)\n", + " results[name] = gs\n", + "\n", + "# Compare\n", + "for name, gs in results.items():\n", + " print(name, \"best RMSE:\", -gs.best_score_, \"best params:\", gs.best_params_)\n" + ] }, { "cell_type": "code", @@ -318,6 +450,37 @@ "+ Which model has the best performance?" 
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "min() arg is an empty sequence",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msklearn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmetrics\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m mean_squared_error\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m best_name = \u001b[38;5;28;43mmin\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m-\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mk\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbest_score_\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# lowest RMSE\u001b[39;00m\n\u001b[32m 4\u001b[39m best_model = results[best_name].best_estimator_\n\u001b[32m 6\u001b[39m best_model.fit(X_train, y_train)\n",
+      "\u001b[31mValueError\u001b[39m: min() arg is an empty sequence"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import mean_squared_error\n",
+    "\n",
+    "# best_score_ is negative RMSE, so -best_score_ is RMSE; pick the smallest.\n",
+    "best_name = min(results, key=lambda k: -results[k].best_score_)\n",
+    "best_model = results[best_name].best_estimator_\n",
+    "\n",
+    "best_model.fit(X_train, y_train)\n",
+    "pred = best_model.predict(X_test)\n",
+    "\n",
+    "# Take the square root explicitly; `squared=False` is deprecated in newer scikit-learn.\n",
+    "rmse_test = mean_squared_error(y_test, pred) ** 0.5\n",
+    "print(\"Best model:\", best_name)\n",
+    "print(\"Test RMSE:\", rmse_test)\n"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -329,10 +492,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'best_model' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpickle\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mbest_forest_fire_model.pkl\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mwb\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m pickle.dump(\u001b[43mbest_model\u001b[49m, f)\n",
+      "\u001b[31mNameError\u001b[39m: name 'best_model' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "import pickle\n",
+    "with open(\"best_forest_fire_model.pkl\", \"wb\") as f:\n",
+    "    pickle.dump(best_model, f)\n"
+   ]
  },
  {
   "cell_type": "code",
@@ -358,10 +537,50 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'shap'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# If needed: pip/uv install shap\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# !uv pip install shap\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mshap\u001b[39;00m\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# Get feature names after preprocessing\u001b[39;00m\n\u001b[32m 7\u001b[39m pre = best_model.named_steps[\u001b[33m\"\u001b[39m\u001b[33mpreprocessing\u001b[39m\u001b[33m\"\u001b[39m]\n",
+      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'shap'"
+     ]
+    }
+   ],
+   "source": [
+    "# If needed: pip/uv install shap\n",
+    "# !uv pip install shap\n",
+    "\n",
+    "import shap\n",
+    "\n",
+    "# Get feature names after preprocessing\n",
+    "pre = best_model.named_steps[\"preprocessing\"]\n",
+    "feature_names = pre.get_feature_names_out()\n",
+    "\n",
+    "# Transform train/test for SHAP\n",
+    "X_train_trans = pre.transform(X_train)\n",
+    "X_test_trans = pre.transform(X_test)\n",
+    "\n",
+    "# OneHotEncoder may return sparse matrices; densify so the plots below work.\n",
+    "if hasattr(X_test_trans, \"toarray\"):\n",
+    "    X_test_trans = X_test_trans.toarray()\n",
+    "\n",
+    "reg = best_model.named_steps[\"regressor\"]\n",
+    "\n",
+    "explainer = shap.TreeExplainer(reg)\n",
+    "shap_values = explainer.shap_values(X_test_trans)\n",
+    "\n",
+    "# Local explanation (pick one row)\n",
+    "i = 0\n",
+    "shap.plots.waterfall(\n",
+    "    shap.Explanation(values=shap_values[i], base_values=explainer.expected_value,\n",
+    "                     data=X_test_trans[i], feature_names=feature_names)\n",
+    ")\n",
+    "\n",
+    "# Global explanation\n",
+    "shap.summary_plot(shap_values, X_test_trans, feature_names=feature_names)\n"
+   ]
  },
  {
   "cell_type": "code",
@@ -423,7 +642,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "env",
+   "display_name": "production-env",
   "language": "python",
   "name": "python3"
  },
@@ -437,7 +656,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.11.14"
  }
 },
 "nbformat": 4,