Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 124 additions & 19 deletions 02_activities/assignments/assignment_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,17 @@
"metadata": {},
"outputs": [],
"source": [
"# Write your code below.\n",
"# Load environment variables (e.g. PRICE_DATA) from the project .env file\n",
"# so later cells can resolve data paths via os.getenv().\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
"# Dask is used for out-of-core processing of the parquet price data.\n",
"import dask.dataframe as dd\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -55,14 +59,34 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(0, [])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write your code below.\n",
"# (fix: os and glob were each imported twice in this cell; a bare tuple\n",
"# expression on a non-final line was dead code — its value was discarded.)\n",
"import os\n",
"from glob import glob\n",
"\n",
"# Directory containing the price data, configured via the .env file.\n",
"PRICE_DATA = os.getenv(\"PRICE_DATA\")\n",
"\n",
"# Recursively collect all parquet files under PRICE_DATA.\n",
"parquet_paths = glob(os.path.join(PRICE_DATA, \"**\", \"*.parquet\"), recursive=True)\n",
"\n",
"# Diagnostic: list every file under PRICE_DATA; the final expression is\n",
"# the cell's displayed output (count plus a sample of paths).\n",
"all_files = glob(os.path.join(PRICE_DATA, \"**\", \"*.*\"), recursive=True)\n",
"len(all_files), all_files[:20]\n",
"\n"
]
},
Expand All @@ -88,32 +112,83 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRICE_DATA = ../../05_src/data/prices/\n",
"Exists? False\n"
]
}
],
"source": [
"# Write your code below.\n",
"# Diagnostic cell: verify that PRICE_DATA resolves to an existing directory\n",
"# and that parquet files are present under it.\n",
"# (fix: removed re-imports of os/dotenv/glob and the repeated load_dotenv()\n",
"# call — both already happen in the first cell — and the stray trailing\n",
"# `import dask.dataframe as dd`, also already imported at the top. The\n",
"# os.path.exists() result is now computed once and reused.)\n",
"PRICE_DATA = os.getenv(\"PRICE_DATA\")\n",
"print(\"PRICE_DATA =\", PRICE_DATA)\n",
"\n",
"if PRICE_DATA is None:\n",
"    print(\"❌ PRICE_DATA is None (not set).\")\n",
"else:\n",
"    data_dir_exists = os.path.exists(PRICE_DATA)\n",
"    print(\"Exists?\", data_dir_exists)\n",
"    if data_dir_exists:\n",
"        # Show what’s inside the folder\n",
"        print(\"Top-level items:\", os.listdir(PRICE_DATA)[:20])\n",
"\n",
"        # Look for parquet files anywhere under PRICE_DATA\n",
"        parquet_paths = glob(os.path.join(PRICE_DATA, \"**\", \"*.parquet\"), recursive=True)\n",
"        print(\"Parquet files found:\", len(parquet_paths))\n",
"        print(\"Example parquet paths:\", parquet_paths[:5])\n"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'dd_feat' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m df_feat = \u001b[43mdd_feat\u001b[49m.compute()\n\u001b[32m 3\u001b[39m df_feat[\u001b[33m\"\u001b[39m\u001b[33mreturns_ma_10\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m 4\u001b[39m df_feat.groupby(\u001b[33m\"\u001b[39m\u001b[33mTicker\u001b[39m\u001b[33m\"\u001b[39m)[\u001b[33m\"\u001b[39m\u001b[33mreturns\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 5\u001b[39m .rolling(\u001b[32m10\u001b[39m)\n\u001b[32m 6\u001b[39m .mean()\n\u001b[32m 7\u001b[39m .reset_index(level=\u001b[32m0\u001b[39m, drop=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 8\u001b[39m )\n\u001b[32m 10\u001b[39m df_feat.head(\u001b[32m15\u001b[39m)\n",
"\u001b[31mNameError\u001b[39m: name 'dd_feat' is not defined"
]
}
],
"source": [
"# NOTE(review): the saved output of this cell is a NameError on `dd_feat`,\n",
"# meaning it was executed on a kernel where the earlier feature-engineering\n",
"# cell (presumably hidden in the collapsed part of this diff) had not run.\n",
"# Confirm the notebook survives Restart Kernel -> Run All before submitting.\n",
"# Materialize the Dask frame into an in-memory pandas DataFrame.\n",
"df_feat = dd_feat.compute()\n",
"\n",
"# 10-day moving average of returns, computed per ticker. rolling() on the\n",
"# groupby returns a (Ticker, row) MultiIndex; reset_index(level=0, drop=True)\n",
"# drops the Ticker level so the result aligns back to df_feat's row index.\n",
"df_feat[\"returns_ma_10\"] = (\n",
"    df_feat.groupby(\"Ticker\")[\"returns\"]\n",
"    .rolling(10)\n",
"    .mean()\n",
"    .reset_index(level=0, drop=True)\n",
")\n",
"\n",
"df_feat.head(15)\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"# Write your code below.\n",
"\n"
"+ Convert the Dask data frame to a pandas data frame. \n",
"+ Add a new feature containing the moving average of `returns` using a window of 10 days. There are several ways to solve this task, a simple one uses `.rolling(10).mean()`.\n",
"\n",
"(3 pt)"
]
},
{
Expand All @@ -128,6 +203,36 @@
"(1 pt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Comment on moving average calculation approach\n",
"# Written-answer cell: the response is stored in a string and printed so it\n",
"# shows up in the cell output. (A markdown cell would also work, but this\n",
"# keeps the answer alongside executable cells.)\n",
"\n",
"comment = \"\"\"\n",
"1) Was it necessary to convert to pandas to calculate the moving average return?\n",
"   No, it was not necessary. Dask supports rolling window operations through \n",
"   dask.dataframe.rolling(), so the moving average could have been computed \n",
"   directly on the Dask dataframe without converting to pandas.\n",
"\n",
"2) Would it have been better to do it in Dask? Why?\n",
"   It depends on the dataset size:\n",
"   - For large datasets that don't fit in memory: YES, Dask is better because \n",
"     it maintains parallel processing and out-of-core computation.\n",
"   - For smaller datasets that fit comfortably in memory: Pandas is acceptable \n",
"     and simpler to use.\n",
"   \n",
"   In this case, since we converted to pandas anyway (via .compute()), the \n",
"   operation was already materialized in memory, so using pandas for rolling \n",
"   is fine. However, if memory is constrained, computing the rolling mean in \n",
"   Dask first (before converting to pandas) would have been more efficient.\n",
"\"\"\"\n",
"\n",
"print(comment)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -165,7 +270,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "env",
"display_name": "production-env",
"language": "python",
"name": "python3"
},
Expand All @@ -179,7 +284,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.11.14"
}
},
"nbformat": 4,
Expand Down
Loading