Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 124 additions & 19 deletions 02_activities/assignments/assignment_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,17 @@
"metadata": {},
"outputs": [],
"source": [
"# Write your code below.\n",
"# Load environment variables (e.g. PRICE_DATA) from the project .env file\n",
"# so later cells can resolve data paths via os.getenv().\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
"# Dask is used for out-of-core processing of the parquet price data.\n",
"import dask.dataframe as dd\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -55,14 +59,34 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(0, [])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write your code below.\n",
"# (fix: os and glob were each imported twice in this cell; a bare tuple\n",
"# expression on a non-final line was dead code — its value was discarded.)\n",
"import os\n",
"from glob import glob\n",
"\n",
"# Directory containing the price data, configured via the .env file.\n",
"PRICE_DATA = os.getenv(\"PRICE_DATA\")\n",
"\n",
"# Recursively collect all parquet files under PRICE_DATA.\n",
"parquet_paths = glob(os.path.join(PRICE_DATA, \"**\", \"*.parquet\"), recursive=True)\n",
"\n",
"# Diagnostic: list every file under PRICE_DATA; the final expression is\n",
"# the cell's displayed output (count plus a sample of paths).\n",
"all_files = glob(os.path.join(PRICE_DATA, \"**\", \"*.*\"), recursive=True)\n",
"len(all_files), all_files[:20]\n",
"\n"
]
},
Expand All @@ -88,32 +112,83 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRICE_DATA = ../../05_src/data/prices/\n",
"Exists? False\n"
]
}
],
"source": [
"# Write your code below.\n",
"# Diagnostic cell: verify that PRICE_DATA resolves to an existing directory\n",
"# and that parquet files are present under it.\n",
"# (fix: removed re-imports of os/dotenv/glob and the repeated load_dotenv()\n",
"# call — both already happen in the first cell — and the stray trailing\n",
"# `import dask.dataframe as dd`, also already imported at the top. The\n",
"# os.path.exists() result is now computed once and reused.)\n",
"PRICE_DATA = os.getenv(\"PRICE_DATA\")\n",
"print(\"PRICE_DATA =\", PRICE_DATA)\n",
"\n",
"if PRICE_DATA is None:\n",
"    print(\"❌ PRICE_DATA is None (not set).\")\n",
"else:\n",
"    data_dir_exists = os.path.exists(PRICE_DATA)\n",
"    print(\"Exists?\", data_dir_exists)\n",
"    if data_dir_exists:\n",
"        # Show what’s inside the folder\n",
"        print(\"Top-level items:\", os.listdir(PRICE_DATA)[:20])\n",
"\n",
"        # Look for parquet files anywhere under PRICE_DATA\n",
"        parquet_paths = glob(os.path.join(PRICE_DATA, \"**\", \"*.parquet\"), recursive=True)\n",
"        print(\"Parquet files found:\", len(parquet_paths))\n",
"        print(\"Example parquet paths:\", parquet_paths[:5])\n"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'dd_feat' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m df_feat = \u001b[43mdd_feat\u001b[49m.compute()\n\u001b[32m 3\u001b[39m df_feat[\u001b[33m\"\u001b[39m\u001b[33mreturns_ma_10\u001b[39m\u001b[33m\"\u001b[39m] = (\n\u001b[32m 4\u001b[39m df_feat.groupby(\u001b[33m\"\u001b[39m\u001b[33mTicker\u001b[39m\u001b[33m\"\u001b[39m)[\u001b[33m\"\u001b[39m\u001b[33mreturns\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 5\u001b[39m .rolling(\u001b[32m10\u001b[39m)\n\u001b[32m 6\u001b[39m .mean()\n\u001b[32m 7\u001b[39m .reset_index(level=\u001b[32m0\u001b[39m, drop=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 8\u001b[39m )\n\u001b[32m 10\u001b[39m df_feat.head(\u001b[32m15\u001b[39m)\n",
"\u001b[31mNameError\u001b[39m: name 'dd_feat' is not defined"
]
}
],
"source": [
"# NOTE(review): the saved output of this cell is a NameError on `dd_feat`,\n",
"# meaning it was executed on a kernel where the earlier feature-engineering\n",
"# cell (presumably hidden in the collapsed part of this diff) had not run.\n",
"# Confirm the notebook survives Restart Kernel -> Run All before submitting.\n",
"# Materialize the Dask frame into an in-memory pandas DataFrame.\n",
"df_feat = dd_feat.compute()\n",
"\n",
"# 10-day moving average of returns, computed per ticker. rolling() on the\n",
"# groupby returns a (Ticker, row) MultiIndex; reset_index(level=0, drop=True)\n",
"# drops the Ticker level so the result aligns back to df_feat's row index.\n",
"df_feat[\"returns_ma_10\"] = (\n",
"    df_feat.groupby(\"Ticker\")[\"returns\"]\n",
"    .rolling(10)\n",
"    .mean()\n",
"    .reset_index(level=0, drop=True)\n",
")\n",
"\n",
"df_feat.head(15)\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"# Write your code below.\n",
"\n"
"+ Convert the Dask data frame to a pandas data frame. \n",
"+ Add a new feature containing the moving average of `returns` using a window of 10 days. There are several ways to solve this task, a simple one uses `.rolling(10).mean()`.\n",
"\n",
"(3 pt)"
]
},
{
Expand All @@ -128,6 +203,36 @@
"(1 pt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Comment on moving average calculation approach\n",
"# Written-answer cell: the response is stored in a string and printed so it\n",
"# shows up in the cell output. (A markdown cell would also work, but this\n",
"# keeps the answer alongside executable cells.)\n",
"\n",
"comment = \"\"\"\n",
"1) Was it necessary to convert to pandas to calculate the moving average return?\n",
"   No, it was not necessary. Dask supports rolling window operations through \n",
"   dask.dataframe.rolling(), so the moving average could have been computed \n",
"   directly on the Dask dataframe without converting to pandas.\n",
"\n",
"2) Would it have been better to do it in Dask? Why?\n",
"   It depends on the dataset size:\n",
"   - For large datasets that don't fit in memory: YES, Dask is better because \n",
"     it maintains parallel processing and out-of-core computation.\n",
"   - For smaller datasets that fit comfortably in memory: Pandas is acceptable \n",
"     and simpler to use.\n",
"   \n",
"   In this case, since we converted to pandas anyway (via .compute()), the \n",
"   operation was already materialized in memory, so using pandas for rolling \n",
"   is fine. However, if memory is constrained, computing the rolling mean in \n",
"   Dask first (before converting to pandas) would have been more efficient.\n",
"\"\"\"\n",
"\n",
"print(comment)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -165,7 +270,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "env",
"display_name": "production-env",
"language": "python",
"name": "python3"
},
Expand All @@ -179,7 +284,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.11.14"
}
},
"nbformat": 4,
Expand Down
Loading