Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 127 additions & 14 deletions 02_activities/assignments/assignment_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,27 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The dotenv extension is already loaded. To reload it, use:\n",
" %reload_ext dotenv\n"
]
}
],
"source": [
"# Write your code below.\n",
"\n"
"%load_ext dotenv\n",
"%dotenv\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -55,14 +65,37 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 20,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRICE_DATA path: ../../05_src/data/prices/\n",
"Number of parquet files found: 2858\n",
"First few parquet files: ['../../05_src/data/prices\\\\ACES\\\\ACES_2016\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2018\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2018\\\\part.1.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2019\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2019\\\\part.1.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2020\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2020\\\\part.1.parquet', '../../05_src/data/prices\\\\ACN\\\\ACN_2001\\\\part.0.parquet', '../../05_src/data/prices\\\\ACN\\\\ACN_2001\\\\part.1.parquet', '../../05_src/data/prices\\\\ACN\\\\ACN_2002\\\\part.0.parquet']\n"
]
}
],
"source": [
"import os\n",
"from glob import glob\n",
"\n",
"# Write your code below.\n",
"\n",
"# Load the environment variable PRICE_DATA\n",
"price_data_dir = os.getenv(\"PRICE_DATA\")\n",
"print(\"PRICE_DATA path:\", price_data_dir)\n",
"\n",
"# Use glob to find the path to all parquet files in the PRICE_DATA directory\n",
"parquet_files = glob(os.path.join(price_data_dir, \"**/*.parquet\"), recursive=True)\n",
"\n",
"# Number of parquet files found\n",
"print(f\"Number of parquet files found: {len(parquet_files)}\")\n",
"\n",
"# First few parquet files\n",
"print(\"First few parquet files:\", parquet_files[:10])\n",
"\n"
]
},
Expand All @@ -88,11 +121,45 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 21,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\alanm\\AppData\\Local\\Temp\\ipykernel_8760\\2477076351.py:7: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
" Before: .apply(func)\n",
" After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
" or: .apply(func, meta=('x', 'f8')) for series result\n",
" ddf['Close_lag1'] = ddf.groupby('ticker')['Close'].shift(1)\n",
"C:\\Users\\alanm\\AppData\\Local\\Temp\\ipykernel_8760\\2477076351.py:8: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
" Before: .apply(func)\n",
" After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
" or: .apply(func, meta=('x', 'f8')) for series result\n",
" ddf['Adj_Close_lag1'] = ddf.groupby('ticker')['Adj Close'].shift(1)\n"
]
}
],
"source": [
"# Write your code below.\n",
"\n",
"# Load all the parquet files into a single Dask DataFrame\n",
"ddf = dd.read_parquet(parquet_files)\n",
"\n",
"# Add lags for variable Close and Adj_Close\n",
"ddf['Close_lag1'] = ddf.groupby('ticker')['Close'].shift(1)\n",
"ddf['Adj_Close_lag1'] = ddf.groupby('ticker')['Adj Close'].shift(1)\n",
"\n",
"# Add returns based on Close\n",
"ddf['returns'] = (ddf['Close'] / ddf['Close_lag1']) - 1\n",
"\n",
"# Add hi_lo_range\n",
"ddf['hi_lo_range'] = ddf['High'] - ddf['Low']\n",
"\n",
"# Assign the results to dd_feat\n",
"dd_feat = ddf\n",
"\n",
"\n"
]
},
Expand All @@ -108,11 +175,30 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\alanm\\AppData\\Local\\Temp\\ipykernel_8760\\857855713.py:7: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
" Before: .apply(func)\n",
" After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
" or: .apply(func, meta=('x', 'f8')) for series result\n",
" df_feat['returns_mavg_10'] = df_feat.groupby('ticker')['returns'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())\n"
]
}
],
"source": [
"# Write your code below.\n",
"\n",
"# Convert the Dask DataFrame to a Pandas DataFrame\n",
"df_feat = dd_feat.compute()\n",
"\n",
"# Add a new feature: 10-day moving average of returns per ticker\n",
"df_feat['returns_mavg_10'] = df_feat.groupby('ticker')['returns'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())\n",
"\n",
"\n"
]
},
Expand All @@ -122,8 +208,35 @@
"source": [
"Please comment:\n",
"\n",
"+ Was it necessary to convert to pandas to calculate the moving average return?\n",
"+ Would it have been better to do it in Dask? Why?\n",
"+ Was it necessary to convert to pandas to calculate the moving average return?\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"No, but converting to pandas made it much simpler to apply a groupwise rolling window.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"+ Would it have been better to do it in Dask? Why?\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Yes, for very large datasets, Dask can compute in parallel and avoid memory issues, making it more efficient than pandas.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"(1 pt)"
]
Expand Down Expand Up @@ -165,7 +278,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "env",
"display_name": "dsi_participant",
"language": "python",
"name": "python3"
},
Expand All @@ -179,7 +292,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.9.19"
}
},
"nbformat": 4,
Expand Down