diff --git a/01_materials/labs/update_path.py b/01_materials/labs/update_path.py index f4e301e5f..c141f21ca 100644 --- a/01_materials/labs/update_path.py +++ b/01_materials/labs/update_path.py @@ -5,4 +5,6 @@ src_path = (notebook_dir / "../../05_src").resolve() if str(src_path) not in sys.path: - sys.path.insert(0, str(src_path)) # insert(0) gives it priority \ No newline at end of file + sys.path.insert(0, str(src_path)) # insert(0) gives it priority + + \ No newline at end of file diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 45cfc9cd7..9b8800b31 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -28,15 +28,26 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dotenv extension is already loaded. To reload it, use:\n", + " %reload_ext dotenv\n" + ] + } + ], "source": [ "# Write your code below.\n", + "%load_ext dotenv\n", + "%dotenv\n", "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -55,14 +66,29 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 2045 price files.\n" + ] + } + ], "source": [ "import os\n", "from glob import glob\n", + "import logging\n", "\n", - "# Write your code below.\n", + "\n", + "_logs = logging.getLogger(__name__)\n", + "if not logging.getLogger().handlers:\n", + "\tlogging.basicConfig(level=logging.INFO)\n", + "\n", + "price_files = glob(os.path.join(os.getenv('PRICE_DATA'), \"**/**\", \"*.parquet\"))\n", + "_logs.info(f'Found {len(price_files)} price files.')\n", "\n" ] }, @@ -88,12 +114,202 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Admin\\AppData\\Local\\Temp\\ipykernel_25160\\3543449060.py:9: UserWarning: `meta` is not specified, inferred from partial data.\n", + "Please provide `meta` if the result is unexpected.\n", + " Before: .shift(func)\n", + " After: .shift(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n", + " or: .shift(func, meta=('x', 'f8')) for series result\n", + "\n", + " dd_feat['Close_lag_1'] = dd_feat.groupby('ticker')['Close'].shift(1)\n", + "C:\\Users\\Admin\\AppData\\Local\\Temp\\ipykernel_25160\\3543449060.py:10: UserWarning: `meta` is not specified, inferred from partial data.\n", + "Please provide `meta` if the result is unexpected.\n", + " Before: .shift(func)\n", + " After: .shift(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n", + " or: .shift(func, meta=('x', 'f8')) for series result\n", + "\n", + " dd_feat['Adj_Close_lag_1'] = dd_feat.groupby('ticker')['Adj_Close'].shift(1)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateOpenHighLowCloseAdj_CloseVolumesourcetickerYearClose_lag_1Adj_Close_lag_1returnshi_lo_range
1758642012-01-1988.19999788.88999987.61000188.83000288.830002894300.0LH.csvLH201288.47000188.4700010.0040691.279999
760092014-02-2653.77999954.04999953.47000153.84999847.8561522025700.0ALL.csvALL201453.65000247.6784250.0037280.579998
1721521997-04-217.5000007.8125007.5000007.5000007.5000005200.0LH.csvLH19977.8125007.812500-0.0400000.312500
1715031994-09-2631.87500033.43750031.25000033.12500033.125000147600.0LH.csvLH199432.18750032.1875000.0291262.187500
1711661993-05-2645.00000045.62500044.68750045.62500044.662472341200.0LH.csvLH199345.00000044.0506550.0138890.937500
\n", + "
" + ], + "text/plain": [ + " Date Open High Low Close Adj_Close \\\n", + "175864 2012-01-19 88.199997 88.889999 87.610001 88.830002 88.830002 \n", + "76009 2014-02-26 53.779999 54.049999 53.470001 53.849998 47.856152 \n", + "172152 1997-04-21 7.500000 7.812500 7.500000 7.500000 7.500000 \n", + "171503 1994-09-26 31.875000 33.437500 31.250000 33.125000 33.125000 \n", + "171166 1993-05-26 45.000000 45.625000 44.687500 45.625000 44.662472 \n", + "\n", + " Volume source ticker Year Close_lag_1 Adj_Close_lag_1 \\\n", + "175864 894300.0 LH.csv LH 2012 88.470001 88.470001 \n", + "76009 2025700.0 ALL.csv ALL 2014 53.650002 47.678425 \n", + "172152 5200.0 LH.csv LH 1997 7.812500 7.812500 \n", + "171503 147600.0 LH.csv LH 1994 32.187500 32.187500 \n", + "171166 341200.0 LH.csv LH 1993 45.000000 44.050655 \n", + "\n", + " returns hi_lo_range \n", + "175864 0.004069 1.279999 \n", + "76009 0.003728 0.579998 \n", + "172152 -0.040000 0.312500 \n", + "171503 0.029126 2.187500 \n", + "171166 0.013889 0.937500 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Write your code below.\n", - "\n" + "# Load data into Dask dataframe\n", + "dd_feat = dd.read_parquet(price_files)\n", + "\n", + "# Normalize column names (replace spaces with underscores)\n", + "dd_feat = dd_feat.rename(columns=lambda x: x.replace(' ', '_'))\n", + "\n", + "# Add lag features for Close and Adj_Close\n", + "dd_feat['Close_lag_1'] = dd_feat.groupby('ticker')['Close'].shift(1)\n", + "dd_feat['Adj_Close_lag_1'] = dd_feat.groupby('ticker')['Adj_Close'].shift(1)\n", + "\n", + "# Add returns based on Close\n", + "dd_feat['returns'] = (dd_feat['Close'] / dd_feat['Close_lag_1']) - 1\n", + "\n", + "# Add hi_lo_range (High minus Low)\n", + "dd_feat['hi_lo_range'] = dd_feat['High'] - dd_feat['Low']\n", + "dd_feat.head()\n" ] }, { @@ -108,12 +324,19 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Write your code below.\n", - "\n" + "\n", + "# Convert Dask dataframe to pandas\n", + "df = dd_feat.compute()\n", + "\n", + "# Add moving average of returns with 10-day window\n", + "df['returns_ma_10'] = df.groupby('ticker')['returns'].transform(\n", + " lambda x: x.rolling(10).mean()\n", + ")\n" ] }, { @@ -165,7 +388,7 @@ ], "metadata": { "kernelspec": { - "display_name": "env", + "display_name": "production-env", "language": "python", "name": "python3" }, @@ -179,7 +402,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.11.14" } }, "nbformat": 4, diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/utils/logger.py b/utils/logger.py new file mode 100644 index 000000000..e69de29bb