diff --git a/EDA.ipynb b/EDA.ipynb new file mode 100644 index 0000000..d9a9764 --- /dev/null +++ b/EDA.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "8c27d092", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "594e37d2", + "metadata": {}, + "outputs": [], + "source": [ + "df_train = pd.read_csv('data/returns_train.csv', index_col='month_end')\n", + "df_train.sort_index()\n", + "df_test = pd.read_csv('data/returns_test.csv', index_col='month_end')\n", + "df_test.sort_index();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab9a7f05", + "metadata": {}, + "outputs": [], + "source": [ + "df_all = pd.concat([df_train, df_test])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23429d59", + "metadata": {}, + "outputs": [], + "source": [ + "df_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f59d07e2", + "metadata": {}, + "outputs": [], + "source": [ + "df_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f45c8e11", + "metadata": {}, + "outputs": [], + "source": [ + "stocks = df_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c19773d", + "metadata": {}, + "outputs": [], + "source": [ + "(1 + df_train.Stock1).cumprod().plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e173c714", + "metadata": {}, + "outputs": [], + "source": [ + "len(df_all.columns)\n", + "1/54" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adeead11", + "metadata": {}, + "outputs": [], + "source": [ + "for stock in stocks:\n", + " ax = (1 + df_all[stock]).cumprod().plot()\n", + "ax.axvline(len(df_train), c='r')" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "876417ed", + "metadata": {}, + "outputs": [], + "source": [ + "(1 + df_all.Stock66).cumprod().plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2604ffcd", + "metadata": {}, + "outputs": [], + "source": [ + "(1 + df_all.Stock54).cumprod().plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5291f5", + "metadata": {}, + "outputs": [], + "source": [ + "df_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1e14611", + "metadata": {}, + "outputs": [], + "source": [ + "(1 + df_all).cumprod().loc[['2017-09-30']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a079265d", + "metadata": {}, + "outputs": [], + "source": [ + "df_all.Stock54.iloc[-13:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "602b26ea", + "metadata": {}, + "outputs": [], + "source": [ + "from pypfopt.expected_returns import mean_historical_return\n", + "from pypfopt.risk_models import CovarianceShrinkage\n", + "df_train_cum = (df_train + 1).cumprod()\n", + "mu = mean_historical_return(df_train_cum)\n", + "S = CovarianceShrinkage(df_train_cum).ledoit_wolf()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "893784c5", + "metadata": {}, + "outputs": [], + "source": [ + "df_train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22396233", + "metadata": {}, + "outputs": [], + "source": [ + "from pypfopt.efficient_frontier import EfficientFrontier\n", + "ef = EfficientFrontier(mu, S, weight_bounds=(0,0.1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2ba7e90", + "metadata": {}, + "outputs": [], + "source": [ + "weights = ef.max_sharpe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9524682f", + "metadata": {}, + "outputs": [], + "source": [ + "list(weights.keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc685705", + "metadata": 
{}, + "outputs": [], + "source": [ + "def dict_to_df(dic):\n", + " new_dic = {}\n", + " for key in dic:\n", + " new_dic[key] = [dic[key]]\n", + "\n", + " return pd.DataFrame(new_dic)\n", + "dict_to_df(weights)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4a2cf47", + "metadata": {}, + "outputs": [], + "source": [ + "(1.5)**(1/(4*12))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d147a818", + "metadata": {}, + "outputs": [], + "source": [ + "(1.07)**(1/12) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2e99242", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import normaltest\n", + "\n", + "for i in range(len(df_all)):\n", + " print(normaltest(df_all[stocks[i]])[1])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f64a8ba", + "metadata": {}, + "outputs": [], + "source": [ + "df_all[stocks[7]].hist(bins=30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc03e60d", + "metadata": {}, + "outputs": [], + "source": [ + "len(df_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e13378b", + "metadata": {}, + "outputs": [], + "source": [ + "scatter_data = []\n", + "for stock in stocks:\n", + " series = df_all[stock]\n", + " for i in range(len(series) - 1):\n", + " scatter_data.append([series[i], series[i + 1]])\n", + "x = [i[0] for i in scatter_data]\n", + "y = [i[1] for i in scatter_data]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "636e23ca", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import linregress\n", + "linregress(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe3791a9", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import pearsonr\n", + "pearsonr(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "874962a9", + "metadata": {}, + "outputs": [], + 
"source": [ + "len({'a': 1, 'b': 5})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fd8af38", + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter([i[0] for i in scatter_data], [i[1] for i in scatter_data])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78714014", + "metadata": {}, + "outputs": [], + "source": [ + "from pypfopt.risk_models import CovarianceShrinkage, semicovariance, risk_matrix\n", + "risk_df = risk_matrix((1 + df_train).cumprod(), method='sample_cov')\n", + "\n", + "for i in range(len(risk_df)):\n", + " for j in range(len(risk_df)):\n", + " risk_df.iloc[i, j] /= (risk_df.iloc[i, i]*risk_df.iloc[j, j])**0.5\n", + " \n", + "risk_df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Final_notebook.ipynb b/Final_notebook.ipynb new file mode 100644 index 0000000..706f2d7 --- /dev/null +++ b/Final_notebook.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ec2209b7", + "metadata": {}, + "outputs": [], + "source": [ + "5# %%\n", + "import numpy as np\n", + "import pandas as pd\n", + "import datetime\n", + "import plotly.express as px\n", + "from pypfopt.efficient_frontier import EfficientCVaR\n", + "from pypfopt.risk_models import risk_matrix\n", + "from pypfopt.expected_returns import mean_historical_return\n", + "\n", + "\n", + "\n", + "\n", + "print('---Python script Start---', str(datetime.datetime.now()))\n", + "\n", + "# %%\n", + "\n", + "# data reads\n", + "df_returns_train = pd.read_csv('data/returns_train.csv')\n", + "df_returns_test = 
pd.read_csv('data/returns_test.csv')\n", + "df_returns_train['month_end'] = pd.to_datetime(arg=df_returns_train['month_end']).apply(lambda d: d.date())\n", + "df_returns_test['month_end'] = pd.to_datetime(arg=df_returns_test['month_end']).apply(lambda d: d.date())\n", + "\n", + "# %%\n", + "\n", + "def equalise_weights(df: pd.DataFrame):\n", + "\n", + " '''\n", + " Function to generate the equal weights, i.e. 1/p for each active stock within a month\n", + "\n", + " Args:\n", + " df: A return data frame. First column is month end and remaining columns are stocks\n", + "\n", + " Returns:\n", + " A dataframe of the same dimension but with values 1/p on active funds within a month\n", + "\n", + " '''\n", + "\n", + " # create df to house weights\n", + " n_length = len(df)\n", + " df_returns = df\n", + " df_weights = df_returns[:n_length].copy()\n", + " df_weights.set_index('month_end', inplace=True)\n", + "\n", + " # list of stock names\n", + " list_stocks = list(df_returns.columns)\n", + " list_stocks.remove('month_end')\n", + "\n", + " # assign 1/p\n", + " df_weights[list_stocks] = 1/len(list_stocks)\n", + "\n", + " return df_weights\n", + "\n", + "\n", + "# %%\n", + "\n", + "def generate_portfolio(df_train: pd.DataFrame, df_test: pd.DataFrame):\n", + "\n", + " '''\n", + " Function to generate stocks weight allocation for time t+1 using historic data. Initial weights generated as 1/p for active stock within a month\n", + "\n", + " Args:\n", + " df_train: The training set of returns. First column is month end and remaining columns are stocks\n", + " df_test: The testing set of returns. 
First column is month end and remaining columns are stocks\n", + "\n", + " Returns:\n", + " The returns dataframe and the weights\n", + " '''\n", + "\n", + " print('---> training set spans', df_train['month_end'].min(), df_train['month_end'].max())\n", + " print('---> training set spans', df_test['month_end'].min(), df_test['month_end'].max())\n", + "\n", + " # initialise data\n", + " n_train = len(df_train)\n", + " df_returns = pd.concat(objs=[df_train, df_test], ignore_index=True)\n", + "\n", + " df_weights = equalise_weights(df_returns[:n_train]) # df to store weights and create initial\n", + "\n", + " # list of stock names\n", + " list_stocks = list(df_returns.columns)\n", + " list_stocks.remove('month_end')\n", + "\n", + " # <<--------------------- YOUR CODE GOES BELOW THIS LINE --------------------->>\n", + "\n", + " # This is your playground. Delete/modify any of the code here and replace with \n", + " # your methodology. Below we provide a simple, naive estimation to illustrate \n", + " # how we think you should go about structuring your submission and your comments:\n", + "\n", + " # We use PyPortfolioOpt's EfficientCVaR optimiser, with per-stock weight bounds \n", + " # and a target return, to generate portfolio weights.\n", + " # Use the latest available data at that point in time\n", + " \n", + " # It's simpler to write our own than figure out what's going\n", + " # wrong with pandas's inbuilt dict_to_df\n", + " def dict_to_df(dic):\n", + " new_dic = {}\n", + " \n", + " for key in dic:\n", + " new_dic[key] = [dic[key]]\n", + "\n", + " return pd.DataFrame(new_dic)\n", + " \n", + " for i in range(len(df_test)):\n", + "\n", + " # latest data at this point\n", + " df_latest = df_returns[(df_returns['month_end'] < df_test.loc[i, 'month_end'])]\n", + " \n", + " df_window = df_latest.set_index('month_end').iloc[-5*12:] # We only use the last 5 years in our analysis\n", + " \n", + " df_cum = (1 + df_window).cumprod() # cumulative returns\n", + " 
\n", + " # constants\n", + " TARGET_RETURN = 0.006\n", + " LOWER_BOUND = 0 # No diversification. diversification bad!\n", + " UPPER_BOUND = 0.1 # Forced diversification. Me angry!\n", + " \n", + " mu = mean_historical_return(df_cum)\n", + " S = risk_matrix(df_cum, method='sample_cov') # Simple sample covariance works best\n", + " \n", + " ef = EfficientCVaR(mu, df_cum, weight_bounds=(LOWER_BOUND, UPPER_BOUND))\n", + " \n", + " weights = ef.efficient_return(TARGET_RETURN)\n", + " df_w = dict_to_df(weights)\n", + "\n", + " # add to all weights\n", + " df_weights = pd.concat(objs=[df_weights, df_w], ignore_index=True)\n", + " \n", + " # <<--------------------- YOUR CODE GOES ABOVE THIS LINE --------------------->>\n", + " \n", + " # 10% limit check\n", + " if len(np.array(df_weights[list_stocks])[np.array(df_weights[list_stocks]) > 0.101]):\n", + "\n", + " raise Exception(r'---> 10% limit exceeded')\n", + "\n", + " return df_returns, df_weights\n", + "\n", + "\n", + "# %%\n", + "\n", + "\n", + "def plot_total_return(df_returns: pd.DataFrame, df_weights_index: pd.DataFrame, df_weights_portfolio: pd.DataFrame):\n", + "\n", + " '''\n", + " Function to generate the two total return indices.\n", + "\n", + " Args:\n", + " df_returns: Ascending date ordered combined training and test returns data.\n", + " df_weights_index: Index weights. Equally weighted\n", + " df_weights_portfolio: Portfolio weights. Your portfolio should use equally weighted for the training date range. 
If blank will be ignored\n", + "\n", + " Returns:\n", + " A plot of the two total return indices and the total return indices as a dataframe\n", + " '''\n", + "\n", + " # list of stock names\n", + " list_stocks = list(df_returns.columns)\n", + " list_stocks.remove('month_end')\n", + "\n", + " # replace nans with 0 in return array\n", + " ar_returns = np.array(df_returns[list_stocks])\n", + " np.nan_to_num(x=ar_returns, copy=False, nan=0)\n", + "\n", + " # calc index\n", + " ar_rtn_index = np.array(df_weights_index[list_stocks])*ar_returns\n", + " ar_rtn_port = np.array(df_weights_portfolio[list_stocks])*ar_returns\n", + "\n", + " v_rtn_index = np.sum(ar_rtn_index, axis=1)\n", + " v_rtn_port = np.sum(ar_rtn_port, axis=1)\n", + "\n", + " # add return series to dataframe\n", + " df_rtn = pd.DataFrame(data=df_returns['month_end'], columns=['month_end'])\n", + " df_rtn['index'] = v_rtn_index\n", + " df_rtn['portfolio'] = v_rtn_port\n", + " df_rtn\n", + "\n", + " # create total return\n", + " base_price = 100\n", + " df_rtn.sort_values(by = 'month_end', inplace = True)\n", + " df_rtn['index_tr'] = ((1 + df_rtn['index']).cumprod()) * base_price\n", + " df_rtn['portfolio_tr'] = ((1 + df_rtn['portfolio']).cumprod()) * base_price\n", + " df_rtn\n", + "\n", + " df_rtn_long = df_rtn[['month_end', 'index_tr', 'portfolio_tr']].melt(id_vars='month_end', var_name='series', value_name='Total Return')\n", + "\n", + " # plot\n", + " fig1 = px.line(data_frame=df_rtn_long, x='month_end', y='Total Return', color='series')\n", + "\n", + " return fig1, df_rtn\n", + "\n", + "# %%\n", + "\n", + "# running solution\n", + "df_returns = pd.concat(objs=[df_returns_train, df_returns_test], ignore_index=True)\n", + "df_weights_index = equalise_weights(df_returns)\n", + "df_returns, df_weights_portfolio = generate_portfolio(df_returns_train, df_returns_test)\n", + "fig1, df_rtn = plot_total_return(df_returns, df_weights_index=df_weights_index, df_weights_portfolio=df_weights_portfolio)\n", + 
"fig1\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Write up.pdf b/Write up.pdf new file mode 100644 index 0000000..07c9566 Binary files /dev/null and b/Write up.pdf differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..852e649 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +pyportfolioopt==1.5.5 + diff --git a/solution_skeleton.py b/solution_skeleton.py index 7fc2fa5..a791924 100644 --- a/solution_skeleton.py +++ b/solution_skeleton.py @@ -4,6 +4,13 @@ import datetime import plotly.express as px +# Use the pypfopt library +from pypfopt.efficient_frontier import EfficientCVaR +from pypfopt.risk_models import risk_matrix +from pypfopt.expected_returns import mean_historical_return + + + print('---Python script Start---', str(datetime.datetime.now())) @@ -84,22 +91,40 @@ def generate_portfolio(df_train: pd.DataFrame, df_test: pd.DataFrame): # strategy to generate portfolio weights. 
# Use the latest available data at that point in time + # It's simpler to write our own than figure out what's going + # wrong with pandas's inbuilt dict_to_df + def dict_to_df(dic): + new_dic = {} + + for key in dic: + new_dic[key] = [dic[key]] + + return pd.DataFrame(new_dic) + for i in range(len(df_test)): # latest data at this point df_latest = df_returns[(df_returns['month_end'] < df_test.loc[i, 'month_end'])] - # vol calc - df_w = pd.DataFrame() - df_w['vol'] = df_latest.std(numeric_only=True) # calculate stock volatility - df_w['inv_vol'] = 1/df_w['vol'] # calculate the inverse volatility - df_w['tot_inv_vol'] = df_w['inv_vol'].sum() # calculate the total inverse volatility - df_w['weight'] = df_w['inv_vol']/df_w['tot_inv_vol'] # calculate weight based on inverse volatility - df_w.reset_index(inplace=True, names='name') + df_window = df_latest.set_index('month_end').iloc[-5*12:] # We only use the last 5 years in our analysis + + df_cum = (1 + df_window).cumprod() # cumulative returns + + # constants + TARGET_RETURN = 0.006 + LOWER_BOUND = 0 # No diversification. diversification bad! + UPPER_BOUND = 0.1 # Forced diversification. Me angry! + + mu = mean_historical_return(df_cum) + S = risk_matrix(df_cum, method='sample_cov') # Simple sample covariance works best + + ef = EfficientCVaR(mu, df_cum, weight_bounds=(LOWER_BOUND, UPPER_BOUND)) + + weights = ef.efficient_return(TARGET_RETURN) + df_w = dict_to_df(weights) # add to all weights - df_this = pd.DataFrame(data=[[df_test.loc[i, 'month_end']] + df_w['weight'].to_list()], columns=df_latest.columns) - df_weights = pd.concat(objs=[df_weights, df_this], ignore_index=True) + df_weights = pd.concat(objs=[df_weights, df_w], ignore_index=True) # <<--------------------- YOUR CODE GOES ABOVE THIS LINE --------------------->>