diff --git a/solution.py b/solution.py index 7dda741..bdf6844 100644 --- a/solution.py +++ b/solution.py @@ -1,5 +1,3 @@ -# %% - import numpy as np import pandas as pd import datetime @@ -8,150 +6,144 @@ import plotly.graph_objects as go from sklearn.preprocessing import MinMaxScaler -from sklearn.ensemble import RandomForestClassifier +import xgboost as xgb print('---> Python Script Start', t0 := datetime.datetime.now()) -# %% - -print('---> the parameters') - -# training and test dates +# Parameters start_train = datetime.date(2017, 1, 1) -end_train = datetime.date(2023, 11, 30) # gap for embargo (no overlap between train and test) -start_test = datetime.date(2024, 1, 1) # test set is this datasets 2024 data +end_train = datetime.date(2023, 11, 30) +start_test = datetime.date(2024, 1, 1) end_test = datetime.date(2024, 6, 30) - n_buys = 10 verbose = False -print('---> initial data set up') - -# sector data +# Data loading and preprocessing df_sectors = pd.read_csv('data/data0.csv') - -# price and fin data df_data = pd.read_csv('data/data1.csv') -df_data['date'] = pd.to_datetime(df_data['date']).apply(lambda d: d.date()) +df_data['date'] = pd.to_datetime(df_data['date']).dt.date df_x = df_data[['date', 'security', 'price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']].copy() df_y = df_data[['date', 'security', 'label']].copy() list_vars1 = ['price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa'] -# we will perform walk forward validation for testing the buys - https://www.linkedin.com/pulse/walk-forward-validation-yeshwanth-n df_signals = pd.DataFrame(data={'date':df_x.loc[(df_x['date']>=start_test) & (df_x['date']<=end_test), 'date'].values}) df_signals.drop_duplicates(inplace=True) df_signals.reset_index(drop=True, inplace=True) -df_signals.sort_values(by='date', inplace=True) # this code just gets the dates that we need to generate buy signals for +df_signals.sort_values(by='date', inplace=True) -# %% -for i in range(len(df_signals)): +# Define models +xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, random_state=42) - if verbose: print('---> doing', df_signals.loc[i, 'date']) +classifiers = { + # 'LogisticRegression': lr_model, + 'XGBoost': xgb_model, + # 'CatBoost': cat_model +} - # this iteretaions training set - df_trainx = df_x[df_x['date']=df_signals.loc[i, 'date']].copy() - df_testy = df_y[df_y['date']>=df_signals.loc[i, 'date']].copy() +scaler = MinMaxScaler(feature_range=(-1,1)) +df_trainx[list_vars1] = scaler.fit_transform(df_trainx[list_vars1]) - # scale, and store scaling objects for test set - dict_scaler = {} - for col in list_vars1: +# Fit all models once +for clf_name, clf in classifiers.items(): + print(f'---> Fitting {clf_name}') + clf.fit(df_trainx[list_vars1], df_trainy['label']) - dict_scaler[col] = MinMaxScaler(feature_range=(-1,1)) - df_trainx[col] = dict_scaler[col].fit_transform(np.array(df_trainx[col]).reshape((len(df_trainx[col]),1)))[:, 0] - df_testx[col] = dict_scaler[col].transform(np.array(df_testx[col]).reshape((len(df_testx[col]),1)))[:, 0] +for clf_name, clf in classifiers.items(): + print(f'---> Processing {clf_name}') + df_signals_clf = df_signals.copy() + + for i in range(len(df_signals)): + if verbose: print('---> doing', df_signals_clf.loc[i, 'date']) - # fit a classifier - if i == 0: - clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=10, min_samples_split=1000, min_samples_leaf=1000, min_weight_fraction_leaf=0.0, max_features='sqrt', random_state=0) - clf.fit(np.array(df_trainx[list_vars1]), df_trainy['label'].values) + df_testx = df_x[df_x['date']==df_signals_clf.loc[i, 'date']].copy() + df_testy = df_y[df_y['date']==df_signals_clf.loc[i, 'date']].copy() - # predict and calc accuracy - 0.5 is the implicit cuttoff here - df_testy['signal'] = clf.predict_proba(np.array(df_testx[list_vars1]))[:, 1] # use probs to get strength of classification - df_testy['pred'] = clf.predict(np.array(df_testx[list_vars1])) - df_testy['count'] = 1 + df_testx[list_vars1] = scaler.transform(df_testx[list_vars1]) + + y_pred_proba = clf.predict_proba(df_testx[list_vars1]) + if clf_name != 'LogisticRegression': + y_pred_proba = y_pred_proba[:, 1] + y_pred = (y_pred_proba > 0.5).astype(int) - df_current = df_testy[df_testy['date']==df_signals.loc[i, 'date']] + df_testy['signal'] = y_pred_proba + df_testy['pred'] = y_pred - acc_total = (df_testy['label'] == df_testy['pred']).sum()/len(df_testy) - acc_current = (df_current['label'] == df_current['pred']).sum()/len(df_current) - - print('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2)) + df_current = df_testy[df_testy['date']==df_signals_clf.loc[i, 'date']] + + acc_current = (df_current['label'] == df_current['pred']).mean() + + print(f'---> {clf_name} accuracy current date {acc_current:.2f}') - # add accuracy and signal to dataframe - df_signals.loc[i, 'acc_total'] = acc_total - df_signals.loc[i, 'acc_current'] = acc_current + df_signals_clf.loc[i, 'acc_current'] = acc_current + df_signals_clf.loc[i, df_current['security'].values] = df_current['signal'].values - df_signals.loc[i, df_current['security'].values] = df_current['signal'].values + df_signals_clf['10th'] = df_signals_clf[df_sectors['security'].values].apply(lambda x: sorted(x)[len(df_sectors)-n_buys-1], axis=1) + df_index = pd.DataFrame(np.array(df_signals_clf[df_sectors['security'].values]) > np.array(df_signals_clf['10th']).reshape((len(df_signals_clf),1))) -# %% + df_buys = pd.DataFrame(0, index=df_signals_clf.index, columns=df_sectors['security'].values) + df_buys[df_index.values] = 1 + df_buys = pd.concat([df_signals_clf['date'], df_buys], axis=1) -# create buy matrix for payoff plot -df_signals['10th'] = df_signals[df_sectors['security'].values].apply(lambda x: sorted(x)[len(df_sectors)-n_buys-1], axis=1) + results[clf_name] = df_buys -df_index = pd.DataFrame(np.array(df_signals[df_sectors['security'].values]) > np.array(df_signals['10th']).reshape((len(df_signals),1))) +# Manual Ensemble +print('---> Processing Ensemble') +df_signals_ensemble = df_signals.copy() -# set 1 for top 10 strongest signals -df_buys = pd.DataFrame() -df_buys[df_sectors['security'].values] = np.zeros((len(df_signals), len(df_sectors))) -df_buys[df_index.values] = 1 -df_buys.insert(0, 'date', df_signals['date'].copy()) -df_buys +for i in range(len(df_signals)): + ensemble_preds = np.mean([results[clf_name].iloc[i, 1:] for clf_name in classifiers.keys()], axis=0) + df_signals_ensemble.loc[i, df_sectors['security'].values] = ensemble_preds -# check some signal plots -fig_aapl = px.line(df_signals, x='date', y='AAPL') -fig_aapl.show() +df_signals_ensemble['10th'] = df_signals_ensemble[df_sectors['security'].values].apply(lambda x: sorted(x)[len(df_sectors)-n_buys-1], axis=1) +df_index_ensemble = pd.DataFrame(np.array(df_signals_ensemble[df_sectors['security'].values]) > np.array(df_signals_ensemble['10th']).reshape((len(df_signals_ensemble),1))) -fig_pixel = px.imshow(np.array(df_buys[df_sectors['security'].values])) -fig_pixel.show() +df_buys_ensemble = pd.DataFrame(0, index=df_signals_ensemble.index, columns=df_sectors['security'].values) +df_buys_ensemble[df_index_ensemble.values] = 1 +df_buys_ensemble = pd.concat([df_signals_ensemble['date'], df_buys_ensemble], axis=1) -# %% +results['Ensemble'] = df_buys_ensemble -# create return matrix +# Create return matrix df_returns = pd.read_csv('data/returns.csv') -df_returns['date']= pd.to_datetime(df_returns['date']).apply(lambda d: d.date()) +df_returns['date'] = pd.to_datetime(df_returns['date']).dt.date df_returns = df_returns[df_returns['date']>=start_test] df_returns = df_returns.pivot(index='date', columns='security', values='return1') def plot_payoff(df_buys): - df = df_buys.copy() - - assert (df.sum(axis=1)==10).sum() == len(df), '---> must have exactly 10 buys each day' - - # matrix of buys df_payoff = df[['date']].copy() - del df['date'] - arr_buys = np.array(df) + arr_buys = np.array(df.iloc[:, 1:]) arr_buys = arr_buys*(1/n_buys) # equally weighted - - # return matrix - arr_ret = np.array(df_returns) - arr_ret = arr_ret + 1 - + arr_ret = np.array(df_returns) + 1 df_payoff['payoff'] = (arr_buys * arr_ret @ np.ones(len(df_sectors)).reshape((len(df_sectors), 1)))[:, 0] df_payoff['tri'] = df_payoff['payoff'].cumprod() + return df_payoff +for clf_name, df_buys in results.items(): + # Payoff plot + df_payoff = plot_payoff(df_buys) fig_payoff = px.line(df_payoff, x='date', y='tri') fig_payoff.show() - print(f"---> payoff for these buys between period {df_payoff['date'].min()} and {df_payoff['date'].max()} is {(df_payoff['tri'].values[-1]-1)*100 :.2f}%") + print(f"---> Payoff for {clf_name} between {df_payoff['date'].min()} and {df_payoff['date'].max()} is {(df_payoff['tri'].values[-1]-1)*100:.2f}%") - return df_payoff +# Comparison plot +fig = go.Figure() +for clf_name, df_buys in results.items(): + df_payoff = plot_payoff(df_buys) + fig.add_trace(go.Scatter(x=df_payoff['date'], y=df_payoff['tri'], mode='lines', name=clf_name)) -df_payoff = plot_payoff(df_buys) - -# %% +fig.update_layout(title='Classifier Comparison', xaxis_title='Date', yaxis_title='Total Return Index') +fig.show() print('---> Python Script End', t1 := datetime.datetime.now()) -print('---> Total time taken', t1 - t0) - +print('---> Total time taken', t1 - t0) \ No newline at end of file