Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 82 additions & 90 deletions solution.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# %%

import numpy as np
import pandas as pd
import datetime
Expand All @@ -8,150 +6,144 @@
import plotly.graph_objects as go

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

print('---> Python Script Start', t0 := datetime.datetime.now())

# %%

print('---> the parameters')

# training and test dates
# Parameters
start_train = datetime.date(2017, 1, 1)
end_train = datetime.date(2023, 11, 30) # gap for embargo (no overlap between train and test)
start_test = datetime.date(2024, 1, 1) # test set is this datasets 2024 data
end_train = datetime.date(2023, 11, 30)
start_test = datetime.date(2024, 1, 1)
end_test = datetime.date(2024, 6, 30)

n_buys = 10
verbose = False

print('---> initial data set up')

# sector data
# Data loading and preprocessing
df_sectors = pd.read_csv('data/data0.csv')

# price and fin data
df_data = pd.read_csv('data/data1.csv')
df_data['date'] = pd.to_datetime(df_data['date']).apply(lambda d: d.date())
df_data['date'] = pd.to_datetime(df_data['date']).dt.date

df_x = df_data[['date', 'security', 'price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']].copy()
df_y = df_data[['date', 'security', 'label']].copy()

list_vars1 = ['price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']

# we will perform walk forward validation for testing the buys - https://www.linkedin.com/pulse/walk-forward-validation-yeshwanth-n
df_signals = pd.DataFrame(data={'date':df_x.loc[(df_x['date']>=start_test) & (df_x['date']<=end_test), 'date'].values})
df_signals.drop_duplicates(inplace=True)
df_signals.reset_index(drop=True, inplace=True)
df_signals.sort_values(by='date', inplace=True) # this code just gets the dates that we need to generate buy signals for
df_signals.sort_values(by='date', inplace=True)

# %%

for i in range(len(df_signals)):
# Define models
xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, random_state=42)

if verbose: print('---> doing', df_signals.loc[i, 'date'])
classifiers = {
# 'LogisticRegression': lr_model,
'XGBoost': xgb_model,
# 'CatBoost': cat_model
}

# this iteration's training set
df_trainx = df_x[df_x['date']<df_signals.loc[i, 'date']].copy()
df_trainx.drop(labels=df_trainx[df_trainx['date']==df_trainx['date'].max()].index, inplace=True) # no overlap with test set
results = {}

df_trainy = df_y[df_y['date']<df_signals.loc[i, 'date']].copy()
df_trainy.drop(labels=df_trainy[df_trainy['date']==df_trainy['date'].max()].index, inplace=True) # no overlap with test set
# Prepare the entire training set
df_trainx = df_x[df_x['date'] < start_test].copy()
df_trainy = df_y[df_y['date'] < start_test].copy()

# this iteration's test set
df_testx = df_x[df_x['date']>=df_signals.loc[i, 'date']].copy()
df_testy = df_y[df_y['date']>=df_signals.loc[i, 'date']].copy()
scaler = MinMaxScaler(feature_range=(-1,1))
df_trainx[list_vars1] = scaler.fit_transform(df_trainx[list_vars1])

# scale, and store scaling objects for test set
dict_scaler = {}
for col in list_vars1:
# Fit all models once
for clf_name, clf in classifiers.items():
print(f'---> Fitting {clf_name}')
clf.fit(df_trainx[list_vars1], df_trainy['label'])

dict_scaler[col] = MinMaxScaler(feature_range=(-1,1))
df_trainx[col] = dict_scaler[col].fit_transform(np.array(df_trainx[col]).reshape((len(df_trainx[col]),1)))[:, 0]
df_testx[col] = dict_scaler[col].transform(np.array(df_testx[col]).reshape((len(df_testx[col]),1)))[:, 0]
for clf_name, clf in classifiers.items():
print(f'---> Processing {clf_name}')
df_signals_clf = df_signals.copy()

for i in range(len(df_signals)):
if verbose: print('---> doing', df_signals_clf.loc[i, 'date'])

# fit a classifier
if i == 0:
clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=10, min_samples_split=1000, min_samples_leaf=1000, min_weight_fraction_leaf=0.0, max_features='sqrt', random_state=0)
clf.fit(np.array(df_trainx[list_vars1]), df_trainy['label'].values)
df_testx = df_x[df_x['date']==df_signals_clf.loc[i, 'date']].copy()
df_testy = df_y[df_y['date']==df_signals_clf.loc[i, 'date']].copy()

# predict and calc accuracy - 0.5 is the implicit cutoff here
df_testy['signal'] = clf.predict_proba(np.array(df_testx[list_vars1]))[:, 1] # use probs to get strength of classification
df_testy['pred'] = clf.predict(np.array(df_testx[list_vars1]))
df_testy['count'] = 1
df_testx[list_vars1] = scaler.transform(df_testx[list_vars1])

y_pred_proba = clf.predict_proba(df_testx[list_vars1])
if clf_name != 'LogisticRegression':
y_pred_proba = y_pred_proba[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

df_current = df_testy[df_testy['date']==df_signals.loc[i, 'date']]
df_testy['signal'] = y_pred_proba
df_testy['pred'] = y_pred

acc_total = (df_testy['label'] == df_testy['pred']).sum()/len(df_testy)
acc_current = (df_current['label'] == df_current['pred']).sum()/len(df_current)

print('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2))
df_current = df_testy[df_testy['date']==df_signals_clf.loc[i, 'date']]

acc_current = (df_current['label'] == df_current['pred']).mean()

print(f'---> {clf_name} accuracy current date {acc_current:.2f}')

# add accuracy and signal to dataframe
df_signals.loc[i, 'acc_total'] = acc_total
df_signals.loc[i, 'acc_current'] = acc_current
df_signals_clf.loc[i, 'acc_current'] = acc_current
df_signals_clf.loc[i, df_current['security'].values] = df_current['signal'].values

df_signals.loc[i, df_current['security'].values] = df_current['signal'].values
df_signals_clf['10th'] = df_signals_clf[df_sectors['security'].values].apply(lambda x: sorted(x)[len(df_sectors)-n_buys-1], axis=1)
df_index = pd.DataFrame(np.array(df_signals_clf[df_sectors['security'].values]) > np.array(df_signals_clf['10th']).reshape((len(df_signals_clf),1)))

# %%
df_buys = pd.DataFrame(0, index=df_signals_clf.index, columns=df_sectors['security'].values)
df_buys[df_index.values] = 1
df_buys = pd.concat([df_signals_clf['date'], df_buys], axis=1)

# create buy matrix for payoff plot
df_signals['10th'] = df_signals[df_sectors['security'].values].apply(lambda x: sorted(x)[len(df_sectors)-n_buys-1], axis=1)
results[clf_name] = df_buys

df_index = pd.DataFrame(np.array(df_signals[df_sectors['security'].values]) > np.array(df_signals['10th']).reshape((len(df_signals),1)))
# Manual Ensemble
print('---> Processing Ensemble')
df_signals_ensemble = df_signals.copy()

# set 1 for top 10 strongest signals
df_buys = pd.DataFrame()
df_buys[df_sectors['security'].values] = np.zeros((len(df_signals), len(df_sectors)))
df_buys[df_index.values] = 1
df_buys.insert(0, 'date', df_signals['date'].copy())
df_buys
for i in range(len(df_signals)):
ensemble_preds = np.mean([results[clf_name].iloc[i, 1:] for clf_name in classifiers.keys()], axis=0)
df_signals_ensemble.loc[i, df_sectors['security'].values] = ensemble_preds

# check some signal plots
fig_aapl = px.line(df_signals, x='date', y='AAPL')
fig_aapl.show()
df_signals_ensemble['10th'] = df_signals_ensemble[df_sectors['security'].values].apply(lambda x: sorted(x)[len(df_sectors)-n_buys-1], axis=1)
df_index_ensemble = pd.DataFrame(np.array(df_signals_ensemble[df_sectors['security'].values]) > np.array(df_signals_ensemble['10th']).reshape((len(df_signals_ensemble),1)))

fig_pixel = px.imshow(np.array(df_buys[df_sectors['security'].values]))
fig_pixel.show()
df_buys_ensemble = pd.DataFrame(0, index=df_signals_ensemble.index, columns=df_sectors['security'].values)
df_buys_ensemble[df_index_ensemble.values] = 1
df_buys_ensemble = pd.concat([df_signals_ensemble['date'], df_buys_ensemble], axis=1)

# %%
results['Ensemble'] = df_buys_ensemble

# create return matrix
# Create return matrix
df_returns = pd.read_csv('data/returns.csv')
df_returns['date']= pd.to_datetime(df_returns['date']).apply(lambda d: d.date())
df_returns['date'] = pd.to_datetime(df_returns['date']).dt.date
df_returns = df_returns[df_returns['date']>=start_test]
df_returns = df_returns.pivot(index='date', columns='security', values='return1')

def plot_payoff(df_buys):

df = df_buys.copy()

assert (df.sum(axis=1)==10).sum() == len(df), '---> must have exactly 10 buys each day'

# matrix of buys
df_payoff = df[['date']].copy()
del df['date']
arr_buys = np.array(df)
arr_buys = np.array(df.iloc[:, 1:])
arr_buys = arr_buys*(1/n_buys) # equally weighted

# return matrix
arr_ret = np.array(df_returns)
arr_ret = arr_ret + 1

arr_ret = np.array(df_returns) + 1
df_payoff['payoff'] = (arr_buys * arr_ret @ np.ones(len(df_sectors)).reshape((len(df_sectors), 1)))[:, 0]
df_payoff['tri'] = df_payoff['payoff'].cumprod()
return df_payoff

for clf_name, df_buys in results.items():
# Payoff plot
df_payoff = plot_payoff(df_buys)
fig_payoff = px.line(df_payoff, x='date', y='tri')
fig_payoff.show()

print(f"---> payoff for these buys between period {df_payoff['date'].min()} and {df_payoff['date'].max()} is {(df_payoff['tri'].values[-1]-1)*100 :.2f}%")
print(f"---> Payoff for {clf_name} between {df_payoff['date'].min()} and {df_payoff['date'].max()} is {(df_payoff['tri'].values[-1]-1)*100:.2f}%")

return df_payoff
# Comparison plot
fig = go.Figure()
for clf_name, df_buys in results.items():
df_payoff = plot_payoff(df_buys)
fig.add_trace(go.Scatter(x=df_payoff['date'], y=df_payoff['tri'], mode='lines', name=clf_name))

df_payoff = plot_payoff(df_buys)

# %%
fig.update_layout(title='Classifier Comparison', xaxis_title='Date', yaxis_title='Total Return Index')
fig.show()

print('---> Python Script End', t1 := datetime.datetime.now())
print('---> Total time taken', t1 - t0)

print('---> Total time taken', t1 - t0)