Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
172 changes: 60 additions & 112 deletions solution.py
Original file line number Diff line number Diff line change
@@ -1,157 +1,105 @@
# %%

import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

print('---> Python Script Start', t0 := datetime.datetime.now())

# %%

print('---> the parameters')

# training and test dates
start_train = datetime.date(2017, 1, 1)
end_train = datetime.date(2023, 11, 30) # gap for embargo (no overlap between train and test)
start_test = datetime.date(2024, 1, 1) # test set is this datasets 2024 data
start_test = datetime.date(2024, 1, 1) # test set is this dataset's 2024 data
end_test = datetime.date(2024, 6, 30)

n_buys = 10
verbose = False

# %%
print('---> initial data set up')

# sector data
df_sectors = pd.read_csv('data/data0.csv')

# price and fin data
# Load sector data
df_sectors = pd.read_csv('data/data0.csv')
# Load price and financial data
df_data = pd.read_csv('data/data1.csv')
df_data['date'] = pd.to_datetime(df_data['date']).apply(lambda d: d.date())

# Select features (X) and target (y)
df_x = df_data[['date', 'security', 'price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']].copy()
df_y = df_data[['date', 'security', 'label']].copy()

# Define the list of features to be used for model training
list_vars1 = ['price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']

# we will perform walk forward validation for testing the buys - https://www.linkedin.com/pulse/walk-forward-validation-yeshwanth-n
df_signals = pd.DataFrame(data={'date':df_x.loc[(df_x['date']>=start_test) & (df_x['date']<=end_test), 'date'].values})
df_signals.drop_duplicates(inplace=True)
df_signals.reset_index(drop=True, inplace=True)
df_signals.sort_values(by='date', inplace=True) # this code just gets the dates that we need to generate buy signals for

# %%
# Feature scaling (optional, but improves performance for some models)
#scaler = MinMaxScaler()

for i in range(len(df_signals)):

if verbose: print('---> doing', df_signals.loc[i, 'date'])

# this iteretaions training set
df_trainx = df_x[df_x['date']<df_signals.loc[i, 'date']].copy()
df_trainx.drop(labels=df_trainx[df_trainx['date']==df_trainx['date'].max()].index, inplace=True) # no overlap with test set

df_trainy = df_y[df_y['date']<df_signals.loc[i, 'date']].copy()
df_trainy.drop(labels=df_trainy[df_trainy['date']==df_trainy['date'].max()].index, inplace=True) # no overlap with test set

# this iteretaions test set
df_testx = df_x[df_x['date']>=df_signals.loc[i, 'date']].copy()
df_testy = df_y[df_y['date']>=df_signals.loc[i, 'date']].copy()

# scale, and store scaling objects for test set
dict_scaler = {}
for col in list_vars1:

dict_scaler[col] = MinMaxScaler(feature_range=(-1,1))
df_trainx[col] = dict_scaler[col].fit_transform(np.array(df_trainx[col]).reshape((len(df_trainx[col]),1)))[:, 0]
df_testx[col] = dict_scaler[col].transform(np.array(df_testx[col]).reshape((len(df_testx[col]),1)))[:, 0]

# fit a classifier
if i == 0:
clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=10, min_samples_split=1000, min_samples_leaf=1000, min_weight_fraction_leaf=0.0, max_features='sqrt', random_state=0)
clf.fit(np.array(df_trainx[list_vars1]), df_trainy['label'].values)

# predict and calc accuracy - 0.5 is the implicit cuttoff here
df_testy['signal'] = clf.predict_proba(np.array(df_testx[list_vars1]))[:, 1] # use probs to get strength of classification
df_testy['pred'] = clf.predict(np.array(df_testx[list_vars1]))
df_testy['count'] = 1

df_current = df_testy[df_testy['date']==df_signals.loc[i, 'date']]

acc_total = (df_testy['label'] == df_testy['pred']).sum()/len(df_testy)
acc_current = (df_current['label'] == df_current['pred']).sum()/len(df_current)

print('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2))

# add accuracy and signal to dataframe
df_signals.loc[i, 'acc_total'] = acc_total
df_signals.loc[i, 'acc_current'] = acc_current

df_signals.loc[i, df_current['security'].values] = df_current['signal'].values
# Merge the features and labels for easier data splitting later
#df_x[list_vars1] = scaler.fit_transform(df_x[list_vars1])

# %%
# Split the data into training and testing sets
print('---> splitting train and test sets')

# create buy matrix for payoff plot
df_signals['10th'] = df_signals[df_sectors['security'].values].apply(lambda x: sorted(x)[len(df_sectors)-n_buys-1], axis=1)

df_index = pd.DataFrame(np.array(df_signals[df_sectors['security'].values]) > np.array(df_signals['10th']).reshape((len(df_signals),1)))
# Train on data between 2017 and 2023
train_mask = (df_x['date'] >= start_train) & (df_x['date'] <= end_train)
test_mask = (df_x['date'] >= start_test) & (df_x['date'] <= end_test)

# set 1 for top 10 strongest signals
df_buys = pd.DataFrame()
df_buys[df_sectors['security'].values] = np.zeros((len(df_signals), len(df_sectors)))
df_buys[df_index.values] = 1
df_buys.insert(0, 'date', df_signals['date'].copy())
df_buys

# check some signal plots
fig_aapl = px.line(df_signals, x='date', y='AAPL')
fig_aapl.show()

fig_pixel = px.imshow(np.array(df_buys[df_sectors['security'].values]))
fig_pixel.show()
# Get training and test data for both X (features) and y (labels)
X_train = df_x.loc[train_mask, list_vars1]
X_test = df_x.loc[test_mask, list_vars1]
y_train = df_y.loc[train_mask, 'label']
y_test = df_y.loc[test_mask, 'label']

# %%
print('---> training the Random Forest Classifier')

# create return matrix
df_returns = pd.read_csv('data/returns.csv')
df_returns['date']= pd.to_datetime(df_returns['date']).apply(lambda d: d.date())
df_returns = df_returns[df_returns['date']>=start_test]
df_returns = df_returns.pivot(index='date', columns='security', values='return1')

def plot_payoff(df_buys):

df = df_buys.copy()
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

assert (df.sum(axis=1)==10).sum() == len(df), '---> must have exactly 10 buys each day'
# Train the Random Forest on the training data
rf_model.fit(X_train, y_train)
predicted_probabilities = rf_model.predict_proba(X_test)
loops = int(len(predicted_probabilities) / 100)

# matrix of buys
df_payoff = df[['date']].copy()
del df['date']
arr_buys = np.array(df)
arr_buys = arr_buys*(1/n_buys) # equally weighted
days = [f'Day {i+1}' for i in range(loops)] # Adjust the range as needed for the number of days
stock_names = df_sectors['security'].tolist() # Extract security names

# return matrix
arr_ret = np.array(df_returns)
arr_ret = arr_ret + 1
# Initialize the DataFrame with zeros
results_df = pd.DataFrame(0, index=stock_names, columns=days)

df_payoff['payoff'] = (arr_buys * arr_ret @ np.ones(len(df_sectors)).reshape((len(df_sectors), 1)))[:, 0]
df_payoff['tri'] = df_payoff['payoff'].cumprod()

fig_payoff = px.line(df_payoff, x='date', y='tri')
fig_payoff.show()

print(f"---> payoff for these buys between period {df_payoff['date'].min()} and {df_payoff['date'].max()} is {(df_payoff['tri'].values[-1]-1)*100 :.2f}%")

return df_payoff

df_payoff = plot_payoff(df_buys)

# %%

print('---> Python Script End', t1 := datetime.datetime.now())
print('---> Total time taken', t1 - t0)
for i in range(loops):
probability = predicted_probabilities[i::loops] # Use slicing directly on the NumPy array

# Extract the second values (class probabilities for class 1)
second_values = probability[:, 1] # No need for .astype(float) since it's already float
top_10_indices = np.argsort(second_values)[-10:][::-1]
top_10_securities = []

for idx in top_10_indices:
top_10_securities.append(df_sectors.iloc[idx])
top_10_securities_df = pd.DataFrame(top_10_securities)
security_list = top_10_securities_df['security'].tolist()
# Step 3: Update the DataFrame
day_label = f'Day {i + 1}'
for security in stock_names:
if security in security_list:
results_df.loc[security, day_label] = 1 # Place a 1 if in top 10
else:
results_df.loc[security, day_label] = 0

# Step 4: Save the DataFrame to a CSV file
results_df.to_csv('data/returns.csv')

# Optionally print the DataFrame
print(results_df)