Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 115 additions & 136 deletions solution.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# %%
#!/usr/bin/python3
#!encoding=utf8

import numpy as np
import pandas as pd
Expand All @@ -7,151 +8,129 @@
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

print('---> Python Script Start', t0 := datetime.datetime.now())

# %%

print('---> the parameters')

# training and test dates
start_train = datetime.date(2017, 1, 1)
end_train = datetime.date(2023, 11, 30)  # gap for embargo (no overlap between train and test)
start_test = datetime.date(2024, 1, 1)  # test set is this dataset's 2024 data
end_test = datetime.date(2024, 6, 30)

n_buys = 10  # how many of the strongest signals become buys on each date
verbose = False

print('---> initial data set up')

# sector data
df_sectors = pd.read_csv('data/data0.csv')

# price and fin data; dates are reduced to datetime.date so they compare
# directly against the date constants above
df_data = pd.read_csv('data/data1.csv')
df_data['date'] = pd.to_datetime(df_data['date']).dt.date

# feature columns that get scaled and fed to the classifier
list_vars1 = ['price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']

df_x = df_data[['date', 'security'] + list_vars1].copy()
df_y = df_data[['date', 'security', 'label']].copy()

# we will perform walk forward validation for testing the buys - https://www.linkedin.com/pulse/walk-forward-validation-yeshwanth-n
# this code just gets the dates that we need to generate buy signals for
in_test_window = (df_x['date'] >= start_test) & (df_x['date'] <= end_test)
df_signals = pd.DataFrame(data={'date': df_x.loc[in_test_window, 'date'].values})
df_signals.drop_duplicates(inplace=True)
# Sort BEFORE resetting the index: later code addresses rows with
# df_signals.loc[i, ...] for i in range(len(df_signals)), so label i must be
# the i-th date in chronological order even when the CSV rows are not
# already ordered by date.
df_signals.sort_values(by='date', inplace=True)
df_signals.reset_index(drop=True, inplace=True)

# %%

# Walk-forward pass: one iteration per signal date. Each pass rebuilds the
# train/test split and the scalers, scores every row from the signal date
# onwards, and stores the accuracy and the per-security signals for that date.
for i in range(len(df_signals)):

    signal_date = df_signals.loc[i, 'date']
    if verbose:
        print('---> doing', signal_date)

    # this iteration's training set: rows strictly before the signal date,
    # minus the latest of those dates so there is no overlap with the test set
    past_x = df_x[df_x['date'] < signal_date]
    df_trainx = past_x[past_x['date'] != past_x['date'].max()].copy()

    past_y = df_y[df_y['date'] < signal_date]
    df_trainy = past_y[past_y['date'] != past_y['date'].max()].copy()

    # this iteration's test set: the signal date and everything after it
    df_testx = df_x.loc[df_x['date'] >= signal_date].copy()
    df_testy = df_y.loc[df_y['date'] >= signal_date].copy()

    # scale each feature to [-1, 1]; the scaler is fitted on the training rows
    # only and then reused, unchanged, on the test rows
    dict_scaler = {}
    for col in list_vars1:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        df_trainx[col] = scaler.fit_transform(df_trainx[[col]].to_numpy())[:, 0]
        df_testx[col] = scaler.transform(df_testx[[col]].to_numpy())[:, 0]
        dict_scaler[col] = scaler

    # fit a classifier
    # NOTE(review): the model is fitted on the first iteration only, while the
    # scalers above are refitted on every iteration - confirm this is intended.
    if i == 0:
        clf = RandomForestClassifier(
            n_estimators=10,
            criterion='gini',
            max_depth=10,
            min_samples_split=1000,
            min_samples_leaf=1000,
            min_weight_fraction_leaf=0.0,
            max_features='sqrt',
            random_state=0,
        )
        clf.fit(df_trainx[list_vars1].to_numpy(), df_trainy['label'].values)

    # predict and calc accuracy - 0.5 is the implicit cutoff here
    test_features = df_testx[list_vars1].to_numpy()
    df_testy['signal'] = clf.predict_proba(test_features)[:, 1]  # use probs to get strength of classification
    df_testy['pred'] = clf.predict(test_features)
    df_testy['count'] = 1

    df_current = df_testy[df_testy['date'] == signal_date]

    hits_total = df_testy['label'] == df_testy['pred']
    hits_current = df_current['label'] == df_current['pred']
    acc_total = hits_total.sum() / len(df_testy)
    acc_current = hits_current.sum() / len(df_current)

    print('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2))

    # add accuracy and signal to dataframe (one column per security)
    df_signals.loc[i, 'acc_total'] = acc_total
    df_signals.loc[i, 'acc_current'] = acc_current

    df_signals.loc[i, df_current['security'].values] = df_current['signal'].values

# %%

# create buy matrix for payoff plot
securities = df_sectors['security'].values

# per-date threshold: the signal sitting n_buys + 1 places from the top of the
# ascending sort, so a strict '>' comparison marks the strongest signals
df_signals['10th'] = df_signals[securities].apply(
    lambda row: sorted(row)[len(df_sectors) - n_buys - 1],
    axis=1,
)

threshold_column = df_signals['10th'].to_numpy().reshape((len(df_signals), 1))
df_index = pd.DataFrame(df_signals[securities].to_numpy() > threshold_column)

# set 1 for top 10 strongest signals, 0 everywhere else
df_buys = pd.DataFrame(np.where(df_index.values, 1.0, 0.0), columns=securities)
df_buys.insert(0, 'date', df_signals['date'].copy())
df_buys

# check some signal plots
fig_aapl = px.line(df_signals, x='date', y='AAPL')
fig_aapl.show()

fig_pixel = px.imshow(df_buys[securities].to_numpy())
fig_pixel.show()

# %%

# create return matrix: one row per date from start_test onwards, one column
# per security, values taken from the 'return1' column
returns_long = pd.read_csv('data/returns.csv')
returns_long['date'] = pd.to_datetime(returns_long['date']).dt.date
returns_in_test = returns_long.loc[returns_long['date'] >= start_test]
df_returns = returns_in_test.pivot(index='date', columns='security', values='return1')

def plot_payoff(df_buys):

df = df_buys.copy()

assert (df.sum(axis=1)==10).sum() == len(df), '---> must have exactly 10 buys each day'

def momentum_model(data_path='data/data1.csv', output_path='output.csv', top_n=10):
    """Build a daily buy matrix that holds the best trailing-year performers.

    Every stock is ranked by ``last price / first price`` inside the training
    window (2022-11-29 to 2023-11-29). The ``top_n`` best stocks get a 1 on
    every date of the signal window (2024-01-01 to 2024-06-30); every other
    stock gets a 0. The matrix is written to ``output_path`` and read back.

    Parameters
    ----------
    data_path : str
        CSV whose first three columns are date, stock identifier and price.
    output_path : str
        Destination CSV for the buy matrix.
    top_n : int
        Number of stocks to buy.

    Returns
    -------
    pandas.DataFrame
        The buy matrix as read back from ``output_path``: a ``date`` column
        followed by one 0/1 column per stock seen in the training window.
    """
    # The previous year is used to determine the growth rate of each stock.
    start_train = datetime.datetime(2022, 11, 29)
    end_train = datetime.datetime(2023, 11, 29)

    # Period for which buy signals are produced.
    start_perf = datetime.datetime(2024, 1, 1)
    end_perf = datetime.datetime(2024, 6, 30)

    # Only the first three columns (date, stock, price) are needed.
    data1 = pd.read_csv(data_path, usecols=[0, 1, 2], parse_dates=[0],
                        names=['Date', 'Stock', 'Price'], header=0)
    print("--> Imported data")

    # Signal dates come from the data itself rather than from fixed row
    # positions, so they stay correct if rows are added or reordered.
    in_perf = (data1['Date'] >= start_perf) & (data1['Date'] <= end_perf)
    signal_dates = pd.DatetimeIndex(
        data1.loc[in_perf, 'Date'].drop_duplicates().sort_values()
    )

    in_train = (data1['Date'] >= start_train) & (data1['Date'] <= end_train)
    data_train = data1[in_train]
    # Column order of the buy matrix follows first appearance in the file.
    stocks = data_train['Stock'].unique()

    # Order each stock's prices by date so "first" and "last" really are the
    # start and end of the window, whatever the row order in the file.
    ordered = data_train.sort_values('Date', kind='stable')
    growth = {}
    for stock, prices in ordered.groupby('Stock', sort=False)['Price']:
        growth[stock] = prices.iloc[-1] / prices.iloc[0]

    # Reindex to file order so ties are broken the same way as before.
    ranking = pd.Series(growth, dtype=float).reindex(stocks)
    top_stocks = list(ranking.nlargest(top_n).index)

    # 0 everywhere, then 1 on every signal date for the selected stocks.
    buy_signals = pd.DataFrame(0, index=signal_dates, columns=stocks)
    buy_signals.loc[:, top_stocks] = 1

    # Write the index under the header "date", then read the file back so the
    # returned frame matches what is on disk.
    buy_signals.to_csv(output_path, header=True, index=True, index_label='date')
    return pd.read_csv(output_path)

def plot_payoff(sectors_dataframe, buys_dataframe, returns_dataframe, allowed_buys):
    """Plot and return the cumulative payoff of an equally weighted buy matrix.

    Parameters
    ----------
    sectors_dataframe : pandas.DataFrame
        Sector table; only its length is used, and it must equal the number
        of security columns in ``buys_dataframe``.
    buys_dataframe : pandas.DataFrame
        A ``date`` column plus one 0/1 column per security.
    returns_dataframe : pandas.DataFrame
        Return matrix with the same shape and row/column order as the
        security columns of ``buys_dataframe``.
    allowed_buys : int
        Number of buys per day; every buy is weighted ``1 / allowed_buys``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``payoff`` (weighted gross return of the day) and
        ``tri`` (running product of ``payoff``).

    Raises
    ------
    ValueError
        If the number of security columns differs from ``len(sectors_dataframe)``.
    """
    df_payoff = buys_dataframe[['date']].copy()

    # matrix of buys, equally weighted
    # NOTE(review): nothing here checks that each row holds exactly
    # allowed_buys buys - confirm whether that check should be enforced.
    weights = buys_dataframe.drop(columns='date').to_numpy() * (1 / allowed_buys)

    if weights.shape[1] != len(sectors_dataframe):
        raise ValueError(
            f'buy matrix has {weights.shape[1]} securities, '
            f'sector table has {len(sectors_dataframe)}'
        )

    # return matrix turned into gross returns; rows and columns are matched
    # to the buy matrix by position, not by label
    gross_returns = np.array(returns_dataframe) + 1

    # weighted gross return of the bought securities, one value per day
    df_payoff['payoff'] = (weights * gross_returns).sum(axis=1)
    df_payoff['tri'] = df_payoff['payoff'].cumprod()

    fig_payoff = px.line(df_payoff, x='date', y='tri')
    fig_payoff.show()

    print(f"---> payoff for these buys between period {df_payoff['date'].min()} and {df_payoff['date'].max()} is {(df_payoff['tri'].values[-1]-1)*100 :.2f}%")

    return df_payoff

# plot_payoff takes the sector table, the buy matrix, the return matrix and
# the number of buys per day as explicit arguments
df_payoff = plot_payoff(df_sectors, df_buys, df_returns, n_buys)

# %%

print('---> Python Script End', t1 := datetime.datetime.now())
print('---> Total time taken', t1 - t0)

def main():
    """Run the momentum strategy end to end and report its payoff.

    Loads the sector table, builds the buy matrix with ``momentum_model``,
    loads the return matrix for the test window and passes all three to
    ``plot_payoff``. Start time, end time and elapsed time are printed.
    """
    t0 = datetime.datetime.now()
    print('---> Python Script Start', t0)

    # test window: this dataset's 2024 data
    start_test = datetime.date(2024, 1, 1)
    end_test = datetime.date(2024, 6, 30)

    n_buys = 10

    print('---> initial data set up')

    # sector data
    df_sectors = pd.read_csv('data/data0.csv')

    # buy matrix
    df_buy = momentum_model()

    # create return matrix, limited to the test window on both sides so it
    # cannot run past the dates covered by the buy matrix
    df_returns = pd.read_csv('data/returns.csv')
    df_returns['date'] = pd.to_datetime(df_returns['date']).dt.date
    in_test_window = (df_returns['date'] >= start_test) & (df_returns['date'] <= end_test)
    df_returns = df_returns[in_test_window]
    df_returns = df_returns.pivot(index='date', columns='security', values='return1')

    # determine payoff and generate graph
    plot_payoff(df_sectors, df_buy, df_returns, n_buys)

    t1 = datetime.datetime.now()
    print('---> Python Script End', t1)
    print('---> Total time taken', t1 - t0)


if __name__ == '__main__':
    main()