# %%
# Train a Random Forest on 2017-2023 fundamentals/price features and, for each
# date in the 2024 test window, flag the `n_buys` securities with the highest
# predicted probability of the positive label.
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import plotly.express as px
import plotly.graph_objects as go

print('---> Python Script Start', t0 := datetime.datetime.now())

# %%
print('---> the parameters')

# training and test dates
start_train = datetime.date(2017, 1, 1)
end_train = datetime.date(2023, 11, 30)  # gap for embargo (no overlap between train and test)
start_test = datetime.date(2024, 1, 1)  # test set is this dataset's 2024 data
end_test = datetime.date(2024, 6, 30)

n_buys = 10  # number of securities flagged as a buy on each test date
verbose = False

# %%
print('---> initial data set up')

# Load sector data; its 'security' column defines the row order of the output.
df_sectors = pd.read_csv('data/data0.csv')

# Load price and financial data; keep 'date' as plain datetime.date objects so
# it compares directly against the start/end parameters above.
df_data = pd.read_csv('data/data1.csv')
df_data['date'] = pd.to_datetime(df_data['date']).dt.date

# Features used for model training (defined once, reused for the selection).
list_vars1 = ['price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']

# Features (X) and target (y); both keep df_data's index, so a boolean mask
# built on one can be applied to the other.
df_x = df_data[['date', 'security'] + list_vars1].copy()
df_y = df_data[['date', 'security', 'label']].copy()

# %%
# Split the data into training and testing sets
print('---> splitting train and test sets')

train_mask = (df_x['date'] >= start_train) & (df_x['date'] <= end_train)
test_mask = (df_x['date'] >= start_test) & (df_x['date'] <= end_test)

X_train = df_x.loc[train_mask, list_vars1]
X_test = df_x.loc[test_mask, list_vars1]
y_train = df_y.loc[train_mask, 'label']
y_test = df_y.loc[test_mask, 'label']

# %%
print('---> training the Random Forest Classifier')

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Keep every prediction attached to its own (date, security) key instead of
# relying on the row order of the test set.
df_pred = df_x.loc[test_mask, ['date', 'security']].copy()
df_pred['signal'] = rf_model.predict_proba(X_test)[:, 1]  # second class column, as before

stock_names = df_sectors['security'].tolist()

# One row per test date (chronological), one column per security in df_sectors
# order. pivot() raises ValueError if a (date, security) pair occurs twice.
signal_matrix = (
    df_pred.pivot(index='date', columns='security', values='signal')
    .sort_index()
    .reindex(columns=stock_names)
)

# A security with no row on a given date must never be selected: argsort would
# otherwise place NaN last, i.e. among the "strongest" signals.
scores = signal_matrix.to_numpy(dtype=float)
scores = np.where(np.isnan(scores), -np.inf, scores)

# buys[d, s] == 1 when security s is among the n_buys strongest signals on day d.
buys = np.zeros(scores.shape, dtype=int)
for day_idx, day_scores in enumerate(scores):
    top = np.argsort(day_scores)[-n_buys:]
    top = top[np.isfinite(day_scores[top])]  # drop padding when fewer than n_buys are available
    buys[day_idx, top] = 1

# Output layout: securities as rows, 'Day 1'..'Day N' as columns, where Day k
# is the k-th distinct test date in chronological order.
days = [f'Day {k + 1}' for k in range(len(signal_matrix))]
results_df = pd.DataFrame(buys.T, index=stock_names, columns=days)

# NOTE(review): the previous revision of this script loaded daily returns from
# data/returns.csv; writing the buy matrix to the same path overwrites that
# input. Path kept as-is here -- confirm the intended output file.
results_df.to_csv('data/returns.csv')
print(results_df)