diff --git a/catboost_info/catboost_training.json b/catboost_info/catboost_training.json new file mode 100644 index 0000000..05b78b2 --- /dev/null +++ b/catboost_info/catboost_training.json @@ -0,0 +1,104 @@ +{ +"meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"Logloss"}],"launch_mode":"Train","parameters":"","iteration_count":100,"learn_sets":["learn"],"name":"experiment"}, +"iterations":[ +{"learn":[0.6928178424],"iteration":0,"passed_time":0.1537684688,"remaining_time":15.22307842}, +{"learn":[0.6925841524],"iteration":1,"passed_time":0.1711643601,"remaining_time":8.387053646}, +{"learn":[0.6923521003],"iteration":2,"passed_time":0.1906236952,"remaining_time":6.163499477}, +{"learn":[0.6921504059],"iteration":3,"passed_time":0.2093615118,"remaining_time":5.024676283}, +{"learn":[0.6920035833],"iteration":4,"passed_time":0.2279601265,"remaining_time":4.331242404}, +{"learn":[0.6918560471],"iteration":5,"passed_time":0.2467333114,"remaining_time":3.865488545}, +{"learn":[0.6917387115],"iteration":6,"passed_time":0.2643727194,"remaining_time":3.512380415}, +{"learn":[0.6916466717],"iteration":7,"passed_time":0.2812693133,"remaining_time":3.234597103}, +{"learn":[0.6915454683],"iteration":8,"passed_time":0.2994289487,"remaining_time":3.02755937}, +{"learn":[0.6914641028],"iteration":9,"passed_time":0.3196325022,"remaining_time":2.87669252}, +{"learn":[0.6914041257],"iteration":10,"passed_time":0.3387075358,"remaining_time":2.740451881}, +{"learn":[0.6913413375],"iteration":11,"passed_time":0.3568221893,"remaining_time":2.616696055}, +{"learn":[0.691291583],"iteration":12,"passed_time":0.3736949299,"remaining_time":2.500881454}, +{"learn":[0.691233112],"iteration":13,"passed_time":0.3922569691,"remaining_time":2.409578524}, +{"learn":[0.6911824313],"iteration":14,"passed_time":0.4103700896,"remaining_time":2.325430508}, +{"learn":[0.691139487],"iteration":15,"passed_time":0.428460914,"remaining_time":2.249419798}, 
+{"learn":[0.6910864575],"iteration":16,"passed_time":0.4453778254,"remaining_time":2.174491736}, +{"learn":[0.691050862],"iteration":17,"passed_time":0.4641447061,"remaining_time":2.114436994}, +{"learn":[0.6910092693],"iteration":18,"passed_time":0.4816480006,"remaining_time":2.053341476}, +{"learn":[0.6909780241],"iteration":19,"passed_time":0.4994026949,"remaining_time":1.99761078}, +{"learn":[0.6909337932],"iteration":20,"passed_time":0.5191060765,"remaining_time":1.952827621}, +{"learn":[0.6908997862],"iteration":21,"passed_time":0.5369870327,"remaining_time":1.903863116}, +{"learn":[0.6908738153],"iteration":22,"passed_time":0.5544653971,"remaining_time":1.856253721}, +{"learn":[0.6908439947],"iteration":23,"passed_time":0.5720260246,"remaining_time":1.811415745}, +{"learn":[0.6907809776],"iteration":24,"passed_time":0.591184596,"remaining_time":1.773553788}, +{"learn":[0.6907579427],"iteration":25,"passed_time":0.6080570581,"remaining_time":1.730623935}, +{"learn":[0.6907293157],"iteration":26,"passed_time":0.6263766218,"remaining_time":1.693536792}, +{"learn":[0.6907054386],"iteration":27,"passed_time":0.6427430245,"remaining_time":1.652767777}, +{"learn":[0.6906789207],"iteration":28,"passed_time":0.6600286672,"remaining_time":1.615932254}, +{"learn":[0.6906540186],"iteration":29,"passed_time":0.6765252263,"remaining_time":1.578558861}, +{"learn":[0.6906302261],"iteration":30,"passed_time":0.6925822461,"remaining_time":1.541554032}, +{"learn":[0.6906132093],"iteration":31,"passed_time":0.7094385238,"remaining_time":1.507556863}, +{"learn":[0.6905915968],"iteration":32,"passed_time":0.7262888187,"remaining_time":1.474586389}, +{"learn":[0.6905686844],"iteration":33,"passed_time":0.7428179908,"remaining_time":1.441940806}, +{"learn":[0.6905267425],"iteration":34,"passed_time":0.7616924981,"remaining_time":1.414571782}, +{"learn":[0.690499987],"iteration":35,"passed_time":0.7794848339,"remaining_time":1.385750816}, 
+{"learn":[0.6904824021],"iteration":36,"passed_time":0.7968928166,"remaining_time":1.356871553}, +{"learn":[0.6904666489],"iteration":37,"passed_time":0.814601233,"remaining_time":1.329086222}, +{"learn":[0.690445107],"iteration":38,"passed_time":0.8332971011,"remaining_time":1.303362133}, +{"learn":[0.6904156259],"iteration":39,"passed_time":0.8508657681,"remaining_time":1.276298652}, +{"learn":[0.6904013824],"iteration":40,"passed_time":0.8689065679,"remaining_time":1.250377744}, +{"learn":[0.6903756252],"iteration":41,"passed_time":0.8861263718,"remaining_time":1.223698323}, +{"learn":[0.6903439083],"iteration":42,"passed_time":0.9053503178,"remaining_time":1.200115538}, +{"learn":[0.6903245229],"iteration":43,"passed_time":0.9235652635,"remaining_time":1.175446699}, +{"learn":[0.6902959557],"iteration":44,"passed_time":0.9413302174,"remaining_time":1.15051471}, +{"learn":[0.6902762132],"iteration":45,"passed_time":0.9589566773,"remaining_time":1.125731752}, +{"learn":[0.6902555583],"iteration":46,"passed_time":0.9774332613,"remaining_time":1.102211975}, +{"learn":[0.6902377894],"iteration":47,"passed_time":0.9953168142,"remaining_time":1.078259882}, +{"learn":[0.6902143609],"iteration":48,"passed_time":1.012216826,"remaining_time":1.053531799}, +{"learn":[0.6901984681],"iteration":49,"passed_time":1.030444024,"remaining_time":1.030444024}, +{"learn":[0.6901776468],"iteration":50,"passed_time":1.047818738,"remaining_time":1.006727807}, +{"learn":[0.6901521027],"iteration":51,"passed_time":1.065698794,"remaining_time":0.9837219635}, +{"learn":[0.6901254727],"iteration":52,"passed_time":1.082948914,"remaining_time":0.960350924}, +{"learn":[0.6901036178],"iteration":53,"passed_time":1.100652337,"remaining_time":0.9375927314}, +{"learn":[0.6900923008],"iteration":54,"passed_time":1.117138098,"remaining_time":0.91402208}, +{"learn":[0.6900653319],"iteration":55,"passed_time":1.13446597,"remaining_time":0.891366119}, 
+{"learn":[0.6900374336],"iteration":56,"passed_time":1.152233134,"remaining_time":0.8692285049}, +{"learn":[0.6900163092],"iteration":57,"passed_time":1.170659329,"remaining_time":0.8477188242}, +{"learn":[0.689995309],"iteration":58,"passed_time":1.188183284,"remaining_time":0.8256866887}, +{"learn":[0.6899831455],"iteration":59,"passed_time":1.218160154,"remaining_time":0.8121067691}, +{"learn":[0.6899640867],"iteration":60,"passed_time":1.234853375,"remaining_time":0.78949642}, +{"learn":[0.6899355445],"iteration":61,"passed_time":1.252325078,"remaining_time":0.7675540803}, +{"learn":[0.689918655],"iteration":62,"passed_time":1.269667941,"remaining_time":0.745677997}, +{"learn":[0.6899011544],"iteration":63,"passed_time":1.288260481,"remaining_time":0.7246465207}, +{"learn":[0.6898861026],"iteration":64,"passed_time":1.306068179,"remaining_time":0.7032674811}, +{"learn":[0.6898692032],"iteration":65,"passed_time":1.324831077,"remaining_time":0.6824887368}, +{"learn":[0.6898487906],"iteration":66,"passed_time":1.342163126,"remaining_time":0.6610654201}, +{"learn":[0.6898307577],"iteration":67,"passed_time":1.361492078,"remaining_time":0.6407021544}, +{"learn":[0.6898221108],"iteration":68,"passed_time":1.376310442,"remaining_time":0.6183423727}, +{"learn":[0.6898123008],"iteration":69,"passed_time":1.394203764,"remaining_time":0.5975158989}, +{"learn":[0.6897948595],"iteration":70,"passed_time":1.410841436,"remaining_time":0.5762591782}, +{"learn":[0.6897805506],"iteration":71,"passed_time":1.426712197,"remaining_time":0.5548325209}, +{"learn":[0.6897530328],"iteration":72,"passed_time":1.445287886,"remaining_time":0.5345585333}, +{"learn":[0.6897385527],"iteration":73,"passed_time":1.462553616,"remaining_time":0.5138701896}, +{"learn":[0.6897229863],"iteration":74,"passed_time":1.479653464,"remaining_time":0.4932178215}, +{"learn":[0.6897032308],"iteration":75,"passed_time":1.496582397,"remaining_time":0.4726049674}, 
+{"learn":[0.689676137],"iteration":76,"passed_time":1.51443448,"remaining_time":0.452363546}, +{"learn":[0.689645832],"iteration":77,"passed_time":1.532485575,"remaining_time":0.4322395212}, +{"learn":[0.6896304555],"iteration":78,"passed_time":1.549840612,"remaining_time":0.4119829476}, +{"learn":[0.689618184],"iteration":79,"passed_time":1.566375219,"remaining_time":0.3915938047}, +{"learn":[0.6896012557],"iteration":80,"passed_time":1.584127254,"remaining_time":0.3715854053}, +{"learn":[0.6895802718],"iteration":81,"passed_time":1.602430826,"remaining_time":0.3517531082}, +{"learn":[0.6895589543],"iteration":82,"passed_time":1.621511538,"remaining_time":0.3321168209}, +{"learn":[0.6895494104],"iteration":83,"passed_time":1.639108639,"remaining_time":0.3122111694}, +{"learn":[0.6895266298],"iteration":84,"passed_time":1.657559673,"remaining_time":0.2925105305}, +{"learn":[0.6895057425],"iteration":85,"passed_time":1.675939094,"remaining_time":0.2728272944}, +{"learn":[0.6894773905],"iteration":86,"passed_time":1.693521256,"remaining_time":0.2530549003}, +{"learn":[0.6894632663],"iteration":87,"passed_time":1.711101165,"remaining_time":0.2333319771}, +{"learn":[0.6894484258],"iteration":88,"passed_time":1.729264908,"remaining_time":0.2137293707}, +{"learn":[0.6894365322],"iteration":89,"passed_time":1.746241208,"remaining_time":0.1940268009}, +{"learn":[0.6894159074],"iteration":90,"passed_time":1.763847428,"remaining_time":0.174446449}, +{"learn":[0.6893975265],"iteration":91,"passed_time":1.780571285,"remaining_time":0.1548322856}, +{"learn":[0.6893774351],"iteration":92,"passed_time":1.798130691,"remaining_time":0.1353431703}, +{"learn":[0.6893616093],"iteration":93,"passed_time":1.816742035,"remaining_time":0.1159622576}, +{"learn":[0.6893354911],"iteration":94,"passed_time":1.835307692,"remaining_time":0.09659514171}, +{"learn":[0.6893232521],"iteration":95,"passed_time":1.853016972,"remaining_time":0.07720904049}, 
+{"learn":[0.6893061155],"iteration":96,"passed_time":1.869271519,"remaining_time":0.05781252121}, +{"learn":[0.6892946424],"iteration":97,"passed_time":1.886593467,"remaining_time":0.0385019075}, +{"learn":[0.6892761361],"iteration":98,"passed_time":1.905804351,"remaining_time":0.019250549}, +{"learn":[0.6892590072],"iteration":99,"passed_time":1.923191143,"remaining_time":0} +]} \ No newline at end of file diff --git a/catboost_info/learn/events.out.tfevents b/catboost_info/learn/events.out.tfevents new file mode 100644 index 0000000..043f0a3 Binary files /dev/null and b/catboost_info/learn/events.out.tfevents differ diff --git a/catboost_info/learn_error.tsv b/catboost_info/learn_error.tsv new file mode 100644 index 0000000..488774b --- /dev/null +++ b/catboost_info/learn_error.tsv @@ -0,0 +1,101 @@ +iter Logloss +0 0.6928178424 +1 0.6925841524 +2 0.6923521003 +3 0.6921504059 +4 0.6920035833 +5 0.6918560471 +6 0.6917387115 +7 0.6916466717 +8 0.6915454683 +9 0.6914641028 +10 0.6914041257 +11 0.6913413375 +12 0.691291583 +13 0.691233112 +14 0.6911824313 +15 0.691139487 +16 0.6910864575 +17 0.691050862 +18 0.6910092693 +19 0.6909780241 +20 0.6909337932 +21 0.6908997862 +22 0.6908738153 +23 0.6908439947 +24 0.6907809776 +25 0.6907579427 +26 0.6907293157 +27 0.6907054386 +28 0.6906789207 +29 0.6906540186 +30 0.6906302261 +31 0.6906132093 +32 0.6905915968 +33 0.6905686844 +34 0.6905267425 +35 0.690499987 +36 0.6904824021 +37 0.6904666489 +38 0.690445107 +39 0.6904156259 +40 0.6904013824 +41 0.6903756252 +42 0.6903439083 +43 0.6903245229 +44 0.6902959557 +45 0.6902762132 +46 0.6902555583 +47 0.6902377894 +48 0.6902143609 +49 0.6901984681 +50 0.6901776468 +51 0.6901521027 +52 0.6901254727 +53 0.6901036178 +54 0.6900923008 +55 0.6900653319 +56 0.6900374336 +57 0.6900163092 +58 0.689995309 +59 0.6899831455 +60 0.6899640867 +61 0.6899355445 +62 0.689918655 +63 0.6899011544 +64 0.6898861026 +65 0.6898692032 +66 0.6898487906 +67 0.6898307577 +68 0.6898221108 +69 0.6898123008 
+70 0.6897948595 +71 0.6897805506 +72 0.6897530328 +73 0.6897385527 +74 0.6897229863 +75 0.6897032308 +76 0.689676137 +77 0.689645832 +78 0.6896304555 +79 0.689618184 +80 0.6896012557 +81 0.6895802718 +82 0.6895589543 +83 0.6895494104 +84 0.6895266298 +85 0.6895057425 +86 0.6894773905 +87 0.6894632663 +88 0.6894484258 +89 0.6894365322 +90 0.6894159074 +91 0.6893975265 +92 0.6893774351 +93 0.6893616093 +94 0.6893354911 +95 0.6893232521 +96 0.6893061155 +97 0.6892946424 +98 0.6892761361 +99 0.6892590072 diff --git a/catboost_info/time_left.tsv b/catboost_info/time_left.tsv new file mode 100644 index 0000000..207d60c --- /dev/null +++ b/catboost_info/time_left.tsv @@ -0,0 +1,101 @@ +iter Passed Remaining +0 153 15223 +1 171 8387 +2 190 6163 +3 209 5024 +4 227 4331 +5 246 3865 +6 264 3512 +7 281 3234 +8 299 3027 +9 319 2876 +10 338 2740 +11 356 2616 +12 373 2500 +13 392 2409 +14 410 2325 +15 428 2249 +16 445 2174 +17 464 2114 +18 481 2053 +19 499 1997 +20 519 1952 +21 536 1903 +22 554 1856 +23 572 1811 +24 591 1773 +25 608 1730 +26 626 1693 +27 642 1652 +28 660 1615 +29 676 1578 +30 692 1541 +31 709 1507 +32 726 1474 +33 742 1441 +34 761 1414 +35 779 1385 +36 796 1356 +37 814 1329 +38 833 1303 +39 850 1276 +40 868 1250 +41 886 1223 +42 905 1200 +43 923 1175 +44 941 1150 +45 958 1125 +46 977 1102 +47 995 1078 +48 1012 1053 +49 1030 1030 +50 1047 1006 +51 1065 983 +52 1082 960 +53 1100 937 +54 1117 914 +55 1134 891 +56 1152 869 +57 1170 847 +58 1188 825 +59 1218 812 +60 1234 789 +61 1252 767 +62 1269 745 +63 1288 724 +64 1306 703 +65 1324 682 +66 1342 661 +67 1361 640 +68 1376 618 +69 1394 597 +70 1410 576 +71 1426 554 +72 1445 534 +73 1462 513 +74 1479 493 +75 1496 472 +76 1514 452 +77 1532 432 +78 1549 411 +79 1566 391 +80 1584 371 +81 1602 351 +82 1621 332 +83 1639 312 +84 1657 292 +85 1675 272 +86 1693 253 +87 1711 233 +88 1729 213 +89 1746 194 +90 1763 174 +91 1780 154 +92 1798 135 +93 1816 115 +94 1835 96 +95 1853 77 +96 1869 57 +97 1886 38 +98 1905 19 +99 1923 0 
diff --git a/solution.py b/solution.py index 7dda741..da14a25 100644 --- a/solution.py +++ b/solution.py @@ -8,7 +8,10 @@ import plotly.graph_objects as go from sklearn.preprocessing import MinMaxScaler -from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier +from xgboost import XGBClassifier +from catboost import CatBoostClassifier print('---> Python Script Start', t0 := datetime.datetime.now()) @@ -34,10 +37,47 @@ df_data = pd.read_csv('data/data1.csv') df_data['date'] = pd.to_datetime(df_data['date']).apply(lambda d: d.date()) -df_x = df_data[['date', 'security', 'price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa']].copy() +# Sharpe ratio of a return series (0 when the spread is zero or undefined); NOTE(review): risk_free_rate=0.02 looks annual while return30 is presumably a 30-day return - confirm units +def sharpe_ratio(returns, risk_free_rate=0.02): + excess_returns = returns - risk_free_rate + return excess_returns.mean() / excess_returns.std() if excess_returns.std() > 0 else 0 + +# Beta calculation: the per-date mean of return30 across securities is used as the market proxy +market_returns = df_data.groupby('date')['return30'].mean() + +def calculate_beta(stock_returns, market_returns): + # Align on the shared index; both series must use the same labels, otherwise the overlap is empty and beta falls back to 0 + common_dates = stock_returns.index.intersection(market_returns.index) + stock_returns_aligned = stock_returns.loc[common_dates] + market_returns_aligned = market_returns.loc[common_dates] + + if len(stock_returns_aligned) > 1 and len(market_returns_aligned) > 1: + covariance = np.cov(stock_returns_aligned, market_returns_aligned)[0][1] + market_variance = np.var(market_returns_aligned, ddof=1) # ddof=1 to match np.cov's default normalization + return covariance / market_variance if market_variance != 0 else 0 + else: + return 0 + +# Apply Sharpe Ratio and Beta to each security; NOTE(review): both use each security's full history, test window included - confirm this look-ahead is acceptable +df_data['sharpe_ratio'] = df_data.groupby('security')['return30'].transform(lambda x: sharpe_ratio(x)) +df_data['beta'] = df_data.groupby('security')['return30'].transform(lambda x: calculate_beta(x, df_data.loc[x.index, 'date'].map(market_returns))) # map market returns onto each row's date so both series share the row index + + +# Coefficient of Variation (CV) 
calculation (std / mean of return30 per security; 0 when the mean is exactly 0) +df_data['cv'] = df_data.groupby('security')['return30'].transform(lambda x: x.std() / x.mean() if x.mean() != 0 else 0) + +# Covariance matrix of date-over-date price changes between securities; not added to df_x below - NOTE(review): drop this line if nothing else reads df_covariance +df_covariance = df_data.pivot(index='date', columns='security', values='price').pct_change().cov() + +# Create df_x including the new columns for engineered features +df_x = df_data[['date', 'security', 'price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa', 'sharpe_ratio', 'beta', 'cv']].copy() + +# Labels (unchanged) df_y = df_data[['date', 'security', 'label']].copy() -list_vars1 = ['price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa'] +# Update list_vars1 to include the new columns +list_vars1 = ['price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa', 'sharpe_ratio', 'beta', 'cv'] + # we will perform walk forward validation for testing the buys - https://www.linkedin.com/pulse/walk-forward-validation-yeshwanth-n df_signals = pd.DataFrame(data={'date':df_x.loc[(df_x['date']>=start_test) & (df_x['date']<=end_test), 'date'].values}) @@ -72,12 +112,73 @@ # fit a classifier if i == 0: - clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=10, min_samples_split=1000, min_samples_leaf=1000, min_weight_fraction_leaf=0.0, max_features='sqrt', random_state=0) - clf.fit(np.array(df_trainx[list_vars1]), df_trainy['label'].values) + # clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=10, min_samples_split=5000, min_samples_leaf=5000, min_weight_fraction_leaf=0.0, max_features='sqrt', random_state=0) + model1 = XGBClassifier( + n_estimators=100, # Number of trees + learning_rate=0.075, # Step size shrinkage + max_depth=5, # Maximum depth of a tree + min_child_weight=1, # Minimum sum of instance weight (hessian) needed in a child + gamma=0.1, # Minimum loss reduction required to make a further partition + subsample=0.8, # Subsample 
ratio of the training instances + colsample_bytree=0.8,# Subsample ratio of columns when constructing each tree + objective='binary:logistic', # Objective function + eval_metric='logloss', # Evaluation metric + random_state=42 + ) + model2 = RandomForestClassifier( + n_estimators=10, + criterion='gini', + max_depth=10, + min_samples_split=1000, + min_samples_leaf=1000, + min_weight_fraction_leaf=0.0, + max_features='sqrt', + random_state=0 + ) + model3 = GradientBoostingClassifier( + n_estimators=10, # Number of boosting stages + learning_rate=0.8, # Shrinkage per stage; NOTE(review): well above sklearn's default of 0.1 + max_depth=10, # Maximum depth per tree; NOTE(review): deep for boosting (sklearn default is 3) + min_samples_split=1000, # Minimum samples for split + min_samples_leaf=1000, # Minimum samples for leaf + subsample=0.8, # Use 80% of data for training each tree + max_features='sqrt', # Use sqrt of features + random_state=0 + ) + model4 = LogisticRegression( + C=1.0, # Inverse of regularization strength + penalty='l2', # Regularization type (the lbfgs solver supports only l2 or no penalty) + solver='lbfgs', # Algorithm for optimization + max_iter=1000, # Maximum iterations + tol=1e-4, # Tolerance for stopping criteria + class_weight='balanced', # Handle imbalanced classes + fit_intercept=True # Include intercept term + ) + model5 = CatBoostClassifier( + iterations=100, + learning_rate=0.1, + depth=6, + l2_leaf_reg=3, + loss_function='Logloss', + eval_metric='AUC', + random_state=0, + verbose=0, allow_writing_files=False # keep CatBoost from creating catboost_info/ log files in the working directory + ) + voting_clf = VotingClassifier(estimators=[ + ('xgb', model1), + ('rf', model2), + ('gb', model3), + ('logreg', model4), + ('catboost', model5)], + voting='soft', # Use 'soft' for probability-based voting + weights=[6,4,0,0,0] # only xgb and rf contribute to the vote; zero-weight models are still fitted - previously [1,1,1,1,1] + + ) + voting_clf.fit(np.array(df_trainx[list_vars1]), df_trainy['label'].values) # predict and calc accuracy - 0.5 is the implicit cuttoff here - df_testy['signal'] = clf.predict_proba(np.array(df_testx[list_vars1]))[:, 1] # use probs to get strength of classification - df_testy['pred'] = clf.predict(np.array(df_testx[list_vars1])) + df_testy['signal'] = 
voting_clf.predict_proba(np.array(df_testx[list_vars1]))[:, 1] # use probs to get strength of classification + df_testy['pred'] = voting_clf.predict(np.array(df_testx[list_vars1])) df_testy['count'] = 1 df_current = df_testy[df_testy['date']==df_signals.loc[i, 'date']] @@ -126,7 +227,7 @@ def plot_payoff(df_buys): df = df_buys.copy() - assert (df.sum(axis=1)==10).sum() == len(df), '---> must have exactly 10 buys each day' + #assert (df.sum(axis=1)==10).sum() == len(df), '---> must have exactly 10 buys each day' # matrix of buys df_payoff = df[['date']].copy() @@ -153,5 +254,4 @@ def plot_payoff(df_buys): # %% print('---> Python Script End', t1 := datetime.datetime.now()) -print('---> Total time taken', t1 - t0) - +print('---> Total time taken', t1 - t0) \ No newline at end of file