# run_fwd_return_analysis.py
# pylint: disable=C0121
# pylint: disable=E2515
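"""
Compare forward returns on days when the basic feature column is True vs. False
across all tickers, compute bootstrapped confidence intervals for the mean
forward return, and save the results to Excel
(res_ma_200_above_below.xlsx and, grouped by the close-to-MA-200 relation,
res_ma_200_by_group.xlsx).
"""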
import sys
from typing import List
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from constants import (
DEFAULT_BOOTSTRAP_CONFIDENCE_LEVEL,
FEATURE_COL_NAME_BASIC,
LOG_FILE,
tickers_all,
)
from customizable.misc import get_ma_200_relation_label
from features.f_v1_basic import add_features_v1_basic
from utils.bootstrap import analyze_values_by_group, get_bootstrapped_mean_ci
from utils.get_df_with_fwd_ret import add_fwd_ret
from utils.local_data import TickersData


def _check_feature_for_fwd_ret_days(
tickers_data: TickersData,
res_to_return: List[dict],
fwd_ret_days: int,
feature_col_name: str,
insert_empty_row: bool = True,
) -> List[dict]:
"""
Compare returns over fwd_ret_days subsequent days
in situations where the feature is True and False.
The function returns a list with dictionaries.
Later they will become rows in the final dataframe.
The function can also add an empty row
to make the dataframe easier to view.
To make this clear, see the dataframe example in the README.
"""
for ticker in tickers_data.tickers_data_with_features:
if (
f"fwd_ret_{fwd_ret_days}"
not in tickers_data.tickers_data_with_features[ticker].columns
):
tickers_data.tickers_data_with_features[ticker] = add_fwd_ret(
ohlc_df=tickers_data.tickers_data_with_features[ticker],
num_days=fwd_ret_days,
)
    # Concatenate the per-ticker DataFrames into one large DataFrame.
    combined_df_all = pd.concat(list(tickers_data.tickers_data_with_features.values()))
# Filter returns on days when the feature value is True and False,
# so that we can compare them.
mask_feature_true = combined_df_all[feature_col_name] == True
mask_feature_false = combined_df_all[feature_col_name] == False
returns_f_true = (
combined_df_all[mask_feature_true][f"fwd_ret_{fwd_ret_days}"].dropna().values
)
returns_f_false = (
combined_df_all[mask_feature_false][f"fwd_ret_{fwd_ret_days}"].dropna().values
)
    # Drop the temporary forward-return column from each ticker's DataFrame;
    # this saves memory and speeds up subsequent iterations.
    for ticker in tickers_data.tickers_data_with_features:
        del tickers_data.tickers_data_with_features[ticker][f"fwd_ret_{fwd_ret_days}"]
# Finding the mean and confidence intervals for returns.
# The get_bootstrapped_mean_ci function also returns the sample size
# and the percentage of days where the results are positive.
res_f_true = get_bootstrapped_mean_ci(
data=returns_f_true, conf_level=DEFAULT_BOOTSTRAP_CONFIDENCE_LEVEL # type: ignore
)
res_f_false = get_bootstrapped_mean_ci(
data=returns_f_false, conf_level=DEFAULT_BOOTSTRAP_CONFIDENCE_LEVEL # type: ignore
)
# Adding columns fwd_ret_days and feature
# to later sort the dataframe and make it easy to view
res_f_true["fwd_ret_days"] = res_f_false["fwd_ret_days"] = fwd_ret_days
res_f_true["feature"] = True
res_f_false["feature"] = False
res_to_return.append(res_f_true)
res_to_return.append(res_f_false)
if insert_empty_row:
# create an empty row and append it to the resulting list
# to improve the viewing experience of the resulting DataFrame
res_f_empty = res_f_true.copy()
res_f_empty["feature"] = np.nan
res_f_empty[f"ci_left_{DEFAULT_BOOTSTRAP_CONFIDENCE_LEVEL}"] = np.nan
res_f_empty[f"ci_right_{DEFAULT_BOOTSTRAP_CONFIDENCE_LEVEL}"] = np.nan
res_f_empty["mean_val"] = np.nan
res_f_empty["positive_pct"] = np.nan
res_f_empty["count"] = np.nan
res_to_return.append(res_f_empty)
return res_to_return


def add_group_label_analyze_save(
    tickers_data: TickersData, excel_file_name: str, fwd_ret_n_days: int
) -> None:
    """
    Label every row with its close-to-MA-200 group, analyze forward returns
    over fwd_ret_n_days days within each group, and save the result
    to excel_file_name.
    """
group_col_name = "close_rel_ma_200_group"
# Now add forward returns column to analyze it
# NOTE We don't need forward returns to run backtests,
# so we add them only here,
# not inside the TickersData class or anywhere else.
for ticker in tickers_data.tickers_data_with_features:
tickers_data.tickers_data_with_features[ticker] = add_fwd_ret(
ohlc_df=tickers_data.tickers_data_with_features[ticker],
            num_days=fwd_ret_n_days,
)
# Add a column with a group label
# and concatenate the DFs of all tickers into one large DF.
combined_ohlc_all = pd.DataFrame()
for ticker, ohlc_df in tickers_data.tickers_data_with_features.items():
ohlc_df[group_col_name] = ohlc_df.apply(get_ma_200_relation_label, axis=1)
combined_ohlc_all = pd.concat([combined_ohlc_all, ohlc_df])
del ohlc_df[group_col_name]
del ohlc_df[f"fwd_ret_{fwd_red_n_days}"]
    # Drop rows with missing values (just in case) before the group analysis.
    combined_ohlc_all = combined_ohlc_all.dropna()
    # A quick sanity check of the combined DataFrame.
    print(f"{combined_ohlc_all.shape=}")
    print(f"{combined_ohlc_all.columns=}")
    print(combined_ohlc_all.tail())
# Up until this point there has been preparation,
# and now the analysis will be carried out.
# NOTE This is for convenient sorting of rows
# in the resulting Excel file.
group_order_ma_200_rel = {
"HIGHLY_ABOVE": 1,
"MODERATELY_ABOVE": 2,
"SLIGHTLY_ABOVE": 3,
"SLIGHTLY_BELOW": 4,
"MODERATELY_BELOW": 5,
"HIGHLY_BELOW": 6,
"all_data": 7, # all_data row is important, don't miss it
}
analyze_values_by_group(
df=combined_ohlc_all,
group_col_name=group_col_name,
values_col_name=f"fwd_ret_{fwd_red_n_days}",
group_order_map=group_order_ma_200_rel,
excel_file_name=excel_file_name,
)


if __name__ == "__main__":
load_dotenv()
# clear LOG_FILE every time
open(LOG_FILE, "w", encoding="UTF-8").close()
EXCEL_FILE_NAME_BY_GROUP = "res_ma_200_by_group.xlsx"
EXCEL_FILE_NAME_SIMPLE = "res_ma_200_above_below.xlsx"
# The first step is to collect DataFrames with data and derived columns
# for all the tickers we are interested in.
# This data is stored in the TickersData class instance
# as a dictionary whose keys are tickers and values are DFs.
# For more details, see the class TickersData internals
# and the add_features_v1_basic function.
tickers_data_instance = TickersData(
tickers=tickers_all,
add_feature_cols_func=add_features_v1_basic,
)
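    # Illustrative only (assumes "SPY" is one of tickers_all): a single ticker's
    # DataFrame could be inspected with
    #   tickers_data_instance.tickers_data_with_features["SPY"].head()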
    # Now run the comparison for each forward-return horizon in the range,
    # collect the list of dictionaries,
    # and build the resulting DataFrame from it.
res: List[dict] = list()
FWD_RETURN_DAYS_MAX = 16
for fwd_return_days in range(2, FWD_RETURN_DAYS_MAX + 1):
        print(
            f"Now checking forward returns over {fwd_return_days} days (max {FWD_RETURN_DAYS_MAX})"
        )
res = _check_feature_for_fwd_ret_days(
tickers_data=tickers_data_instance,
res_to_return=res,
fwd_ret_days=fwd_return_days,
insert_empty_row=True,
feature_col_name=FEATURE_COL_NAME_BASIC,
)
df = pd.DataFrame(res)
    # Rearrange columns for viewing convenience.
    df.insert(0, "feature", df.pop("feature"))
    df.insert(0, "fwd_ret_days", df.pop("fwd_ret_days"))
    df = df.sort_values(["fwd_ret_days", "feature"], ascending=[True, True])
    # Blank out fwd_ret_days on the spacer rows (where mean_val is NaN).
    df.loc[df["mean_val"].isna(), "fwd_ret_days"] = np.nan
df.to_excel(EXCEL_FILE_NAME_SIMPLE, index=False)
print(
f"Analysis complete! Now you may explore the results file {EXCEL_FILE_NAME_SIMPLE}",
file=sys.stderr,
)
    # NOTE Splitting the data into groups and calling add_group_label_analyze_save()
    # is optional; the simple comparison above may already be enough for you.
add_group_label_analyze_save(
tickers_data=tickers_data_instance,
excel_file_name=EXCEL_FILE_NAME_BY_GROUP,
        fwd_ret_n_days=4,
)
print(
f"Complete! Please see the files {EXCEL_FILE_NAME_SIMPLE}, {EXCEL_FILE_NAME_BY_GROUP}",
file=sys.stderr,
)