Commit

add backtest error column to scan and advanced reports

Signed-off-by: ivelin <[email protected]>

ivelin committed Mar 1, 2024
1 parent f2fe51d commit b4e06fc
Showing 10 changed files with 122 additions and 77 deletions.
2 changes: 1 addition & 1 deletion example.env
@@ -35,4 +35,4 @@ data_3rd_party="data-3rd-party"
forecast_subdir="forecast/"

# Keep changes local or sync with remote HF Hub repo
local_mode=False
hfhub_sync=False
4 changes: 4 additions & 0 deletions optuna-dashboard.sh
@@ -0,0 +1,4 @@
#!/usr/bin/bash
echo "Starting Optuna Dashboard"
optuna-dashboard sqlite:///data/optuna_study.db
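Note: the dashboard reads trial history from the same SQLite storage URL that the tuning code writes to. A minimal sketch of the study setup this assumes (the study name and objective below are illustrative placeholders, not taken from the repo):

import optuna

# Illustrative only: the real objective lives in src/canswim/model.py.
def objective(trial: optuna.Trial) -> float:
    hidden_size = trial.suggest_int("hidden_size", low=1024, high=2048, step=512)
    return float(hidden_size)  # placeholder metric

study = optuna.create_study(
    storage="sqlite:///data/optuna_study.db",  # same DB file the dashboard opens
    study_name="canswim",  # hypothetical study name
    load_if_exists=True,
)
study.optimize(objective, n_trials=3)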

41 changes: 38 additions & 3 deletions src/canswim/covariates.py
@@ -374,6 +374,27 @@ def prepare_sectors_series(self, train_date_start=None):
)
return sectors_series

def prepare_industry_fund_series(self, train_date_start=None):
logger.info("preparing past covariates: industry funds")
industry_funds_df = self.industry_funds_df.copy()
# flatten column hierarchy so Darts can use as covariate series
industry_funds_df.columns = [f"{i}_{j}" for i, j in industry_funds_df.columns]
# fix datetime index type issue
# https://stackoverflow.com/questions/48248239/pandas-how-to-convert-rangeindex-into-datetimeindex
industry_funds_df.index = pd.to_datetime(industry_funds_df.index)
industry_funds_series = TimeSeries.from_dataframe(industry_funds_df, freq="B")
industry_funds_series = industry_funds_series.slice(
train_date_start, industry_funds_series.end_time()
)
filler = MissingValuesFiller(n_jobs=-1)
industry_funds_filled = filler.transform(industry_funds_series)
assert len(industry_funds_filled.gaps()) == 0
industry_funds_series = industry_funds_filled
logger.info(
f"Finished preparing past covariates: industry funds. {len(industry_funds_series.columns)} columns, {len(industry_funds_series)} records, \ncolumns: \n{industry_funds_series.columns}"
)
return industry_funds_series
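The column flattening above assumes a two-level (ticker, field) column hierarchy, as produced by yf.download(..., group_by="tickers"). A small self-contained sketch with hypothetical tickers:

import pandas as pd

# Hypothetical two-level columns like yfinance's group_by="tickers" output
cols = pd.MultiIndex.from_product([["XLK", "XLF"], ["Close", "Volume"]])
df = pd.DataFrame([[1.0, 10, 2.0, 20]], columns=cols)
df.columns = [f"{i}_{j}" for i, j in df.columns]  # same flattening as above
print(df.columns.tolist())  # ['XLK_Close', 'XLK_Volume', 'XLF_Close', 'XLF_Volume']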

def load_data(self, stock_tickers: set = None, start_date: pd.Timestamp = None):
self.__start_date = start_date
self.__load_tickers = stock_tickers
@@ -386,6 +407,7 @@ def load_past_covariates(self):
self.load_key_metrics()
self.load_broad_market()
self.load_sectors()
self.load_industry_funds()
self.load_institutional_symbol_ownership()

def load_institutional_symbol_ownership(self):
@@ -413,6 +435,10 @@ def load_sectors(self):
file = "data/data-3rd-party/sectors.parquet"
self.sectors_df = pd.read_parquet(file)

def load_industry_funds(self):
file = "data/data-3rd-party/industry_funds.parquet"
self.industry_funds_df = pd.read_parquet(file)

def load_future_covariates(self):
self.load_estimates()

@@ -630,16 +656,25 @@ def prepare_past_covariates(
train_date_start=train_date_start
)
broad_market_dict = {t: broad_market_series for t in stock_price_series.keys()}
past_covariates_tmp = self.stack_covariates(
past_covariates = self.stack_covariates(
old_covs=past_covariates, new_covs=broad_market_dict
)
sectors_series = self.prepare_sectors_series(train_date_start=train_date_start)
sectors_dict = {t: sectors_series for t in stock_price_series.keys()}
past_covariates_tmp = self.stack_covariates(
past_covariates = self.stack_covariates(
old_covs=past_covariates, new_covs=sectors_dict
)
past_covariates = past_covariates_tmp
industry_funds_series = self.prepare_industry_fund_series(
train_date_start=train_date_start
)
industry_funds_dict = {
t: industry_funds_series for t in stock_price_series.keys()
}
past_covariates = self.stack_covariates(
old_covs=past_covariates, new_covs=industry_funds_dict
)
self.past_covariates = past_covariates
logger.info(f"Prepared past covariates.")

def __add_holidays(self, series_dict: dict = None):
logger.info("preparing future covariates: holidays")
11 changes: 11 additions & 0 deletions src/canswim/dashboard/__init__.py
@@ -74,6 +74,17 @@ def initdb(self):
ON cp.symbol = stock_tickers.symbol;
"""
)
duckdb.sql(
f"""
CREATE VIEW backtest_error
AS SELECT f.symbol, mean(log(1+abs(f."close_quantile_0.5"-cp.Close))) as mal_error
FROM forecast as f, close_price as cp
WHERE cp.symbol = f.symbol AND cp.date = f.date
GROUP BY f.symbol, cp.symbol
HAVING cp.symbol = f.symbol
"""
)
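The new view scores each symbol with a mean absolute log error between the median forecast (close_quantile_0.5) and the realized close on the dates where the two tables overlap; lower values mean the backtest tracked actual prices more closely. An equivalent pandas sketch, with column names assumed from the view above:

import numpy as np
import pandas as pd

# merged: one row per (symbol, date) present in both forecast and close_price
def mal_error(merged: pd.DataFrame) -> pd.Series:
    abs_err = (merged["close_quantile_0.5"] - merged["Close"]).abs()
    return np.log1p(abs_err).groupby(merged["symbol"]).mean()  # per-symbol score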

# access protected via read-only remote access tokens
# restricting access prevents sql views from working
# duckdb.sql("SET enable_external_access = false; ")
19 changes: 10 additions & 9 deletions src/canswim/dashboard/advanced.py
@@ -26,15 +26,16 @@ def __init__(self, canswim_model: CanswimModel = None):
f.symbol,
min(f.date) as forecast_start_date,
max(c.date) as prior_close_date,
arg_max(c.close, c.date) as prior_close_price,
min("close_quantile_0.2") as forecast_low_quantile,
max("close_quantile_0.5") as forecast_mean_quantile,
ROUND(100*(forecast_mean_quantile - prior_close_price) / prior_close_price) as reward_percent,
ROUND((forecast_mean_quantile - prior_close_price)/GREATEST(prior_close_price-forecast_low_quantile, 0.01),2) as reward_risk
FROM forecast f, close_price c
WHERE f.symbol = c.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol
HAVING prior_close_date < forecast_start_date AND forecast_mean_quantile > prior_close_price
round(arg_max(c.close, c.date), 2) as prior_close_price,
round(min("close_quantile_0.2"), 2) as forecast_close_low,
round(max("close_quantile_0.5"), 2) as forecast_close_high,
round(100*(forecast_close_high / prior_close_price - 1), 2) as reward_percent,
round((forecast_close_high - prior_close_price)/GREATEST(prior_close_price-forecast_close_low, 0.01),2) as reward_risk,
round(max(e.mal_error), 4) as backtest_error
FROM forecast f, close_price c, backtest_error as e
WHERE f.symbol = c.symbol and f.symbol = e.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol, e.symbol
HAVING prior_close_date < forecast_start_date AND forecast_close_high > prior_close_price
AND reward_risk > 3 AND reward_percent >= 20
"""
)
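To make the screen thresholds concrete, a worked example of the two derived columns using hypothetical numbers:

prior_close = 100.00    # arg_max(c.close, c.date): last close before the forecast
forecast_low = 95.00    # min("close_quantile_0.2")
forecast_high = 130.00  # max("close_quantile_0.5")

reward_percent = round(100 * (forecast_high / prior_close - 1), 2)  # 30.0
reward_risk = round((forecast_high - prior_close)
                    / max(prior_close - forecast_low, 0.01), 2)     # 6.0
# This row passes the report filter: reward_risk > 3 and reward_percent >= 20.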
23 changes: 12 additions & 11 deletions src/canswim/dashboard/scans.py
@@ -43,23 +43,24 @@ def __init__(self, canswim_model: CanswimModel = None):

def scan_forecasts(self, lowq, reward, rr):
lq = (100 - lowq) / 100
quantile_col = f"close_quantile_{lq}"
low_quantile_col = f"close_quantile_{lq}"
mean_col = "close_quantile_0.5"
return duckdb.sql(
f"""--sql
SELECT
f.symbol,
min(f.date) as forecast_start_date,
max(c.date) as prior_close_date,
arg_max(c.close, c.date) as prior_close_price,
min("{quantile_col}") as forecast_low_quantile,
max("{mean_col}") as forecast_mean_quantile,
ROUND(100*(forecast_mean_quantile - prior_close_price) / prior_close_price) as reward_percent,
ROUND((forecast_mean_quantile - prior_close_price)/GREATEST(prior_close_price-forecast_low_quantile, 0.01),2) as reward_risk
FROM forecast f, close_price c
WHERE f.symbol = c.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol
HAVING prior_close_date < forecast_start_date AND forecast_mean_quantile > prior_close_price
AND reward_risk >= {rr} AND reward_percent >= {reward}
round(arg_max(c.close, c.date), 2) as prior_close_price,
round(min("{low_quantile_col}"), 2) as forecast_close_low,
round(max("{mean_col}"), 2) as forecast_close_high,
round(100*(forecast_close_high / prior_close_price - 1), 2) as reward_percent,
round((forecast_close_high - prior_close_price)/GREATEST(prior_close_price-forecast_close_low, 0.01),2) as reward_risk,
round(max(e.mal_error), 4) as backtest_error
FROM forecast f, close_price c, backtest_error as e
WHERE f.symbol = c.symbol and f.symbol = e.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol, e.symbol
HAVING prior_close_date < forecast_start_date AND forecast_close_high > prior_close_price
AND reward_risk > {rr} AND reward_percent >= {reward}
"""
).df()
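The lowq argument maps a confidence level from the UI to a forecast quantile column name. For example (hypothetical slider value):

lowq = 80                   # user asks for the 80% confidence band
lq = (100 - lowq) / 100     # 0.2
low_quantile_col = f"close_quantile_{lq}"
assert low_quantile_col == "close_quantile_0.2"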
71 changes: 32 additions & 39 deletions src/canswim/gather_data.py
@@ -84,12 +84,12 @@ class MarketDataGatherer:
def __init__(self) -> None:
load_dotenv(override=True)
self.FMP_API_KEY = os.getenv("FMP_API_KEY")
logger.info(f"FMP_API_KEY found: {self.FMP_API_KEY!= None}")
logger.info(f"FMP_API_KEY found: {self.FMP_API_KEY != None}")
self.data_dir = os.getenv("data_dir", "data")
self.data_3rd_party = os.getenv("data-3rd-party", "data-3rd-party")
self.all_stocks_file = "all_stocks.csv"
self.price_frequency = "1d" # "1wk"
self.min_start_date = os.getenv("train_date_start", "1991-01-01")
self.data_dir = os.getenv("data_dir", "data")
self.data_3rd_party = os.getenv("data-3rd-party", "data-3rd-party")

def gather_stock_tickers(self):
# Prepare list of stocks for training
@@ -168,6 +168,27 @@ def gather_stock_tickers(self):
stocks_df.to_csv(stocks_file)
logger.info(f"Saved stock set to {stocks_file}")

def gather_fund_tickers(self):
# Prepare list of funds to use as covariates series
all_funds_set = set()
fund_files = ["ibdfunds.csv", "industry_funds.csv"]
logger.info("Compiling list of funds: {files}", files=fund_files)
for f in fund_files:
fp = f"{self.data_dir}/{self.data_3rd_party}/{f}"
if Path(fp).is_file():
funds = pd.read_csv(fp)
logger.info(f"loaded {len(funds)} fund symbols from {fp}")
logger.info(f"fund file columns: {funds.columns}")
fund_set = set(funds["Symbol"])
logger.info(f"{len(fund_set)} symbols in fund set")
all_funds_set |= fund_set
logger.info(f"total fund symbols loaded: {len(all_funds_set)}")
else:
logger.error(f"{fp} not found.")

logger.info(f"Loaded fund tickers: \n{sorted(list(all_funds_set))}")
return all_funds_set
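The loader only relies on a Symbol column in each fund CSV. A minimal hypothetical fixture for local testing:

import pandas as pd

# Hypothetical contents for data/data-3rd-party/industry_funds.csv;
# only the "Symbol" column is read by gather_fund_tickers().
pd.DataFrame({"Symbol": ["XLK", "SMH", "XBI"]}).to_csv(
    "data/data-3rd-party/industry_funds.csv", index=False
)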

def _gather_yfdata_date_index(self, data_file: str = None, tickers: str = None):
start_date = self.min_start_date
old_df = None
@@ -178,7 +199,7 @@ def _gather_yfdata_date_index(self, data_file: str = None, tickers: str = None):
logger.info(f"Columns: \n{old_df.columns}")
logger.info(f"Latest saved record is after {start_date}")
except Exception as e:
logger.exception(f"Could not load data from file: {data_file}. Error: {e}")
logger.warning(f"Could not load data from file: {data_file}. Error: {e}")
new_df = yf.download(tickers, start=start_date, group_by="tickers", period="1d")
new_df = new_df.dropna(how="all")
logger.info("New data gathered. Sample: \n{bm}", bm=new_df)
@@ -219,44 +240,15 @@ def gather_sectors_data(self):
data_file = "data/data-3rd-party/sectors.parquet"
self._gather_yfdata_date_index(data_file=data_file, tickers=sector_indicies)

def gather_subindustries_data(self):
"""
WARN: YFinance does not provide rich historical price and volume data for Industries and Sub-industries
the way it does for Sectors.
Do not use this method with YFinance.
def gather_industry_fund_data(self):
"""
return # See warning message above.
"""
Gather historic price and volume data for S&P 1500 GICS subindustries indexes.
S&P 1500 includes S&P 400, S&P 500, S&P 600 and overall about 90% of the US stock market capitalization.
The dataset has 163 GICS sub-industry indexes active as of 2023 plus 7 that were removed in 2023.
https://www.msci.com/documents/1296102/11185224/GICS+Map+2023.xlsx/82cc6504-9919-29e5-9789-a24fc039d0a5?t=1679087572540
Gather historic price and volume data for key industry ETFs.
The goal of these covariates is to provide the model with a more granular breakdown of stock grouping by industry.
Since stocks usually move together with their group, the model can learn the patterns of how an individual stock trend
relates to its group price and volume action.
Since stocks usually move together with their group, the model can learn which group(s) a stock moves with from these covariates.
"""
subindustry_indicies = []
gics_file = f"{self.data_dir}/{self.data_3rd_party}/GICS2023.csv"
gics_df = pd.read_csv(gics_file)
logger.info(f"Loaded {len(gics_df)} GICS records. Columns: {gics_df}")
code_col_name = "Sub-Industry Code"
subindustry_codes = gics_df[
pd.to_numeric(gics_df[code_col_name], errors="coerce").notnull()
]
subindustry_codes = subindustry_codes[code_col_name].unique()
logger.info(
f"Loaded {len(subindustry_codes)} subindustry codes: {subindustry_codes}"
)
sub_prefix = "^sp1500-"
subindustry_symbols = []
for c in subindustry_codes:
subindustry_symbols.append(f"{sub_prefix}{c}")
logger.info(
f"Prepared list of {len(subindustry_symbols)} S&P1500 subindustry symbols: {subindustry_symbols}"
)
data_file = "data/data-3rd-party/subindustries.parquet"
self._gather_yfdata_date_index(data_file=data_file, tickers=subindustry_symbols)
logger.info("Finished gathering subindisty data")
fund_tickers = self.gather_fund_tickers()
data_file = f"{self.data_dir}/{self.data_3rd_party}/industry_funds.parquet"
self._gather_yfdata_date_index(data_file=data_file, tickers=fund_tickers)

def gather_stock_price_data(self):
data_file = (
@@ -538,6 +530,7 @@ def main():
g = MarketDataGatherer()
g.gather_broad_market_data()
g.gather_sectors_data()
g.gather_industry_fund_data()
g.gather_stock_tickers()
g.gather_stock_price_data()
g.gather_earnings_data()
18 changes: 9 additions & 9 deletions src/canswim/hfhub.py
@@ -28,21 +28,21 @@ def __init__(self, api_key: Optional[str] = None):
self.HF_TOKEN = api_key
self.data_dir = os.getenv("data_dir", "data")
self.repo_id = os.getenv("repo_id")
lm = os.getenv("local_mode", False)
lm = os.getenv("hfhub_sync", False)
if isinstance(lm, str) and lm == "False":
lm = False
else:
lm = True
self.local_mode = lm
logger.info(f"local_mode: {self.local_mode}")
self.hfhub_sync = lm
logger.info(f"hfhub_sync: {self.hfhub_sync}")

def upload_model(
self,
repo_id: str = None,
model: ForecastingModel = None,
private: Optional[bool] = True,
):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
# Create repo if not existing yet and get the associated repo_id
@@ -67,7 +67,7 @@ def download_model(
model_class: object = None,
**kwargs,
) -> ForecastingModel:
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
if torch.cuda.is_available():
@@ -98,7 +98,7 @@ def upload_timeseries(
series_name: str = None,
private: Optional[bool] = True,
):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
# Create repo if not existing
@@ -126,7 +126,7 @@ def download_timeseries(
repo_id: str = None,
series_name: str = None,
) -> TimeSeries:
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
with tempfile.TemporaryDirectory() as tmpdirname:
@@ -145,7 +145,7 @@ def download_timeseries(
return ts

def download_data(self, repo_id: str = None, local_dir: str = None):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
if local_dir is not None:
@@ -168,7 +168,7 @@ def download_data(self, repo_id: str = None, local_dir: str = None):
def upload_data(
self, repo_id: str = None, private: bool = True, local_dir: str = None
):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping upload.")
return
if local_dir is not None:
8 changes: 4 additions & 4 deletions src/canswim/model.py
@@ -897,7 +897,7 @@ def _optuna_objective(self, trial):
"input_chunk_length",
low=168,
high=252,
step=42,
step=84, # 42,
# low=252,
# high=self.train_history,
# step=21,
@@ -912,7 +912,7 @@

# Other hyperparameters
hidden_size = trial.suggest_int(
"hidden_size", low=1024, high=2048, step=512
"hidden_size", low=2048, high=2048, step=256
) # low=256, high=1024, step=256)
num_encoder_layers = trial.suggest_int(
"num_encoder_layers", low=3, high=3
@@ -925,8 +925,8 @@
)
temporal_decoder_hidden = trial.suggest_int(
"temporal_decoder_hidden",
low=48,
high=128,
low=80, # 48,
high=112,
step=32, # low=16, high=128, step=16
)
dropout = trial.suggest_float(
2 changes: 1 addition & 1 deletion src/canswim/train.py
@@ -25,7 +25,7 @@ def build_new_model(self):
self.canswim_model.build(
input_chunk_length=168,
output_chunk_length=42,
hidden_size=1536,
hidden_size=2048,
num_encoder_layers=3,
num_decoder_layers=2,
decoder_output_dim=8,
