Commit

add backtest error column to scan and advanced reports

Signed-off-by: ivelin <[email protected]>

ivelin committed Mar 1, 2024
1 parent f2fe51d commit b4e06fc
Showing 10 changed files with 122 additions and 77 deletions.
2 changes: 1 addition & 1 deletion example.env
@@ -35,4 +35,4 @@ data_3rd_party="data-3rd-party"
forecast_subdir="forecast/"

# Keep changes local or sync with remote HF Hub repo
local_mode=False
hfhub_sync=False
4 changes: 4 additions & 0 deletions optuna-dashboard.sh
@@ -0,0 +1,4 @@
#!/usr/bin/bash
echo "Starting Optuna Dashboard"
optuna-dashboard sqlite:///data/optuna_study.db
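Note: the dashboard reads trial history from the same SQLite storage URL that the tuning code writes to. A minimal sketch of the study setup this assumes (the study name and objective below are illustrative placeholders, not taken from the repo):

import optuna

# Illustrative only: the real objective lives in src/canswim/model.py.
def objective(trial: optuna.Trial) -> float:
    hidden_size = trial.suggest_int("hidden_size", low=1024, high=2048, step=512)
    return float(hidden_size)  # placeholder metric

study = optuna.create_study(
    storage="sqlite:///data/optuna_study.db",  # same DB file the dashboard opens
    study_name="canswim",  # hypothetical study name
    load_if_exists=True,
)
study.optimize(objective, n_trials=3)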

41 changes: 38 additions & 3 deletions src/canswim/covariates.py
@@ -374,6 +374,27 @@ def prepare_sectors_series(self, train_date_start=None):
)
return sectors_series

def prepare_industry_fund_series(self, train_date_start=None):
logger.info("preparing past covariates: industry funds")
industry_funds_df = self.industry_funds_df.copy()
# flatten column hierarchy so Darts can use as covariate series
industry_funds_df.columns = [f"{i}_{j}" for i, j in industry_funds_df.columns]
# fix datetime index type issue
# https://stackoverflow.com/questions/48248239/pandas-how-to-convert-rangeindex-into-datetimeindex
industry_funds_df.index = pd.to_datetime(industry_funds_df.index)
industry_funds_series = TimeSeries.from_dataframe(industry_funds_df, freq="B")
industry_funds_series = industry_funds_series.slice(
train_date_start, industry_funds_series.end_time()
)
filler = MissingValuesFiller(n_jobs=-1)
industry_funds_filled = filler.transform(industry_funds_series)
assert len(industry_funds_filled.gaps()) == 0
industry_funds_series = industry_funds_filled
logger.info(
f"Finished preparing past covariates: industry funds. {len(industry_funds_series.columns)} columns, {len(industry_funds_series)} records, \ncolumns: \n{industry_funds_series.columns}"
)
return industry_funds_series
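The column flattening above assumes a two-level (ticker, field) column hierarchy, as produced by yf.download(..., group_by="tickers"). A small self-contained sketch with hypothetical tickers:

import pandas as pd

# Hypothetical two-level columns like yfinance's group_by="tickers" output
cols = pd.MultiIndex.from_product([["XLK", "XLF"], ["Close", "Volume"]])
df = pd.DataFrame([[1.0, 10, 2.0, 20]], columns=cols)
df.columns = [f"{i}_{j}" for i, j in df.columns]  # same flattening as above
print(df.columns.tolist())  # ['XLK_Close', 'XLK_Volume', 'XLF_Close', 'XLF_Volume']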

def load_data(self, stock_tickers: set = None, start_date: pd.Timestamp = None):
self.__start_date = start_date
self.__load_tickers = stock_tickers
@@ -386,6 +407,7 @@ def load_past_covariates(self):
self.load_key_metrics()
self.load_broad_market()
self.load_sectors()
self.load_industry_funds()
self.load_institutional_symbol_ownership()

def load_institutional_symbol_ownership(self):
@@ -413,6 +435,10 @@ def load_sectors(self):
file = "data/data-3rd-party/sectors.parquet"
self.sectors_df = pd.read_parquet(file)

def load_industry_funds(self):
file = "data/data-3rd-party/industry_funds.parquet"
self.industry_funds_df = pd.read_parquet(file)

def load_future_covariates(self):
self.load_estimates()

@@ -630,16 +656,25 @@ def prepare_past_covariates(
train_date_start=train_date_start
)
broad_market_dict = {t: broad_market_series for t in stock_price_series.keys()}
past_covariates_tmp = self.stack_covariates(
past_covariates = self.stack_covariates(
old_covs=past_covariates, new_covs=broad_market_dict
)
sectors_series = self.prepare_sectors_series(train_date_start=train_date_start)
sectors_dict = {t: sectors_series for t in stock_price_series.keys()}
past_covariates_tmp = self.stack_covariates(
past_covariates = self.stack_covariates(
old_covs=past_covariates, new_covs=sectors_dict
)
past_covariates = past_covariates_tmp
industry_funds_series = self.prepare_industry_fund_series(
train_date_start=train_date_start
)
industry_funds_dict = {
t: industry_funds_series for t in stock_price_series.keys()
}
past_covariates = self.stack_covariates(
old_covs=past_covariates, new_covs=industry_funds_dict
)
self.past_covariates = past_covariates
logger.info(f"Prepared past covariates.")

def __add_holidays(self, series_dict: dict = None):
logger.info("preparing future covariates: holidays")
11 changes: 11 additions & 0 deletions src/canswim/dashboard/__init__.py
@@ -74,6 +74,17 @@ def initdb(self):
ON cp.symbol = stock_tickers.symbol;
"""
)
duckdb.sql(
f"""
CREATE VIEW backtest_error
AS SELECT f.symbol, mean(log(1+abs(f."close_quantile_0.5"-cp.Close))) as mal_error
FROM forecast as f, close_price as cp
WHERE cp.symbol = f.symbol AND cp.date = f.date
GROUP BY f.symbol, cp.symbol
HAVING cp.symbol = f.symbol
"""
)
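The new view scores each symbol with a mean absolute log error between the median forecast (close_quantile_0.5) and the realized close on the dates where the two tables overlap; lower values mean the backtest tracked actual prices more closely. An equivalent pandas sketch, with column names assumed from the view above:

import numpy as np
import pandas as pd

# merged: one row per (symbol, date) present in both forecast and close_price
def mal_error(merged: pd.DataFrame) -> pd.Series:
    abs_err = (merged["close_quantile_0.5"] - merged["Close"]).abs()
    return np.log1p(abs_err).groupby(merged["symbol"]).mean()  # per-symbol score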

# access protected via read-only remote access tokens
# restricting access prevents sql views from working
# duckdb.sql("SET enable_external_access = false; ")
19 changes: 10 additions & 9 deletions src/canswim/dashboard/advanced.py
@@ -26,15 +26,16 @@ def __init__(self, canswim_model: CanswimModel = None):
f.symbol,
min(f.date) as forecast_start_date,
max(c.date) as prior_close_date,
arg_max(c.close, c.date) as prior_close_price,
min("close_quantile_0.2") as forecast_low_quantile,
max("close_quantile_0.5") as forecast_mean_quantile,
ROUND(100*(forecast_mean_quantile - prior_close_price) / prior_close_price) as reward_percent,
ROUND((forecast_mean_quantile - prior_close_price)/GREATEST(prior_close_price-forecast_low_quantile, 0.01),2) as reward_risk
FROM forecast f, close_price c
WHERE f.symbol = c.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol
HAVING prior_close_date < forecast_start_date AND forecast_mean_quantile > prior_close_price
round(arg_max(c.close, c.date), 2) as prior_close_price,
round(min("close_quantile_0.2"), 2) as forecast_close_low,
round(max("close_quantile_0.5"), 2) as forecast_close_high,
round(100*(forecast_close_high / prior_close_price - 1), 2) as reward_percent,
round((forecast_close_high - prior_close_price)/GREATEST(prior_close_price-forecast_close_low, 0.01),2) as reward_risk,
round(max(e.mal_error), 4) as backtest_error
FROM forecast f, close_price c, backtest_error as e
WHERE f.symbol = c.symbol and f.symbol = e.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol, e.symbol
HAVING prior_close_date < forecast_start_date AND forecast_close_high > prior_close_price
AND reward_risk > 3 AND reward_percent >= 20
"""
)
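To make the screen thresholds concrete, a worked example of the two derived columns using hypothetical numbers:

prior_close = 100.00    # arg_max(c.close, c.date): last close before the forecast
forecast_low = 95.00    # min("close_quantile_0.2")
forecast_high = 130.00  # max("close_quantile_0.5")

reward_percent = round(100 * (forecast_high / prior_close - 1), 2)  # 30.0
reward_risk = round((forecast_high - prior_close)
                    / max(prior_close - forecast_low, 0.01), 2)     # 6.0
# This row passes the report filter: reward_risk > 3 and reward_percent >= 20.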
23 changes: 12 additions & 11 deletions src/canswim/dashboard/scans.py
@@ -43,23 +43,24 @@ def __init__(self, canswim_model: CanswimModel = None):

def scan_forecasts(self, lowq, reward, rr):
lq = (100 - lowq) / 100
quantile_col = f"close_quantile_{lq}"
low_quantile_col = f"close_quantile_{lq}"
mean_col = "close_quantile_0.5"
return duckdb.sql(
f"""--sql
SELECT
f.symbol,
min(f.date) as forecast_start_date,
max(c.date) as prior_close_date,
arg_max(c.close, c.date) as prior_close_price,
min("{quantile_col}") as forecast_low_quantile,
max("{mean_col}") as forecast_mean_quantile,
ROUND(100*(forecast_mean_quantile - prior_close_price) / prior_close_price) as reward_percent,
ROUND((forecast_mean_quantile - prior_close_price)/GREATEST(prior_close_price-forecast_low_quantile, 0.01),2) as reward_risk
FROM forecast f, close_price c
WHERE f.symbol = c.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol
HAVING prior_close_date < forecast_start_date AND forecast_mean_quantile > prior_close_price
AND reward_risk >= {rr} AND reward_percent >= {reward}
round(arg_max(c.close, c.date), 2) as prior_close_price,
round(min("{low_quantile_col}"), 2) as forecast_close_low,
round(max("{mean_col}"), 2) as forecast_close_high,
round(100*(forecast_close_high / prior_close_price - 1), 2) as reward_percent,
round((forecast_close_high - prior_close_price)/GREATEST(prior_close_price-forecast_close_low, 0.01),2) as reward_risk,
round(max(e.mal_error), 4) as backtest_error
FROM forecast f, close_price c, backtest_error as e
WHERE f.symbol = c.symbol and f.symbol = e.symbol
GROUP BY f.symbol, f.forecast_start_year, f.forecast_start_month, f.forecast_start_day, c.symbol, e.symbol
HAVING prior_close_date < forecast_start_date AND forecast_close_high > prior_close_price
AND reward_risk > {rr} AND reward_percent >= {reward}
"""
).df()
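The lowq argument maps a confidence level from the UI to a forecast quantile column name. For example (hypothetical slider value):

lowq = 80                   # user asks for the 80% confidence band
lq = (100 - lowq) / 100     # 0.2
low_quantile_col = f"close_quantile_{lq}"
assert low_quantile_col == "close_quantile_0.2"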
71 changes: 32 additions & 39 deletions src/canswim/gather_data.py
@@ -84,12 +84,12 @@ class MarketDataGatherer:
def __init__(self) -> None:
load_dotenv(override=True)
self.FMP_API_KEY = os.getenv("FMP_API_KEY")
logger.info(f"FMP_API_KEY found: {self.FMP_API_KEY!= None}")
logger.info(f"FMP_API_KEY found: {self.FMP_API_KEY != None}")
self.data_dir = os.getenv("data_dir", "data")
self.data_3rd_party = os.getenv("data-3rd-party", "data-3rd-party")
self.all_stocks_file = "all_stocks.csv"
self.price_frequency = "1d" # "1wk"
self.min_start_date = os.getenv("train_date_start", "1991-01-01")
self.data_dir = os.getenv("data_dir", "data")
self.data_3rd_party = os.getenv("data-3rd-party", "data-3rd-party")

def gather_stock_tickers(self):
# Prepare list of stocks for training
@@ -168,6 +168,27 @@ def gather_stock_tickers(self):
stocks_df.to_csv(stocks_file)
logger.info(f"Saved stock set to {stocks_file}")

def gather_fund_tickers(self):
# Prepare list of funds to use as covariates series
all_funds_set = set()
fund_files = ["ibdfunds.csv", "industry_funds.csv"]
logger.info("Compiling list of funds: {files}", files=fund_files)
for f in fund_files:
fp = f"{self.data_dir}/{self.data_3rd_party}/{f}"
if Path(fp).is_file():
funds = pd.read_csv(fp)
logger.info(f"loaded {len(funds)} fund symbols from {fp}")
logger.info(f"fund file columns: {funds.columns}")
fund_set = set(funds["Symbol"])
logger.info(f"{len(fund_set)} symbols in fund set")
all_funds_set |= fund_set
logger.info(f"total fund symbols loaded: {len(all_funds_set)}")
else:
logger.error(f"{fp} not found.")

logger.info(f"Loaded fund tickers: \n{sorted(list(all_funds_set))}")
return all_funds_set
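The loader only relies on a Symbol column in each fund CSV. A minimal hypothetical fixture for local testing:

import pandas as pd

# Hypothetical contents for data/data-3rd-party/industry_funds.csv;
# only the "Symbol" column is read by gather_fund_tickers().
pd.DataFrame({"Symbol": ["XLK", "SMH", "XBI"]}).to_csv(
    "data/data-3rd-party/industry_funds.csv", index=False
)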

def _gather_yfdata_date_index(self, data_file: str = None, tickers: str = None):
start_date = self.min_start_date
old_df = None
@@ -178,7 +199,7 @@ def _gather_yfdata_date_index(self, data_file: str = None, tickers: str = None):
logger.info(f"Columns: \n{old_df.columns}")
logger.info(f"Latest saved record is after {start_date}")
except Exception as e:
logger.exception(f"Could not load data from file: {data_file}. Error: {e}")
logger.warning(f"Could not load data from file: {data_file}. Error: {e}")
new_df = yf.download(tickers, start=start_date, group_by="tickers", period="1d")
new_df = new_df.dropna(how="all")
logger.info("New data gathered. Sample: \n{bm}", bm=new_df)
@@ -219,44 +240,15 @@ def gather_sectors_data(self):
data_file = "data/data-3rd-party/sectors.parquet"
self._gather_yfdata_date_index(data_file=data_file, tickers=sector_indicies)

def gather_subindustries_data(self):
"""
WARN: YFinance does not provide rich historical price and volume data for Industries and Sub-industries
the way it does for Sectors.
Do not use this method with YFinance.
def gather_industry_fund_data(self):
"""
return # See warning message above.
"""
Gather historic price and volume data for S&P 1500 GICS subindustries indexes.
S&P 1500 includes S&P 400, S&P 500, S&P 600 and overall about 90% of the US stock market capitalization.
The dataset has 163 GICS sub-industry indexes active as of 2023 plus 7 that were removed in 2023.
https://www.msci.com/documents/1296102/11185224/GICS+Map+2023.xlsx/82cc6504-9919-29e5-9789-a24fc039d0a5?t=1679087572540
Gather historic price and volume data for key industry ETFs.
The goal of these covariates is to provide the model with a more granular breakdown of stock grouping by industry.
Since stocks usually move together with their group, the model can learn the patterns of how an individual stock trend
relates to its group price and volume action.
Since stocks usually move together with their group, the model can learn which group(s) a stock moves with from these covariates.
"""
subindustry_indicies = []
gics_file = f"{self.data_dir}/{self.data_3rd_party}/GICS2023.csv"
gics_df = pd.read_csv(gics_file)
logger.info(f"Loaded {len(gics_df)} GICS records. Columns: {gics_df}")
code_col_name = "Sub-Industry Code"
subindustry_codes = gics_df[
pd.to_numeric(gics_df[code_col_name], errors="coerce").notnull()
]
subindustry_codes = subindustry_codes[code_col_name].unique()
logger.info(
f"Loaded {len(subindustry_codes)} subindustry codes: {subindustry_codes}"
)
sub_prefix = "^sp1500-"
subindustry_symbols = []
for c in subindustry_codes:
subindustry_symbols.append(f"{sub_prefix}{c}")
logger.info(
f"Prepared list of {len(subindustry_symbols)} S&P1500 subindustry symbols: {subindustry_symbols}"
)
data_file = "data/data-3rd-party/subindustries.parquet"
self._gather_yfdata_date_index(data_file=data_file, tickers=subindustry_symbols)
logger.info("Finished gathering subindisty data")
fund_tickers = self.gather_fund_tickers()
data_file = f"{self.data_dir}/{self.data_3rd_party}/industry_funds.parquet"
self._gather_yfdata_date_index(data_file=data_file, tickers=fund_tickers)

def gather_stock_price_data(self):
data_file = (
@@ -538,6 +530,7 @@ def main():
g = MarketDataGatherer()
g.gather_broad_market_data()
g.gather_sectors_data()
g.gather_industry_fund_data()
g.gather_stock_tickers()
g.gather_stock_price_data()
g.gather_earnings_data()
18 changes: 9 additions & 9 deletions src/canswim/hfhub.py
@@ -28,21 +28,21 @@ def __init__(self, api_key: Optional[str] = None):
self.HF_TOKEN = api_key
self.data_dir = os.getenv("data_dir", "data")
self.repo_id = os.getenv("repo_id")
lm = os.getenv("local_mode", False)
lm = os.getenv("hfhub_sync", False)
if isinstance(lm, str) and lm == "False":
lm = False
else:
lm = True
self.local_mode = lm
logger.info(f"local_mode: {self.local_mode}")
self.hfhub_sync = lm
logger.info(f"hfhub_sync: {self.hfhub_sync}")

def upload_model(
self,
repo_id: str = None,
model: ForecastingModel = None,
private: Optional[bool] = True,
):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
# Create repo if not existing yet and get the associated repo_id
@@ -67,7 +67,7 @@ def download_model(
model_class: object = None,
**kwargs,
) -> ForecastingModel:
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
if torch.cuda.is_available():
@@ -98,7 +98,7 @@ def upload_timeseries(
series_name: str = None,
private: Optional[bool] = True,
):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
# Create repo if not existing
@@ -126,7 +126,7 @@ def download_timeseries(
repo_id: str = None,
series_name: str = None,
) -> TimeSeries:
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
with tempfile.TemporaryDirectory() as tmpdirname:
@@ -145,7 +145,7 @@ def download_timeseries(
return ts

def download_data(self, repo_id: str = None, local_dir: str = None):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping download.")
return
if local_dir is not None:
@@ -168,7 +168,7 @@ def download_data(self, repo_id: str = None, local_dir: str = None):
def upload_data(
self, repo_id: str = None, private: bool = True, local_dir: str = None
):
if self.local_mode:
if not self.hfhub_sync:
logger.info("Local mode selected. Skipping upload.")
return
if local_dir is not None:
8 changes: 4 additions & 4 deletions src/canswim/model.py
@@ -897,7 +897,7 @@ def _optuna_objective(self, trial):
"input_chunk_length",
low=168,
high=252,
step=42,
step=84, # 42,
# low=252,
# high=self.train_history,
# step=21,
@@ -912,7 +912,7 @@

# Other hyperparameters
hidden_size = trial.suggest_int(
"hidden_size", low=1024, high=2048, step=512
"hidden_size", low=2048, high=2048, step=256
) # low=256, high=1024, step=256)
num_encoder_layers = trial.suggest_int(
"num_encoder_layers", low=3, high=3
@@ -925,8 +925,8 @@
)
temporal_decoder_hidden = trial.suggest_int(
"temporal_decoder_hidden",
low=48,
high=128,
low=80, # 48,
high=112,
step=32, # low=16, high=128, step=16
)
dropout = trial.suggest_float(
2 changes: 1 addition & 1 deletion src/canswim/train.py
@@ -25,7 +25,7 @@ def build_new_model(self):
self.canswim_model.build(
input_chunk_length=168,
output_chunk_length=42,
hidden_size=1536,
hidden_size=2048,
num_encoder_layers=3,
num_decoder_layers=2,
decoder_output_dim=8,
