feat: add dividends as future covariates and splits as past covariates
Signed-off-by: ivelin <[email protected]>
ivelin committed Mar 8, 2024
1 parent 38f479d commit 56f45ac
Showing 10 changed files with 358 additions and 64 deletions.
README.md (3 changes: 1 addition & 2 deletions)

@@ -25,14 +25,13 @@ usage: canswim [-h] {dashboard,train}
  CANSWIM is a toolkit for CANSLIM style investors. It aims to complement the Simple Moving Average and other technical indicators.

  positional arguments:
-   {dashboard,gatherdata,uploaddata,modelsearch,train,finetune,forecast}
+   {dashboard,gatherdata,uploaddata,modelsearch,train,forecast}
      Which canswim task to run:
      `dashboard` - start Web App service with stock charting and forecast scans.
      `gatherdata` to gather 3rd party stock market data and save to HF Hub.
      `uploaddata` upload to HF Hub any interim changes to local data storage.
      `modelsearch` to find and save optimal hyperparameters for model training.
      `train` for continuous model training.
-     `finetune` to fine tune pretrained model on new stock market data.
      `forecast` to run forecast on stocks and upload dataset to HF Hub.

  options:
    -h, --help  show this help message and exit
example.env (3 changes: 3 additions & 0 deletions)

@@ -1,6 +1,9 @@
  #FMP_API_KEY=
  #HF_TOKEN=
+
+ # Logging level
+ #LOGURU_LEVEL=INFO

  # number of samples in each train loop. -1 for all, otherwise a number like 50, 100, 300
  n_stocks = 50
hparams.yaml (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
- input_chunk_length: 168
+ input_chunk_length: 252
  output_chunk_length: 42
  hidden_size: 2048
  num_encoder_layers: 3
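The new window sizes line up with the trading calendar, though the diff itself does not say so: 252 is the standard approximation of US trading days per year, and 42 is about two trading months. A quick arithmetic check of that (assumed) rationale:

    # Assumed rationale for the chosen window sizes; not stated in the commit.
    TRADING_DAYS_PER_YEAR = 252    # common approximation for US equity markets
    TRADING_DAYS_PER_MONTH = 21

    print(252 / TRADING_DAYS_PER_YEAR)    # 1.0 -> one year of history per input window
    print(42 / TRADING_DAYS_PER_MONTH)    # 2.0 -> two months per forecast window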
src/canswim/__main__.py (12 changes: 6 additions & 6 deletions)

@@ -17,8 +17,13 @@

import signal
import sys
from loguru import logger
from dotenv import load_dotenv

# load os env vars before loguru import
# otherwise it won't pickup LOGURU_LEVEL
load_dotenv(override=True)

from loguru import logger
import os
import argparse
from canswim import dashboard, gather_data, model_search, train, forecast
@@ -43,7 +48,6 @@
          `uploaddata` upload to HF Hub any interim changes to local train and forecast data.
          `modelsearch` to find and save optimal hyperparameters for model training.
          `train` for continuous model training.
-         `finetune` to fine tune pretrained model on new stock market data.
          `forecast` to run forecast on stocks and upload dataset to HF Hub.
          """,
          choices=[
@@ -53,7 +57,6 @@
              "uploaddata",
              "modelsearch",
              "train",
-             "finetune",
              "forecast",
          ],
      )
@@ -75,7 +78,6 @@
      args = parser.parse_args()

-     load_dotenv(override=True)
      logging_dir = os.getenv("logging_dir", "tmp")
      logging_path = f"{logging_dir}/canswim.log"
      rot = "24 hours"
@@ -116,8 +118,6 @@ def signal_handler(sig, frame):
              hfhub.upload_data()
          case "train":
              train.main(new_model=args.new_model)
-         case "finetune":
-             raise NotImplementedError("finetune task not implemented yet")
          case "forecast":
              forecast.main(forecast_start_date=args.forecast_start_date)
          case _:
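The import reshuffle above matters because Loguru reads LOGURU_LEVEL from the environment when the `loguru` module is first imported. A minimal standalone sketch of the rule the diff follows (not repo code):

    # Standalone sketch: .env values must reach os.environ *before* loguru is
    # first imported, or the LOGURU_LEVEL added to example.env has no effect.
    from dotenv import load_dotenv

    load_dotenv(override=True)  # e.g. pulls LOGURU_LEVEL=DEBUG from a local .env

    from loguru import logger  # default sink level is resolved from the env here

    logger.debug("emitted only if LOGURU_LEVEL is DEBUG or lower")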
src/canswim/covariates.py (258 changes: 225 additions & 33 deletions)

Large diffs are not rendered by default.
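Since this diff is collapsed, here is a hedged sketch of the idea the commit title describes: dividends are declared ahead of their ex-dates, so they can serve as a Darts future covariate, while splits are only known after the fact and stay past covariates. The file path handling and the "dividend" column name below are assumptions, not the actual covariates.py code:

    # Hypothetical sketch, NOT the collapsed covariates.py change: one way the
    # gathered dividend records could become a dense business-day future covariate.
    import pandas as pd
    from darts import TimeSeries

    def dividends_to_future_covariate(parquet_file: str, ticker: str) -> TimeSeries:
        df = pd.read_parquet(parquet_file)  # MultiIndex ["Symbol", "Date"]
        div = df.loc[ticker][["dividend"]]  # column name assumed from FMP payloads
        return TimeSeries.from_dataframe(
            div,
            freq="B",                 # business-day calendar
            fill_missing_dates=True,  # insert the non-event days...
            fillna_value=0.0,         # ...as zero-dividend rows
        )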

src/canswim/forecast.py (7 changes: 4 additions & 3 deletions)

@@ -13,6 +13,7 @@
  from loguru import logger
  import os
  from canswim import constants
+ from typing import List


  class CanswimForecaster:
@@ -109,17 +110,17 @@ def prep_next_stock_group(self):
          logger.info(f"Prepared forecast data for {len(stock_group)}: {stock_group}")
          yield pos

-     def save_forecast(self, forecast_list: [] = None):
+     def save_forecast(self, forecast_list: List = None):
          """Saves forecast data to local database"""

-         def _list_to_df(forecast_list: [] = None):
+         def _list_to_df(forecast_list: list = None):
              """Format list of forecasts as a dataframe to be saved as a partitioned parquet dir"""
              forecast_df = pd.DataFrame()
              for i, t in enumerate(self.canswim_model.targets_ticker_list):
                  ts = forecast_list[i]
                  pred_start = ts.start_time()
                  # logger.debug(f"Next forecast timeseries: {ts}")
-                 # normalize name of target series column ("Adj Close" -> "Close")
+                 # normalize name of target series column if needed (e.g. "Adj Close" -> "Close")
                  if self.canswim_model.target_column != "Close":
                      ts = ts.with_columns_renamed(
                          self.canswim_model.target_column, "Close"
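For context, Darts' with_columns_renamed relabels a series component without touching its values; a tiny synthetic illustration of the rename performed in _list_to_df (only the final call mirrors the diff):

    # Synthetic series; only the with_columns_renamed call mirrors the diff.
    import pandas as pd
    from darts import TimeSeries

    s = pd.Series(
        [1.0, 2.0, 3.0],
        index=pd.date_range("2024-03-04", periods=3, freq="B"),
        name="Adj Close",
    )
    ts = TimeSeries.from_series(s)
    ts = ts.with_columns_renamed("Adj Close", "Close")
    print(ts.components)  # Index(['Close'], dtype='object', name='component')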
src/canswim/gather_data.py (93 changes: 86 additions & 7 deletions)

@@ -95,6 +95,7 @@ def gather_stock_tickers(self):
          # Prepare list of stocks for training
          all_stock_set = set()
          stock_files = [
+             # "test_stocks.csv"
              "IBD50.csv",
              "IBD250.csv",
              "ibdlive_picks.csv",
@@ -300,10 +301,8 @@ def gather_stock_price_data(self):
          )

      def gather_earnings_data(self):
-         logger.info("Gathering earnings and sales data...")
-         earnings_all_df = (
-             None  # pd.DataFrame() -- Pandas prefers None to empty df in concat
-         )
+         logger.info("Gathering earnings and sales data...")
+         earnings_all_df = None  # Pandas prefers None to empty df in concat
          for ticker in self.stocks_ticker_set:  # ['AAON']: #
              earnings = fmpsdk.historical_earning_calendar(
                  apikey=self.FMP_API_KEY, symbol=ticker, limit=-1
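The accumulator change above leans on a pandas detail: pd.concat silently drops None entries from its input list, so seeding with None avoids the dtype quirks of concatenating against an empty DataFrame. A standalone demonstration of the pattern:

    # Demo of the None-seeded accumulator pattern used throughout gather_data.py.
    import pandas as pd

    all_df = None
    for chunk in (pd.DataFrame({"a": [1]}), pd.DataFrame({"a": [2]})):
        all_df = pd.concat([all_df, chunk])  # None entries are ignored by concat
    print(len(all_df))  # 2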
@@ -346,7 +345,7 @@ def gather_earnings_data(self):

      def gather_stock_key_metrics(self):
          logger.info("Gathering key metrics data with company fundamentals...")
-         keymetrics_all_df = pd.DataFrame()
+         keymetrics_all_df = None
          for ticker in self.stocks_ticker_set:
              kms = fmpsdk.key_metrics(
                  apikey=self.FMP_API_KEY, symbol=ticker, period="quarter", limit=-1
@@ -386,9 +385,87 @@
              f"Sanity check passed for key metrics data. Loaded OK from file: {kms_file}"
          )

+     def gather_stock_dividends(self):
+         logger.info("Gathering stock dividends data...")
+         all_df = None
+         for ticker in self.stocks_ticker_set:
+             logger.info(f"Gathering report for {ticker}")
+             raw = fmpsdk.historical_stock_dividend(
+                 apikey=self.FMP_API_KEY, symbol=ticker
+             )
+             # skip symbols without any data
+             if raw is not None:
+                 raw = raw.get("historical")
+             if raw is not None and len(raw) > 0:
+                 # logger.info(f"Sample raw report for {ticker}: \n{raw}")
+                 df = pd.DataFrame(raw)
+                 # logger.debug(f"df for {ticker}: \n{df}")
+                 df = df.dropna(how="all")
+                 df["date"] = pd.to_datetime(df["date"])
+                 df["symbol"] = ticker
+                 df = df.set_index(["symbol", "date"])
+                 all_df = pd.concat([all_df, df])
+                 logger.debug(f"Total reports for {ticker}: {len(df)}")
+
+         if all_df is not None:
+             logger.debug(f"Sample report for {ticker}: \n{df}")
+             all_df.index.names = ["Symbol", "Date"]
+             all_df = all_df.sort_index()
+             logger.info(f"Total number of records for all stocks: \n{len(all_df)}")
+             logger.debug(f"all_df: \n{all_df}")
+             logger.info(f"len(all_df.index.levels[0]): \n{len(all_df.index.levels[0])}")
+             file = f"{self.data_dir}/{self.data_3rd_party}/stock_dividends.parquet"
+             all_df.to_parquet(file)
+             ### Read back data and verify it
+             tmp_df = pd.read_parquet(file)
+             logger.debug(f"tmp_df: \n{tmp_df}")
+             assert tmp_df.index.names == ["Symbol", "Date"]
+             assert len(tmp_df) == len(all_df)
+             assert sorted(tmp_df.columns) == sorted(all_df.columns)
+             logger.info(f"Sanity check passed. Data loaded OK from file: {file}")
+         logger.info("Finished gathering stock dividends data.")

+     def gather_stock_splits(self):
+         logger.info("Gathering stock splits data...")
+         all_df = None
+         for ticker in self.stocks_ticker_set:
+             logger.info(f"Gathering report for {ticker}")
+             raw = fmpsdk.historical_stock_split(apikey=self.FMP_API_KEY, symbol=ticker)
+             # skip symbols without any data
+             if raw is not None:
+                 raw = raw.get("historical")
+             if raw is not None and len(raw) > 0:
+                 # logger.debug(f"Sample raw report for {ticker}: \n{raw}")
+                 df = pd.DataFrame(raw)
+                 # logger.debug(f"df for {ticker}: \n{df}")
+                 df = df.dropna(how="all")
+                 df["date"] = pd.to_datetime(df["date"])
+                 df["symbol"] = ticker
+                 df = df.set_index(["symbol", "date"])
+                 all_df = pd.concat([all_df, df])
+                 logger.debug(f"Total reports for {ticker}: {len(df)}")
+
+         if all_df is not None:
+             logger.debug(f"Sample report for {ticker}: \n{df}")
+             all_df.index.names = ["Symbol", "Date"]
+             all_df = all_df.sort_index()
+             logger.info(f"Total number of records for all stocks: \n{len(all_df)}")
+             logger.debug(f"all_df: \n{all_df}")
+             logger.info(f"len(all_df.index.levels[0]): \n{len(all_df.index.levels[0])}")
+             file = f"{self.data_dir}/{self.data_3rd_party}/stock_splits.parquet"
+             all_df.to_parquet(file)
+             ### Read back data and verify it
+             tmp_df = pd.read_parquet(file)
+             logger.debug(f"tmp_df: \n{tmp_df}")
+             assert tmp_df.index.names == ["Symbol", "Date"]
+             assert len(tmp_df) == len(all_df)
+             assert sorted(tmp_df.columns) == sorted(all_df.columns)
+             logger.info(f"Sanity check passed. Data loaded OK from file: {file}")
+         logger.info("Finished gathering stock splits data.")

      def gather_institutional_stock_ownership(self):
          logger.info("Gathering institutional ownership data...")
-         inst_ownership_all_df = pd.DataFrame()
+         inst_ownership_all_df = None
          for ticker in self.stocks_ticker_set:
              inst_ownership = institutional_symbol_ownership(
                  apikey=self.FMP_API_KEY,
@@ -479,7 +556,7 @@ def gather_analyst_estimates(self):

          def _fetch_estimates(period=None):
              assert period in ["quarter", "annual"]
-             estimates_all_df = pd.DataFrame()
+             estimates_all_df = None
              for ticker in self.stocks_ticker_set:  # ['ALTR']:
                  est = analyst_estimates(
                      apikey=self.FMP_API_KEY, symbol=ticker, period=period, limit=-1
@@ -537,6 +614,8 @@ def main():
      g.gather_industry_fund_data()
      g.gather_stock_tickers()
      g.gather_stock_price_data()
+     g.gather_stock_dividends()
+     g.gather_stock_splits()
      g.gather_earnings_data()
      g.gather_stock_key_metrics()
      g.gather_institutional_stock_ownership()
src/canswim/model.py (35 changes: 27 additions & 8 deletions)

@@ -55,7 +55,7 @@ def __init__(self):
          self.n_plot_samples: int = 4
          self.train_date_start: pd.Timestamp = None
          self.targets = Targets()
-         self.target_column = "Adj Close"
+         self.target_column = "Close"
          self.covariates = Covariates()
          self.hfhub = HFHub()
          # use GPU if available
@@ -330,8 +330,6 @@ def __load_config(self):
          self.n_stocks = int(
              os.getenv("n_stocks", 50)
          )  # -1 for all, otherwise a number like 300
-         if self.n_stocks == -1:
-             self.n_stocks = len(self.stock_train_list)
          logger.info("n_stocks: {ns}", ns=self.n_stocks)

          # model training epochs
@@ -491,7 +489,9 @@ def __build_model(self, **kwargs):

          encoders = {
              "cyclic": {"future": ["dayofweek", "month", "quarter"]},
-             "datetime_attribute": {"future": ["dayofweek", "month", "quarter", "year"]},
+             "datetime_attribute": {
+                 "future": ["dayofweek", "day", "month", "quarter", "year"]
+             },
              "position": {"past": ["relative"], "future": ["relative"]},
              "custom": {
                  "future": [election_year_offset]
@@ -578,15 +578,31 @@ def train(self):
          )
          assert supports_multi_ts is True
          # train model
-         # for i in range(100):
          logger.info("Starting model training...")
+         assert (
+             self.target_train_list is not None and len(self.target_train_list) > 0
+         ), "Targets train list must not be empty"
+         assert len(self.target_train_list) == len(
+             self.past_cov_list
+         ), f"""Past covs series list must be the exact same length as targets list
+             {len(self.target_train_list)} != {len(self.past_cov_list)}
+         """
+         assert len(self.target_train_list) == len(
+             self.future_cov_list
+         ), f"""Future covs series list must be the exact same length as targets list
+             {len(self.target_train_list)} != {len(self.future_cov_list)}
+         """
+         assert len(self.target_train_list) == len(
+             self.target_val_list
+         ), f"""Validation series list must be the exact same length as targets list
+             {len(self.target_train_list)} != {len(self.target_val_list)}
+         """
          self.torch_model.fit(
              self.target_train_list,
              epochs=self.n_epochs,
              past_covariates=self.past_cov_list,
              future_covariates=self.future_cov_list,
              val_series=self.target_val_list,
-             ##val_past_covariates=self.past_cov_val_list,
+             val_past_covariates=self.past_cov_list,
              val_future_covariates=self.future_cov_list,
              verbose=True,
@@ -672,7 +688,10 @@ def load_data(
          logger.info(f"Loaded {len(all_stock_tickers)} symbols in total")
          stock_set = list(set(all_stock_tickers["Symbol"]))
          # reduce ticker set to a workable sample size for one training loop
-         self.__stock_tickers = random.sample(stock_set, self.n_stocks)
+         if self.n_stocks > 0 and self.n_stocks < len(stock_set):
+             self.__stock_tickers = random.sample(stock_set, self.n_stocks)
+         else:
+             self.__stock_tickers = stock_set
          logger.info(
              f"Training loop stock subset has {len(self.stock_tickers)} tickers: ",
              self.stock_tickers,
@@ -868,7 +887,7 @@ def backtest(
      def plot_backtest_results(
          self,
          target: TimeSeries = None,
-         backtest: [TimeSeries] = None,
+         backtest: List[TimeSeries] = None,
          start: pd.Timestamp = None,
          forecast_horizon: int = None,
      ):
src/canswim/targets.py (7 changes: 4 additions & 3 deletions)

@@ -52,11 +52,12 @@ def load_stock_prices(self):
# logger.info(f"validating price data for {t}")
stock_full_hist = stocks_df.loc[[t]]
if len(stock_full_hist.index) >= self.min_samples:
# UPDATE: Do not drop Close as it carries unique information about the relationships between OHLC and Adj Close
# We also keep Adj Close which takes into account dividends and splits
stock_full_hist = stock_full_hist.droplevel("Symbol")
stock_full_hist.index = pd.to_datetime(stock_full_hist.index)
stock_price_dict[t] = stock_full_hist # .drop(columns=['Close'])
# Drop Adj Close because yfiance changes its values retroactively at future dates
# after stock dividend or split dates, which makes training data less stable
# Ref: https://help.yahoo.com/kb/adjusted-close-sln28256.html
stock_price_dict[t] = stock_full_hist.drop(columns=["Adj Close"])
# logger.info(f'ticker: {t}')
# logger.info(f'ticker historic data: {ticker_dict[t]}')
else:
src/canswim/train.py (2 changes: 1 addition & 1 deletion)

@@ -23,7 +23,7 @@ def prepare_data(self):
      def build_new_model(self):
          """Build a new model using known optimal hyperparameters"""
          self.canswim_model.build(
-             input_chunk_length=168,
+             input_chunk_length=252,
              output_chunk_length=42,
              hidden_size=2048,
              num_encoder_layers=3,