feat: add dividends as future covariates and splits as past covariates
Signed-off-by: ivelin <[email protected]>
ivelin committed Mar 8, 2024
1 parent 38f479d commit 56f45ac
Showing 10 changed files with 358 additions and 64 deletions.
README.md (3 changes: 1 addition & 2 deletions)

@@ -25,14 +25,13 @@ usage: canswim [-h] {dashboard,train}
  CANSWIM is a toolkit for CANSLIM style investors. It aims to complement the Simple Moving Average and other technical indicators.

  positional arguments:
-   {dashboard,gatherdata,uploaddata,modelsearch,train,finetune,forecast}
+   {dashboard,gatherdata,uploaddata,modelsearch,train,forecast}
      Which canswim task to run:
      `dashboard` - start Web App service with stock charting and forecast scans.
      `gatherdata` to gather 3rd party stock market data and save to HF Hub.
      `uploaddata` upload to HF Hub any interim changes to local data storage.
      `modelsearch` to find and save optimal hyperparameters for model training.
      `train` for continuous model training.
-     `finetune` to fine tune pretrained model on new stock market data.
      `forecast` to run forecast on stocks and upload dataset to HF Hub.

  options:
    -h, --help  show this help message and exit
example.env (3 changes: 3 additions & 0 deletions)

@@ -1,6 +1,9 @@
  #FMP_API_KEY=
  #HF_TOKEN=
+
+ # Logging level
+ #LOGURU_LEVEL=INFO

  # number of samples in each train loop. -1 for all, otherwise a number like 50, 100, 300
  n_stocks = 50
hparams.yaml (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
- input_chunk_length: 168
+ input_chunk_length: 252
  output_chunk_length: 42
  hidden_size: 2048
  num_encoder_layers: 3
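The new window sizes line up with the trading calendar, though the diff itself does not say so: 252 is the standard approximation of US trading days per year, and 42 is about two trading months. A quick arithmetic check of that (assumed) rationale:

    # Assumed rationale for the chosen window sizes; not stated in the commit.
    TRADING_DAYS_PER_YEAR = 252    # common approximation for US equity markets
    TRADING_DAYS_PER_MONTH = 21

    print(252 / TRADING_DAYS_PER_YEAR)    # 1.0 -> one year of history per input window
    print(42 / TRADING_DAYS_PER_MONTH)    # 2.0 -> two months per forecast window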
src/canswim/__main__.py (12 changes: 6 additions & 6 deletions)

@@ -17,8 +17,13 @@

import signal
import sys
from loguru import logger
from dotenv import load_dotenv

# load os env vars before loguru import
# otherwise it won't pickup LOGURU_LEVEL
load_dotenv(override=True)

from loguru import logger
import os
import argparse
from canswim import dashboard, gather_data, model_search, train, forecast
@@ -43,7 +48,6 @@
          `uploaddata` upload to HF Hub any interim changes to local train and forecast data.
          `modelsearch` to find and save optimal hyperparameters for model training.
          `train` for continuous model training.
-         `finetune` to fine tune pretrained model on new stock market data.
          `forecast` to run forecast on stocks and upload dataset to HF Hub.
          """,
          choices=[
@@ -53,7 +57,6 @@
              "uploaddata",
              "modelsearch",
              "train",
-             "finetune",
              "forecast",
          ],
      )
@@ -75,7 +78,6 @@
      args = parser.parse_args()

-     load_dotenv(override=True)
      logging_dir = os.getenv("logging_dir", "tmp")
      logging_path = f"{logging_dir}/canswim.log"
      rot = "24 hours"
@@ -116,8 +118,6 @@ def signal_handler(sig, frame):
              hfhub.upload_data()
          case "train":
              train.main(new_model=args.new_model)
-         case "finetune":
-             raise NotImplementedError("finetune task not implemented yet")
          case "forecast":
              forecast.main(forecast_start_date=args.forecast_start_date)
          case _:
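The import reshuffle above matters because Loguru reads LOGURU_LEVEL from the environment when the `loguru` module is first imported. A minimal standalone sketch of the rule the diff follows (not repo code):

    # Standalone sketch: .env values must reach os.environ *before* loguru is
    # first imported, or the LOGURU_LEVEL added to example.env has no effect.
    from dotenv import load_dotenv

    load_dotenv(override=True)  # e.g. pulls LOGURU_LEVEL=DEBUG from a local .env

    from loguru import logger  # default sink level is resolved from the env here

    logger.debug("emitted only if LOGURU_LEVEL is DEBUG or lower")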
src/canswim/covariates.py (258 changes: 225 additions & 33 deletions)

Large diffs are not rendered by default.
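Since this diff is collapsed, here is a hedged sketch of the idea the commit title describes: dividends are declared ahead of their ex-dates, so they can serve as a Darts future covariate, while splits are only known after the fact and stay past covariates. The file path handling and the "dividend" column name below are assumptions, not the actual covariates.py code:

    # Hypothetical sketch, NOT the collapsed covariates.py change: one way the
    # gathered dividend records could become a dense business-day future covariate.
    import pandas as pd
    from darts import TimeSeries

    def dividends_to_future_covariate(parquet_file: str, ticker: str) -> TimeSeries:
        df = pd.read_parquet(parquet_file)  # MultiIndex ["Symbol", "Date"]
        div = df.loc[ticker][["dividend"]]  # column name assumed from FMP payloads
        return TimeSeries.from_dataframe(
            div,
            freq="B",                 # business-day calendar
            fill_missing_dates=True,  # insert the non-event days...
            fillna_value=0.0,         # ...as zero-dividend rows
        )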

src/canswim/forecast.py (7 changes: 4 additions & 3 deletions)

@@ -13,6 +13,7 @@
  from loguru import logger
  import os
  from canswim import constants
+ from typing import List


  class CanswimForecaster:
@@ -109,17 +110,17 @@ def prep_next_stock_group(self):
          logger.info(f"Prepared forecast data for {len(stock_group)}: {stock_group}")
          yield pos

-     def save_forecast(self, forecast_list: [] = None):
+     def save_forecast(self, forecast_list: List = None):
          """Saves forecast data to local database"""

-         def _list_to_df(forecast_list: [] = None):
+         def _list_to_df(forecast_list: list = None):
              """Format list of forecasts as a dataframe to be saved as a partitioned parquet dir"""
              forecast_df = pd.DataFrame()
              for i, t in enumerate(self.canswim_model.targets_ticker_list):
                  ts = forecast_list[i]
                  pred_start = ts.start_time()
                  # logger.debug(f"Next forecast timeseries: {ts}")
-                 # normalize name of target series column ("Adj Close" -> "Close")
+                 # normalize name of target series column if needed (e.g. "Adj Close" -> "Close")
                  if self.canswim_model.target_column != "Close":
                      ts = ts.with_columns_renamed(
                          self.canswim_model.target_column, "Close"
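For context, Darts' with_columns_renamed relabels a series component without touching its values; a tiny synthetic illustration of the rename performed in _list_to_df (only the final call mirrors the diff):

    # Synthetic series; only the with_columns_renamed call mirrors the diff.
    import pandas as pd
    from darts import TimeSeries

    s = pd.Series(
        [1.0, 2.0, 3.0],
        index=pd.date_range("2024-03-04", periods=3, freq="B"),
        name="Adj Close",
    )
    ts = TimeSeries.from_series(s)
    ts = ts.with_columns_renamed("Adj Close", "Close")
    print(ts.components)  # Index(['Close'], dtype='object', name='component')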
src/canswim/gather_data.py (93 changes: 86 additions & 7 deletions)

@@ -95,6 +95,7 @@ def gather_stock_tickers(self):
          # Prepare list of stocks for training
          all_stock_set = set()
          stock_files = [
+             # "test_stocks.csv"
              "IBD50.csv",
              "IBD250.csv",
              "ibdlive_picks.csv",
@@ -300,10 +301,8 @@ def gather_stock_price_data(self):
          )

      def gather_earnings_data(self):
-         logger.info("Gathering earnings and sales data...")
-         earnings_all_df = (
-             None  # pd.DataFrame() -- Pandas prefers None to empty df in concat
-         )
+         logger.info("Gathering earnings and sales data...")
+         earnings_all_df = None  # Pandas prefers None to empty df in concat
          for ticker in self.stocks_ticker_set:  # ['AAON']: #
              earnings = fmpsdk.historical_earning_calendar(
                  apikey=self.FMP_API_KEY, symbol=ticker, limit=-1
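The accumulator change above leans on a pandas detail: pd.concat silently drops None entries from its input list, so seeding with None avoids the dtype quirks of concatenating against an empty DataFrame. A standalone demonstration of the pattern:

    # Demo of the None-seeded accumulator pattern used throughout gather_data.py.
    import pandas as pd

    all_df = None
    for chunk in (pd.DataFrame({"a": [1]}), pd.DataFrame({"a": [2]})):
        all_df = pd.concat([all_df, chunk])  # None entries are ignored by concat
    print(len(all_df))  # 2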
@@ -346,7 +345,7 @@ def gather_earnings_data(self):

      def gather_stock_key_metrics(self):
          logger.info("Gathering key metrics data with company fundamentals...")
-         keymetrics_all_df = pd.DataFrame()
+         keymetrics_all_df = None
          for ticker in self.stocks_ticker_set:
              kms = fmpsdk.key_metrics(
                  apikey=self.FMP_API_KEY, symbol=ticker, period="quarter", limit=-1
@@ -386,9 +385,87 @@
              f"Sanity check passed for key metrics data. Loaded OK from file: {kms_file}"
          )

+     def gather_stock_dividends(self):
+         logger.info("Gathering stock dividends data...")
+         all_df = None
+         for ticker in self.stocks_ticker_set:
+             logger.info(f"Gathering report for {ticker}")
+             raw = fmpsdk.historical_stock_dividend(
+                 apikey=self.FMP_API_KEY, symbol=ticker
+             )
+             # skip symbols without any data
+             if raw is not None:
+                 raw = raw.get("historical")
+             if raw is not None and len(raw) > 0:
+                 # logger.info(f"Sample raw report for {ticker}: \n{raw}")
+                 df = pd.DataFrame(raw)
+                 # logger.debug(f"df for {ticker}: \n{df}")
+                 df = df.dropna(how="all")
+                 df["date"] = pd.to_datetime(df["date"])
+                 df["symbol"] = ticker
+                 df = df.set_index(["symbol", "date"])
+                 all_df = pd.concat([all_df, df])
+                 logger.debug(f"Total reports for {ticker}: {len(df)}")
+
+         if all_df is not None:
+             logger.debug(f"Sample report for {ticker}: \n{df}")
+             all_df.index.names = ["Symbol", "Date"]
+             all_df = all_df.sort_index()
+             logger.info(f"Total number of records for all stocks: \n{len(all_df)}")
+             logger.debug(f"all_df: \n{all_df}")
+             logger.info(f"len(all_df.index.levels[0]): \n{len(all_df.index.levels[0])}")
+             file = f"{self.data_dir}/{self.data_3rd_party}/stock_dividends.parquet"
+             all_df.to_parquet(file)
+             ### Read back data and verify it
+             tmp_df = pd.read_parquet(file)
+             logger.debug(f"tmp_df: \n{tmp_df}")
+             assert tmp_df.index.names == ["Symbol", "Date"]
+             assert len(tmp_df) == len(all_df)
+             assert sorted(tmp_df.columns) == sorted(all_df.columns)
+             logger.info(f"Sanity check passed. Data loaded OK from file: {file}")
+         logger.info("Finished gathering stock dividends data.")

+     def gather_stock_splits(self):
+         logger.info("Gathering stock splits data...")
+         all_df = None
+         for ticker in self.stocks_ticker_set:
+             logger.info(f"Gathering report for {ticker}")
+             raw = fmpsdk.historical_stock_split(apikey=self.FMP_API_KEY, symbol=ticker)
+             # skip symbols without any data
+             if raw is not None:
+                 raw = raw.get("historical")
+             if raw is not None and len(raw) > 0:
+                 # logger.debug(f"Sample raw report for {ticker}: \n{raw}")
+                 df = pd.DataFrame(raw)
+                 # logger.debug(f"df for {ticker}: \n{df}")
+                 df = df.dropna(how="all")
+                 df["date"] = pd.to_datetime(df["date"])
+                 df["symbol"] = ticker
+                 df = df.set_index(["symbol", "date"])
+                 all_df = pd.concat([all_df, df])
+                 logger.debug(f"Total reports for {ticker}: {len(df)}")
+
+         if all_df is not None:
+             logger.debug(f"Sample report for {ticker}: \n{df}")
+             all_df.index.names = ["Symbol", "Date"]
+             all_df = all_df.sort_index()
+             logger.info(f"Total number of records for all stocks: \n{len(all_df)}")
+             logger.debug(f"all_df: \n{all_df}")
+             logger.info(f"len(all_df.index.levels[0]): \n{len(all_df.index.levels[0])}")
+             file = f"{self.data_dir}/{self.data_3rd_party}/stock_splits.parquet"
+             all_df.to_parquet(file)
+             ### Read back data and verify it
+             tmp_df = pd.read_parquet(file)
+             logger.debug(f"tmp_df: \n{tmp_df}")
+             assert tmp_df.index.names == ["Symbol", "Date"]
+             assert len(tmp_df) == len(all_df)
+             assert sorted(tmp_df.columns) == sorted(all_df.columns)
+             logger.info(f"Sanity check passed. Data loaded OK from file: {file}")
+         logger.info("Finished gathering stock splits data.")

      def gather_institutional_stock_ownership(self):
          logger.info("Gathering institutional ownership data...")
-         inst_ownership_all_df = pd.DataFrame()
+         inst_ownership_all_df = None
          for ticker in self.stocks_ticker_set:
              inst_ownership = institutional_symbol_ownership(
                  apikey=self.FMP_API_KEY,
@@ -479,7 +556,7 @@ def gather_analyst_estimates(self):

          def _fetch_estimates(period=None):
              assert period in ["quarter", "annual"]
-             estimates_all_df = pd.DataFrame()
+             estimates_all_df = None
              for ticker in self.stocks_ticker_set:  # ['ALTR']:
                  est = analyst_estimates(
                      apikey=self.FMP_API_KEY, symbol=ticker, period=period, limit=-1
@@ -537,6 +614,8 @@ def main():
      g.gather_industry_fund_data()
      g.gather_stock_tickers()
      g.gather_stock_price_data()
+     g.gather_stock_dividends()
+     g.gather_stock_splits()
      g.gather_earnings_data()
      g.gather_stock_key_metrics()
      g.gather_institutional_stock_ownership()
src/canswim/model.py (35 changes: 27 additions & 8 deletions)

@@ -55,7 +55,7 @@ def __init__(self):
          self.n_plot_samples: int = 4
          self.train_date_start: pd.Timestamp = None
          self.targets = Targets()
-         self.target_column = "Adj Close"
+         self.target_column = "Close"
          self.covariates = Covariates()
          self.hfhub = HFHub()
          # use GPU if available
@@ -330,8 +330,6 @@ def __load_config(self):
          self.n_stocks = int(
              os.getenv("n_stocks", 50)
          )  # -1 for all, otherwise a number like 300
-         if self.n_stocks == -1:
-             self.n_stocks = len(self.stock_train_list)
          logger.info("n_stocks: {ns}", ns=self.n_stocks)

          # model training epochs
@@ -491,7 +489,9 @@ def __build_model(self, **kwargs):

          encoders = {
              "cyclic": {"future": ["dayofweek", "month", "quarter"]},
-             "datetime_attribute": {"future": ["dayofweek", "month", "quarter", "year"]},
+             "datetime_attribute": {
+                 "future": ["dayofweek", "day", "month", "quarter", "year"]
+             },
              "position": {"past": ["relative"], "future": ["relative"]},
              "custom": {
                  "future": [election_year_offset]
@@ -578,15 +578,31 @@ def train(self):
          )
          assert supports_multi_ts is True
          # train model
-         # for i in range(100):
          logger.info("Starting model training...")
+         assert (
+             self.target_train_list is not None and len(self.target_train_list) > 0
+         ), "Targets train list must not be empty"
+         assert len(self.target_train_list) == len(
+             self.past_cov_list
+         ), f"""Past covs series list must be the exact same length as targets list
+             {len(self.target_train_list)} != {len(self.past_cov_list)}
+         """
+         assert len(self.target_train_list) == len(
+             self.future_cov_list
+         ), f"""Future covs series list must be the exact same length as targets list
+             {len(self.target_train_list)} != {len(self.future_cov_list)}
+         """
+         assert len(self.target_train_list) == len(
+             self.target_val_list
+         ), f"""Validation series list must be the exact same length as targets list
+             {len(self.target_train_list)} != {len(self.target_val_list)}
+         """
          self.torch_model.fit(
              self.target_train_list,
              epochs=self.n_epochs,
              past_covariates=self.past_cov_list,
              future_covariates=self.future_cov_list,
              val_series=self.target_val_list,
-             ##val_past_covariates=self.past_cov_val_list,
+             val_past_covariates=self.past_cov_list,
              val_future_covariates=self.future_cov_list,
              verbose=True,
@@ -672,7 +688,10 @@ def load_data(
          logger.info(f"Loaded {len(all_stock_tickers)} symbols in total")
          stock_set = list(set(all_stock_tickers["Symbol"]))
          # reduce ticker set to a workable sample size for one training loop
-         self.__stock_tickers = random.sample(stock_set, self.n_stocks)
+         if self.n_stocks > 0 and self.n_stocks < len(stock_set):
+             self.__stock_tickers = random.sample(stock_set, self.n_stocks)
+         else:
+             self.__stock_tickers = stock_set
          logger.info(
              f"Training loop stock subset has {len(self.stock_tickers)} tickers: ",
              self.stock_tickers,
@@ -868,7 +887,7 @@ def backtest(
      def plot_backtest_results(
          self,
          target: TimeSeries = None,
-         backtest: [TimeSeries] = None,
+         backtest: List[TimeSeries] = None,
          start: pd.Timestamp = None,
          forecast_horizon: int = None,
      ):
src/canswim/targets.py (7 changes: 4 additions & 3 deletions)

@@ -52,11 +52,12 @@ def load_stock_prices(self):
# logger.info(f"validating price data for {t}")
stock_full_hist = stocks_df.loc[[t]]
if len(stock_full_hist.index) >= self.min_samples:
# UPDATE: Do not drop Close as it carries unique information about the relationships between OHLC and Adj Close
# We also keep Adj Close which takes into account dividends and splits
stock_full_hist = stock_full_hist.droplevel("Symbol")
stock_full_hist.index = pd.to_datetime(stock_full_hist.index)
stock_price_dict[t] = stock_full_hist # .drop(columns=['Close'])
# Drop Adj Close because yfiance changes its values retroactively at future dates
# after stock dividend or split dates, which makes training data less stable
# Ref: https://help.yahoo.com/kb/adjusted-close-sln28256.html
stock_price_dict[t] = stock_full_hist.drop(columns=["Adj Close"])
# logger.info(f'ticker: {t}')
# logger.info(f'ticker historic data: {ticker_dict[t]}')
else:
src/canswim/train.py (2 changes: 1 addition & 1 deletion)

@@ -23,7 +23,7 @@ def prepare_data(self):
      def build_new_model(self):
          """Build a new model using known optimal hyperparameters"""
          self.canswim_model.build(
-             input_chunk_length=168,
+             input_chunk_length=252,
              output_chunk_length=42,
              hidden_size=2048,
              num_encoder_layers=3,