
Commit c89e942
refactor out dataclasses that we're not using, Get cache data set with extended features working
j03m committed Oct 2, 2024
1 parent 0582615 commit c89e942
Showing 11 changed files with 130 additions and 211 deletions.
1 change: 1 addition & 0 deletions lists/baby_training_list.csv
@@ -10,3 +10,4 @@ AAPL
 TSLA
 T
 ^VIX
+SPY
12 changes: 0 additions & 12 deletions lists/symbols.csv
@@ -1,13 +1 @@
 Symbols
-AAPL
-TER
-KODK
-T
-IBM
-MSFT
-^VIX
-VNDA
-PHM
-GE
-TSLA
-LUMN
39 changes: 1 addition & 38 deletions scripts/create-final-data-set.py
@@ -1,12 +1,8 @@
 #!/usr/bin/env python3
 
-from alfred.data import attach_moving_average_diffs, scale_relevant_training_columns, read_file
-from alfred.utils import CustomScaler
+from alfred.data import attach_moving_average_diffs, read_file
 import argparse
 import os
-import joblib
 import pandas as pd
-from sklearn.decomposition import PCA
 
 initial_columns_to_keep = [
     "Symbol",
@@ -21,15 +17,6 @@
     'Margin_Net_Profit'
 ]
 
-scaler_config = [
-    {'regex': r'^Close$', 'type': 'log_returns'},
-    {'regex': r'^VIX.*', 'type': 'standard'},
-    {'regex': r'^Margin.*', 'type': 'standard'},
-    {'regex': r'^Volume$', 'type': 'log_returns'},
-    {'columns': ['reportedEPS', 'estimatedEPS', 'surprise', 'surprisePercentage'], 'type': 'standard'},
-    {'regex': r'\d+year', 'type': 'standard'}
-]
-
 def add_vix(final_df, args):
     vix = pd.read_csv(f"{args.data}/^VIX.csv")
     vix.index = pd.to_datetime(vix['Date'])
@@ -165,13 +152,6 @@ def main():
         new_file_path = os.path.join(args.data, f"{symbol}_unscaled.csv")
         frame.to_csv(new_file_path)
 
-        scaler = CustomScaler(scaler_config, frame)
-        scaled_df = scaler.fit_transform(frame)
-        scaled_file_path = os.path.join(args.data, f"{symbol}_scaled.csv")
-        scaled_df.to_csv(scaled_file_path)
-        scaler.serialize(os.path.join(args.data, f"{symbol}_scaler.joblib"))
-
-
 def finalize_single_data_file(args, ticker_data_frames):
     final_df = pd.concat(ticker_data_frames)
     final_df = add_vix(final_df, args)
@@ -187,23 +167,6 @@ def finalize_single_data_file(args, ticker_data_frames):
     new_file_name = f"{file_name}_processed_unscaled{file_extension}"
     new_file_path = os.path.join(args.data, new_file_name)
     final_df.to_csv(new_file_path)
-    # continue scaling
-    scaler = CustomScaler(scaler_config, final_df)
-    scaled_df = scaler.fit_transform(final_df)
-    assert not scaled_df.isnull().any().any(), f"scaled df has null after transform"
-    if args.debug:
-        for column in final_df.columns:
-            print("column: ", column,
-                  "min value: ", final_df[column].min(),
-                  "max value: ", final_df[column].max(),
-                  "min scaled: ", scaled_df[column].min(),
-                  "max scaled: ", scaled_df[column].max())
-    # save the scaled data file (final)
-    new_file_name = f"{file_name}_processed_scaled{file_extension}"
-    new_file_path = os.path.join(args.data, new_file_name)
-    scaled_df.to_csv(new_file_path)
-    scaler.serialize(os.path.join(args.data, f"{file_name}_scaler.joblib"))
-
 
 if __name__ == "__main__":
     main()
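
With the CustomScaler stages deleted, this script now stops at the per-symbol {symbol}_unscaled.csv files and the single processed_unscaled CSV; scaling moves downstream to the dataset layer. As a sketch of that hand-off only — CachedStockDataSet's internals are not part of this diff, and the AAPL file name is illustrative — the unscaled output can be consumed the same way the call added in scripts/train-test-comparison.py below does:

from alfred.data import CachedStockDataSet

# Load an unscaled cache file written by create-final-data-set.py and scale at read time.
# Argument names mirror the CachedStockDataSet call added in train-test-comparison.py.
dataset = CachedStockDataSet(file="./data/AAPL_unscaled.csv",
                             start="1999-01-01",
                             end="2021-01-01",
                             scaler_config=[{'regex': r'^Close$', 'type': 'log_returns'}],
                             sequence_length=30,
                             feature_columns=["Close"],
                             target_columns=["Close"])
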
2 changes: 0 additions & 2 deletions scripts/educational/log_returns.py
@@ -1,5 +1,3 @@
-from sympy.benchmarks.bench_discrete_log import data_set_1
-
 from alfred.data import YahooNextCloseWindowDataSet
 import matplotlib.pyplot as plt
 from matplotlib.pyplot import figure
57 changes: 37 additions & 20 deletions scripts/train-test-comparison.py
@@ -3,8 +3,7 @@
 import torch.optim as optim
 from torch.utils.data import DataLoader
 from alfred.models import LSTMModel, Stockformer, AdvancedLSTM, LinearSeries, LinearConv1dSeries, LSTMConv1d, TransAm
-from alfred.data import YahooNextCloseWindowDataSet, YahooChangeWindowDataSet, YahooDirectionWindowDataSet, \
-    YahooChangeSeriesWindowDataSet
+from alfred.data import YahooNextCloseWindowDataSet, CachedStockDataSet
 from alfred.model_persistence import maybe_save_model, get_latest_model
 from statistics import mean
 from sklearn.metrics import mean_squared_error
@@ -15,6 +14,7 @@
 from matplotlib.pyplot import figure
 import numpy as np
 import faulthandler
+
 faulthandler.enable()
 
 np.random.seed(42)
@@ -27,16 +27,26 @@
 BATCH_SIZE = 64
 SIZE = 32
 
 
-def get_simple_yahoo_data_loader(ticker, start, end, seq_length, predict_type, window=1):
-    if predict_type == "change":
-        dataset = YahooChangeWindowDataSet(ticker, start, end, seq_length, change=window)
-        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True), dataset
-    elif predict_type == "change-series":
-        dataset = YahooChangeSeriesWindowDataSet(ticker, start, end, seq_length, change=window)
-        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True), dataset
-    elif predict_type == "direction":
-        dataset = YahooDirectionWindowDataSet(ticker, start, end, seq_length, change=window)
-        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True), dataset
+scaler_config = [
+    {'regex': r'^Close$', 'type': 'log_returns'},
+    {'regex': r'^VIX.*', 'type': 'standard'},
+    {'regex': r'^Margin.*', 'type': 'standard'},
+    {'regex': r'^Volume$', 'type': 'log_returns'},
+    {'columns': ['reportedEPS', 'estimatedEPS', 'surprise', 'surprisePercentage'], 'type': 'standard'},
+    {'regex': r'\d+year', 'type': 'standard'}
+]
+
+
+def get_simple_yahoo_data_loader(ticker, start, end, seq_length, predict_type, window=1, use_cache=False):
+    if use_cache:  # assume close only but test cache
+        file = f"./data/{ticker}_unscaled.csv"
+        dataset = CachedStockDataSet(file=file,
+                                     start=start,
+                                     end=end,
+                                     scaler_config=scaler_config,
+                                     sequence_length=seq_length,
+                                     feature_columns=["Close"],
+                                     target_columns=["Close"])
+        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True), dataset
     elif predict_type == "price":
         dataset = YahooNextCloseWindowDataSet(ticker, start, end, seq_length, change=window, log_return_scaler=True)
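
Each scaler_config entry pairs a column selector (an explicit 'columns' list, else a regex over column names) with a scaler type. CustomScaler's actual implementation is not shown in this commit, but as a minimal sketch of how such a config could be resolved against a DataFrame, assuming pandas, numpy, and scikit-learn:

import re
import numpy as np
from sklearn.preprocessing import StandardScaler

def apply_scaler_config(config, df):
    out = df.copy()
    for entry in config:
        # An explicit 'columns' list wins; otherwise match column names by regex.
        if 'columns' in entry:
            cols = [c for c in entry['columns'] if c in out.columns]
        else:
            cols = [c for c in out.columns if re.search(entry['regex'], c)]
        if not cols:
            continue
        if entry['type'] == 'standard':
            out[cols] = StandardScaler().fit_transform(out[cols])
        elif entry['type'] == 'log_returns':
            # log(p_t / p_{t-1}); the first row becomes NaN and must be dropped by the caller
            out[cols] = np.log(out[cols] / out[cols].shift(1))
    return out
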
@@ -132,12 +142,17 @@ def main():
                         help="some datasets need a window, for example when predicting the next change")
     parser.add_argument("--make-plots", action='store_true',
                         help="plot all data")
+    parser.add_argument("--use-cache", action='store_true',
+                        help="use data cache files")
     parser.add_argument("--epochs", type=int, default=100, help="epochs")
+    parser.add_argument("--ticker", type=str, default="SPY", help="symbol to train or eval")
+    parser.add_argument("--start", type=str, default='1999-01-01', help="start date")
+    parser.add_argument("--end", type=str, default='2021-01-01', help="end date")
     args = parser.parse_args()
 
-    ticker = 'SPY'
-    start_date = '1999-01-01'
-    end_date = '2021-01-01'
+    ticker = args.ticker
+    start_date = args.start
+    end_date = args.end
     seq_length = 30
     num_features = 1
     output = 1
@@ -149,7 +164,7 @@ def main():
                           num_layers=layers).to(device)
 
     elif args.model_token == 'stockformer':
-        model = Stockformer(1,1)
+        model = Stockformer(1, 1)
     elif args.model_token == 'advanced-lstm':
         model = AdvancedLSTM(features=num_features, hidden_dim=SIZE, output_dim=output)
     elif args.model_token == 'linear' and args.predict_type != 'direction':
@@ -164,7 +179,8 @@ def main():
         # size 10 kernel should smooth about 2 weeks of data
         model = LSTMConv1d(features=1, seq_len=seq_length, hidden_dim=SIZE, output_size=output, kernel_size=10)
     elif args.model_token == 'trans-am':
-        model = TransAm(feature_size=250, last_bar=True)  # not apples to apples, size needs to be div by heads so larger number from transam exp
+        model = TransAm(feature_size=250,
+                        last_bar=True)  # not apples to apples, size needs to be div by heads so larger number from transam exp
     else:
         raise Exception("Model type not supported")
 
@@ -186,8 +202,8 @@ def main():
 
     if args.action == 'train' or args.action == 'both':
         train_loader, dataset = get_simple_yahoo_data_loader(ticker, start_date, end_date, seq_length,
-                                                             args.predict_type, args.window)
-
+                                                             args.predict_type, args.window, args.use_cache)
+        print("**********TRAIN")
         if args.make_plots:
             plot(dataset.df.index, dataset.df["Close"])
             plot(dataset.df.index[:len(dataset.data)], dataset.data)
@@ -197,8 +213,9 @@ def main():
                   model_path=args.model_path, epochs=args.epochs, loss_function=loss_function)
 
     if args.action == 'eval' or args.action == 'both':
+        print("**********EVAL")
         eval_loader, dataset = get_simple_yahoo_data_loader(ticker, end_date, '2023-01-01', seq_length,
-                                                            args.predict_type)
+                                                            args.predict_type, args.window, args.use_cache)
         if args.make_plots:
             plot(dataset.df.index, dataset.df["Close"])
             plot(dataset.df.index[:len(dataset.data)], dataset.data)
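
For orientation, a sketch of driving the new cache path end to end, assuming ./data/SPY_unscaled.csv already exists from create-final-data-set.py and using the constants above (BATCH_SIZE = 64, seq_length = 30); the printed shapes are an assumption about CachedStockDataSet's output, not something this diff guarantees:

# Names come from this file; predict_type is ignored on the cache path.
loader, dataset = get_simple_yahoo_data_loader(ticker="SPY",
                                               start="1999-01-01",
                                               end="2021-01-01",
                                               seq_length=30,
                                               predict_type="price",
                                               use_cache=True)
features, target = next(iter(loader))
print(features.shape, target.shape)  # roughly (64, 30, 1) / (64, 1) under the assumptions above
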
3 changes: 1 addition & 2 deletions src/alfred/data/__init__.py
@@ -1,6 +1,5 @@
 from .downloaders import download_ticker_list, AlphaDownloader
 from .readers import read_processed_file, read_symbol_file, read_file
 from .processors import attach_moving_average_diffs, scale_relevant_training_columns
-from .data_sources import BaseYahooDataSet, YahooNextCloseWindowDataSet, YahooChangeWindowDataSet, \
-    YahooDirectionWindowDataSet, YahooChangeSeriesWindowDataSet, YahooSeriesAsFeaturesWindowDataSet
+from .data_sources import YahooNextCloseWindowDataSet, CachedStockDataSet
 from .features_and_labels import feature_columns, label_columns