library(data.table)
# PARAMS ------------------------------------------------------------------
# Parameters
FREQ = "m" # Frequency - for now I would suggest month (m) and week (w)
# PRICE DATA --------------------------------------------------------------
# Import QC daily data
prices = fread("F:/lean/data/stocks_daily.csv")
setnames(prices, gsub(" ", "_", c(tolower(colnames(prices)))))
# Remove duplicates
prices = unique(prices, by = c("symbol", "date"))
# Remove duplicates that appear under different symbols (e.g. phun and phun.1)
dups = prices[, .(symbol, n = .N),
              by = .(date, open, high, low, close, volume, adj_close,
                     symbol_first = substr(symbol, 1, 1))]
dups = dups[n > 1]
dups[, symbol_short := gsub("\\.\\d$", "", symbol)]
symbols_remove = dups[, .(symbol, n = .N),
                      by = .(date, open, high, low, close, volume, adj_close,
                             symbol_short)]
symbols_remove = symbols_remove[n >= 2, unique(symbol)]
# Keep only the dotted variants (e.g. phun.1) as removal candidates
symbols_remove = symbols_remove[grepl("\\.", symbols_remove)]
prices = prices[!(symbol %chin% symbols_remove)]
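# Optional sanity check (a minimal sketch, not required by the pipeline):
# none of the dotted symbols flagged for removal should remain in the panel.
stopifnot(!any(symbols_remove %chin% prices[, unique(symbol)]))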
# Adjust open, high and low by the factor implied by adj_close
prices[, adj_rate := adj_close / close]
prices[, let(
  open = open * adj_rate,
  high = high * adj_rate,
  low  = low * adj_rate
)]
setnames(prices, "close", "close_raw")
setnames(prices, "adj_close", "close")
prices[, let(adj_rate = NULL)]
setcolorder(prices, c("symbol", "date", "open", "high", "low", "close", "volume"))
# Remove observations where open, high, low or close is below 1e-8
# This step is optional; we need it if we will use the finfeatures package
prices = prices[open > 1e-008 & high > 1e-008 & low > 1e-008 & close > 1e-008]
# Remove missing values
prices = na.omit(prices)
# Keep only symbols with at least 2 years of data
# This step is optional
symbol_keep = prices[, .N, symbol][N >= 2 * 252, symbol]
prices = prices[symbol %chin% symbol_keep]
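# Optional sanity check (a minimal sketch): after the filter, every remaining
# symbol should have at least two years of daily bars.
stopifnot(prices[, .N, by = symbol][, min(N)] >= 2 * 252)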
# Sort
setorder(prices, symbol, date)
# free memory
gc()
# FILTERING ---------------------------------------------------------------
# Add label for most liquid asssets
prices[, dollar_volume_month := frollsum(close_raw * volume, 22, na.rm= TRUE), by = symbol]
calculate_liquid = function(prices, n) {
  # dt = copy(prices)
  # n = 500
  dt = copy(prices)
  setorder(dt, date, -dollar_volume_month)
  # Take the n symbols with the highest rolling dollar volume on every date
  filtered = na.omit(dt)[, .(symbol = first(symbol, n)), by = date]
  col_ = paste0("liquid_", n)
  filtered[, (col_) := TRUE]
  dt = filtered[dt, on = c("date", "symbol")]
  dt[is.na(x), x := FALSE, env = list(x = col_)] # fill NA with FALSE
  return(dt)
}
prices = calculate_liquid(prices, 100)
prices = calculate_liquid(prices, 200)
prices = calculate_liquid(prices, 500)
prices = calculate_liquid(prices, 1000)
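# Optional inspection (a minimal sketch; liquid_check is just an ad hoc name):
# each liquid_* flag should select at most n symbols per date, fewer early in
# the sample when coverage is thin.
liquid_check = prices[, .(n_100 = sum(liquid_100), n_1000 = sum(liquid_1000)), by = date]
print(liquid_check[order(date)][c(1, .N)])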
# Remove columns we don't need
prices[, dollar_volume_month := NULL]
# Order again
setorder(prices, symbol, date)
# PARAMETERS --------------------------------------------------------------
# Define parameters
train_size_years = c(2, 5, 10)
# Each row of meta will later become one PBS array job (one node per parameter set)
# Monthly grid of dates spanning the sample, used to build rolling splits
dates = seq.Date(prices[, min(date)], prices[, max(date)], by = "months")
# Util function that creates rolling train and test sets from a date grid
create_rolling_splits = function(dates, train_size = 12, test_size = 1) {
  # Ensure the dates are sorted
  dates = sort(dates)
  # Initialize a list to store the train-test splits
  splits_list = list()
  total_periods = length(dates)
  # Loop over each possible split point
  for (i in seq(train_size + 1, total_periods - test_size + 1)) {
    # Training indices
    train_indices = seq(i - train_size, i - 1)
    # Testing indices
    test_indices = seq(i, i + test_size - 1)
    # Store the split boundaries in the splits list
    splits_list[[length(splits_list) + 1]] = data.table(
      train_size  = train_size,
      test_size   = test_size,
      train_start = dates[first(train_indices)],
      train_end   = dates[last(train_indices)],
      test_start  = dates[first(test_indices)],
      test_end    = dates[last(test_indices)]
    )
  }
  rbindlist(splits_list)
}
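# Quick usage check (optional, a minimal sketch): inspect the first split for a
# 2-year training window to confirm the boundaries line up as expected.
print(create_rolling_splits(dates, train_size = 24, test_size = 1)[1])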
# Create train/test splits
meta = lapply(train_size_years, function(x) create_rolling_splits(dates, x*12, 1))
meta = rbindlist(meta)
# Add search_type and maxfact options for hedge.pci
CJ_table_1 = function(X, Y) {
  setkey(X[, c(k = 1, .SD)], k)[Y[, c(k = 1, .SD)], allow.cartesian = TRUE][, k := NULL]
}
meta = CJ_table_1(meta, data.table(search_type = c("lasso", "limited")))
meta = CJ_table_1(meta, data.table(maxfact = 1:2))
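# Optional sanity check (a minimal sketch): after the two cross joins every
# rolling split should appear once per search_type / maxfact combination.
stopifnot(meta[, uniqueN(search_type)] == 2L, meta[, uniqueN(maxfact)] == 2L)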
# Add to padobran
fwrite(meta, "F:/strategies/statsarb/pci/meta.csv")
fwrite(prices, "F:/strategies/statsarb/pci/prices.csv")
# If on Windows use WinSCP, otherwise use the commands below
# scp F:/strategies/statsarb/pci/meta.csv padobran:/home/jmaric/predictors_padobran/
# scp /home/sn/data/strategies/pread/ padobran:/home/jmaric/pread/predictors_padobran/
file.remove("F:/strategies/statsarb/pci/meta.csv")
file.remove("F:/strategies/statsarb/pci/prices.csv")
# SH FILE -----------------------------------------------------------------
# Create sh file for padobran with a PBS array (-J) sized to the number of rows in meta
sh_file = sprintf("
#!/bin/bash
#PBS -N STATSARBPCI
#PBS -l ncpus=2
#PBS -l mem=10GB
#PBS -J 1-%d
#PBS -o logs
#PBS -j oe
cd ${PBS_O_WORKDIR}
apptainer run image.sif estimate_pci_padobran.R
", nrow(meta))
sh_file_name = "estimate_pci_padobran.sh"
file.create(sh_file_name)
writeLines(sh_file, sh_file_name)
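# Optionally mark the PBS script executable before copying it to the cluster.
# A minimal sketch; Sys.chmod has limited effect on Windows file systems.
Sys.chmod(sh_file_name, mode = "0755")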