Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 86 additions & 129 deletions solution.R
Original file line number Diff line number Diff line change
@@ -1,160 +1,117 @@
# Load necessary libraries
set.seed(123)

library(dplyr)
library(lubridate)
library(ggplot2)
library(randomForest)
library(tidyr)
library(pacman)
# pacman::p_load(randomForest)

print('---> R Script Start')

# Define parameters
start_train <- as.Date("2017-01-01")
end_train <- as.Date("2023-11-30")
start_test <- as.Date("2024-01-01")
end_test <- as.Date("2024-06-30")

n_buys <- 10
verbose <- FALSE

print('---> initial data set up')

# Load sector data
df_sectors <- read.csv('data/data0.csv')

# Load price and financial data
df_data <- read.csv('data/data1.csv')
df_data$date <- as.Date(df_data$date)

df_x <- df_data %>% select(date, security, price, return30, ratio_pe, ratio_pcf, ratio_de, ratio_roe, ratio_roa)
df_y <- df_data %>% select(date, security, label)

list_vars1 <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa')

# Create signals DataFrame
df_signals <- data.frame(date = unique(df_x$date[df_x$date >= start_test & df_x$date <= end_test]))
df_signals <- df_signals %>% arrange(date)

# Initialize an empty list for storing accuracy results
df_signals$acc_total <- NA
df_signals$acc_current <- NA
library(caret) # For confusionMatrix

params <- list(
train_start_date = as.Date("2017-01-01"),
train_end_date = as.Date("2023-12-31"),
test_start_date = as.Date("2024-01-01"),
test_end_date = as.Date("2024-12-31"),
max_buys = 10,
verbose = FALSE
)

sector_data <- read.csv('data0.csv')
financial_data <- read.csv('data1.csv') %>%
mutate(date = as.Date(date))

returns_data <- read.csv('returns.csv') %>%
mutate(date = as.Date(date)) %>%
# only use data before 2024
filter(date >= params$test_start_date) %>%
pivot_wider(names_from = security, values_from = return1)

features <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa')
features_data <- financial_data %>% select(date, security, all_of(features))
labels_data <- financial_data %>% select(date, security, label)

signal_dates <- financial_data %>%
filter(date >= params$test_start_date & date <= params$test_end_date) %>%
distinct(date) %>%
arrange(date) %>%
mutate(total_accuracy = NA, current_accuracy = NA)

normalize_features <- function(df) {
df %>%
mutate(across(all_of(features), ~ (.-min(.))/(max(.)-min(.)) * 2 - 1))
}

for (i in seq_len(nrow(df_signals))) {

if (verbose) print(paste('---> doing', df_signals$date[i]))
for (index in seq_len(nrow(signal_dates))) {

# Training set
df_trainx <- df_x %>% filter(date < df_signals$date[i])
df_trainx <- df_trainx %>% filter(date != max(date))
training_filter <- features_data$date < signal_dates$date[index]

df_trainy <- df_y %>% filter(date < df_signals$date[i])
df_trainy <- df_trainy %>% filter(date != max(date))
training_features <- features_data %>%
filter(training_filter) %>%
filter(date != max(date)) %>%
normalize_features()

# Test set
df_testx <- df_x %>% filter(date >= df_signals$date[i])
df_testy <- df_y %>% filter(date >= df_signals$date[i])
training_labels <- labels_data %>%
filter(training_filter) %>%
filter(date != max(date)) %>%
mutate(label = as.factor(label))

# Scale data
for (col in list_vars1) {
scaler <- function(x) (x - min(x)) / (max(x) - min(x)) * 2 - 1
df_trainx[[col]] <- scaler(df_trainx[[col]])
df_testx[[col]] <- scaler(df_testx[[col]])
}
testing_features <- features_data %>%
filter(date >= signal_dates$date[index]) %>%
normalize_features()

df_trainy <- df_trainy %>% mutate(label = as.factor(label))
testing_labels <- labels_data %>%
filter(date >= signal_dates$date[index])

# Fit the Random Forest classifier
if (i == 1) {
clf <- randomForest(x = df_trainx[list_vars1], y = df_trainy$label, ntree = 10, mtry = sqrt(length(list_vars1)), nodesize = 1000)
# logistic model
if (index == 1) {
model <- glm(label ~ ., data = training_features %>% mutate(label = training_labels$label), family = binomial)
}

# Predictions and accuracy
pred_probs <- predict(clf, newdata = df_testx[list_vars1], type = "prob")
df_testy$signal <- pred_probs[, 2]
df_testy$pred <- ifelse(pred_probs[, 2] > 0.5, 1, 0)
df_testy$count <- 1
predicted_probabilities <- predict(model, newdata = testing_features, type = "response")
testing_labels <- testing_labels %>%
mutate(signal_strength = predicted_probabilities,
predicted_label = ifelse(predicted_probabilities > 0.5, 1, 0))

df_current <- df_testy %>% filter(date == df_signals$date[i])
current_day_labels <- testing_labels %>% filter(date == signal_dates$date[index])

acc_total <- mean(df_testy$label == df_testy$pred)
acc_current <- mean(df_current$label == df_current$pred)
signal_dates$total_accuracy[index] <- mean(testing_labels$label == testing_labels$predicted_label)
signal_dates$current_accuracy[index] <- mean(current_day_labels$label == current_day_labels$predicted_label)

print(paste('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2)))

# Add accuracy and signal to dataframe
df_signals$acc_total[i] <- acc_total
df_signals$acc_current[i] <- acc_current

df_signals[i, df_current$security] <- df_current$signal
signal_dates[index, current_day_labels$security] <- current_day_labels$signal_strength
}

# Create buy matrix for payoff plot
df_signals$`10th` <- apply(df_signals[df_sectors$security], 1, function(x) sort(x, decreasing = TRUE)[n_buys])
df_index <- df_signals[df_sectors$security] >= df_signals$`10th`
buy_thresholds <- apply(signal_dates[sector_data$security], 1, function(x) sort(x, decreasing = TRUE)[params$max_buys])
buy_indices <- signal_dates[sector_data$security] >= buy_thresholds

# Set 1 for top 10 strongest signals
df_buys <- as.data.frame(matrix(0, nrow = nrow(df_signals), ncol = length(df_sectors$security)))
colnames(df_buys) <- df_sectors$security
df_buys[df_index] <- 1
buy_matrix <- as.data.frame(ifelse(buy_indices, 1, 0), stringsAsFactors = FALSE)
colnames(buy_matrix) <- sector_data$security

# keep first 10 obs
process_row <- function(row) {
ones_indices <- which(row == 1) # Get the indices of 1's
if(length(ones_indices) > 10) {
row[ones_indices[11:length(ones_indices)]] <- 0 # Set ones after the first 10 to 0
buy_matrix <- t(apply(buy_matrix, 1, function(row) {
if (sum(row) > params$max_buys) {
row[order(row, decreasing = TRUE)[(params$max_buys + 1):length(row)]] <- 0
}
return(row)
}
df_buys <- t(apply(df_buys, 1, process_row))

# add dates
library(zoo)
# df_buys <- cbind(date = as.Date(df_signals$date), df_buys)
df_buys <- cbind(date = df_signals$date, df_buys)


# df_buys[,-1] %>% rowSums()


# Plot signals
ggplot(df_signals, aes(x = date)) + geom_line(aes(y = AAPL)) + ggtitle("AAPL Signals")
image(t(df_buys[-1])) # You might need to use other visualization for the buy signals
}))

# Create return matrix
df_returns <- read.csv('data/returns.csv')
df_returns$date <- as.Date(df_returns$date)
df_returns <- df_returns %>% filter(date >= start_test)
df_returns <- df_returns %>% pivot_wider(names_from = security, values_from = return1)
buy_matrix <- cbind(date = signal_dates$date, buy_matrix)

plot_payoff <- function(df_buys) {
calculate_payoff <- function(buy_matrix, returns_data) {
normalized_buys <- buy_matrix[,-1] * (1 / params$max_buys)
returns_array <- as.matrix(returns_data[-1] + 1)

df <- df_buys[,-1]
total_payoff <- diag(normalized_buys %*% t(returns_array - 1))
cumulative_growth <- cumprod(1 + total_payoff)

if (sum(rowSums(df) == 10) != nrow(df)) {
stop("---> must have exactly 10 buys each day")
}

# df_payoff <- df[, 1, drop = FALSE]
df <- df * (1 / n_buys) # equally weighted

df_payoff <- NULL
arr_ret <- as.matrix(df_returns[-1] + 1)
df_payoff$payoff <- diag(df %*% t(arr_ret-1))
df_payoff$tri <- cumprod(1+df_payoff$payoff)
payoffs <- tibble(date = returns_data$date, total_payoff = total_payoff, cumulative_growth = cumulative_growth)

df_payoff$date <- df_returns$date
df_payoff <- df_payoff %>% as_tibble()
final_payoff_percentage <- (tail(payoffs$cumulative_growth, 1) - 1) * 100


payoff_result <- (tail(df_payoff$tri, 1) - 1) * 100
print(paste("---> payoff for these buys between period", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_result, 2), "%"))

return(df_payoff)
return(payoffs)
}

df_payoff <- plot_payoff(df_buys)
ggplot(df_payoff, aes(x = date, y = tri)) + geom_line() + ggtitle("Payoff Over Time")

payoff_results <- calculate_payoff(buy_matrix, returns_data)

print('---> R Script End')
ggplot(payoff_results, aes(x = date, y = cumulative_growth)) +
geom_line() +
ggtitle("Predicted Payoff Over 6 Months") +
xlab("Date") +
ylab("Predicted Payoff")