Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 149 additions & 134 deletions solution.R
Original file line number Diff line number Diff line change
@@ -1,160 +1,175 @@
# Load necessary libraries
library(dplyr)
library(lubridate)
library(ggplot2)
library(randomForest)
library(tidyr)
library(pacman)
# pacman::p_load(randomForest)

print('---> R Script Start')

# Define parameters
start_train <- as.Date("2017-01-01")
end_train <- as.Date("2023-11-30")
start_test <- as.Date("2024-01-01")
end_test <- as.Date("2024-06-30")
# Note: Since you prefer not to use additional packages, I'll use base R functions where possible.

n_buys <- 10
verbose <- FALSE
# Load data
data0 <- read.csv('data0.csv', stringsAsFactors = FALSE)
data1 <- read.csv('data1.csv', stringsAsFactors = FALSE)
returns <- read.csv('returns.csv', stringsAsFactors = FALSE)

print('---> initial data set up')
# Merge data
data <- merge(data1, data0, by = 'security', all.x = TRUE)
data$security <- as.factor(data$security)
data$sector <- as.factor(data$sector)

# Load sector data
df_sectors <- read.csv('data/data0.csv')
# Create the new sector variable
data$sector_grouped <- ifelse(data$sector %in% c('Information Technology', 'Staples', 'Consumer Discretionary'),
as.character(data$sector), 'other')

# Load price and financial data
df_data <- read.csv('data/data1.csv')
df_data$date <- as.Date(df_data$date)
# Convert 'sector_grouped' to a factor
data$sector_grouped <- as.factor(data$sector_grouped)

df_x <- df_data %>% select(date, security, price, return30, ratio_pe, ratio_pcf, ratio_de, ratio_roe, ratio_roa)
df_y <- df_data %>% select(date, security, label)

list_vars1 <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa')
# Convert date columns
data$date <- as.Date(data$date)
returns$date <- as.Date(returns$date)

# Create signals DataFrame
df_signals <- data.frame(date = unique(df_x$date[df_x$date >= start_test & df_x$date <= end_test]))
df_signals <- df_signals %>% arrange(date)
# Sort data
data <- data[order(data$date, data$security), ]

# Initialize an empty list for storing accuracy results
df_signals$acc_total <- NA
df_signals$acc_current <- NA
# Split data into 70:30 ratio
unique_dates <- sort(unique(data$date))
split_index <- floor(length(unique_dates) * 0.7)
train_dates <- unique_dates[1:split_index]
test_dates <- unique_dates[(split_index + 1):length(unique_dates)]

for (i in seq_len(nrow(df_signals))) {

if (verbose) print(paste('---> doing', df_signals$date[i]))

# Training set
df_trainx <- df_x %>% filter(date < df_signals$date[i])
df_trainx <- df_trainx %>% filter(date != max(date))

df_trainy <- df_y %>% filter(date < df_signals$date[i])
df_trainy <- df_trainy %>% filter(date != max(date))

# Test set
df_testx <- df_x %>% filter(date >= df_signals$date[i])
df_testy <- df_y %>% filter(date >= df_signals$date[i])

# Scale data
for (col in list_vars1) {
scaler <- function(x) (x - min(x)) / (max(x) - min(x)) * 2 - 1
df_trainx[[col]] <- scaler(df_trainx[[col]])
df_testx[[col]] <- scaler(df_testx[[col]])
}

df_trainy <- df_trainy %>% mutate(label = as.factor(label))

# Fit the Random Forest classifier
if (i == 1) {
clf <- randomForest(x = df_trainx[list_vars1], y = df_trainy$label, ntree = 10, mtry = sqrt(length(list_vars1)), nodesize = 1000)
}

# Predictions and accuracy
pred_probs <- predict(clf, newdata = df_testx[list_vars1], type = "prob")
df_testy$signal <- pred_probs[, 2]
df_testy$pred <- ifelse(pred_probs[, 2] > 0.5, 1, 0)
df_testy$count <- 1

df_current <- df_testy %>% filter(date == df_signals$date[i])

acc_total <- mean(df_testy$label == df_testy$pred)
acc_current <- mean(df_current$label == df_current$pred)

print(paste('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2)))
train_data <- subset(data, date %in% train_dates)
test_data <- subset(data, date %in% test_dates)

# Prepare training data
train_data$label <- as.factor(train_data$label)
train_data$security <- as.factor(train_data$security)
train_data$sector <- as.factor(train_data$sector)

# Fit the models
full_model <- glm(label ~ sector_grouped + price + return30 + ratio_pe + ratio_pcf + ratio_de + ratio_roe + ratio_roa,
data = train_data, family = binomial)
null_model <- glm(label ~ 1, data = train_data, family = binomial)

# Stepwise selection (both directions)
stepwise_model <- step(null_model, scope = list(lower = null_model, upper = full_model),
direction = "both", trace = TRUE)

# Predict probabilities on test_data
predicted_probabilities <- predict(stepwise_model, newdata = test_data, type = "response")

# Add predicted probabilities to test_data
test_data$predicted_probability <- predicted_probabilities

# Create 'prob_data' data frame with necessary columns
prob_data <- data.frame(
date = test_data$date,
company = test_data$security,
probability = test_data$predicted_probability
)

# Ensure 'date' and 'company' are appropriate types
prob_data$date <- as.Date(prob_data$date)
prob_data$company <- as.character(prob_data$company)

# Get list of all unique dates and companies
all_dates <- sort(unique(prob_data$date))
all_companies <- sort(unique(prob_data$company))

# Create complete grid of all date-company combinations
complete_grid <- expand.grid(date = all_dates, company = all_companies)

# Merge 'prob_data' with 'complete_grid' to ensure all combinations are present
prob_data_complete <- merge(complete_grid, prob_data, by = c("date", "company"), all.x = TRUE)

# Reshape data into matrix with dates as rows and companies as columns using 'xtabs'
prob_matrix <- xtabs(probability ~ date + company, data = prob_data_complete)
prob_matrix <- as.matrix(prob_matrix)

# Replace NA values with -Inf to handle missing probabilities
prob_matrix[is.na(prob_matrix)] <- 0

# Initialize the binary matrix
prob_binary_matrix <- matrix(0, nrow = nrow(prob_matrix), ncol = ncol(prob_matrix))
rownames(prob_binary_matrix) <- rownames(prob_matrix)
colnames(prob_binary_matrix) <- colnames(prob_matrix)

# Loop over each date to select top 10 companies
for (i in 1:nrow(prob_matrix)) {
# Extract probabilities for the current date
probs <- prob_matrix[i, ]

# Add accuracy and signal to dataframe
df_signals$acc_total[i] <- acc_total
df_signals$acc_current[i] <- acc_current
# Get indices of top 10 probabilities
top10_indices <- order(probs, decreasing = TRUE)[1:10]

df_signals[i, df_current$security] <- df_current$signal
# Assign 1 to the top 10 companies in the binary matrix
prob_binary_matrix[i, top10_indices] <- 1
}

# Create buy matrix for payoff plot
df_signals$`10th` <- apply(df_signals[df_sectors$security], 1, function(x) sort(x, decreasing = TRUE)[n_buys])
df_index <- df_signals[df_sectors$security] >= df_signals$`10th`

# Set 1 for top 10 strongest signals
df_buys <- as.data.frame(matrix(0, nrow = nrow(df_signals), ncol = length(df_sectors$security)))
colnames(df_buys) <- df_sectors$security
df_buys[df_index] <- 1

# keep first 10 obs
process_row <- function(row) {
ones_indices <- which(row == 1) # Get the indices of 1's
if(length(ones_indices) > 10) {
row[ones_indices[11:length(ones_indices)]] <- 0 # Set ones after the first 10 to 0
}
return(row)
# Verify that each row has exactly 10 ones
row_sums <- rowSums(prob_binary_matrix)
if (all(row_sums == 10)) {
cat("All dates have 10 companies selected.\n")
} else {
cat("Some dates do not have exactly 10 companies selected.\n")
}
df_buys <- t(apply(df_buys, 1, process_row))

# add dates
library(zoo)
# df_buys <- cbind(date = as.Date(df_signals$date), df_buys)
df_buys <- cbind(date = df_signals$date, df_buys)
# Now, integrate the payoff calculation

# Read returns data
df_returns <- read.csv('returns.csv', stringsAsFactors = FALSE)
df_returns$date <- as.Date(df_returns$date)

# df_buys[,-1] %>% rowSums()
# Filter df_returns to include only dates in the test set
start_test <- min(test_data$date)
end_test <- max(test_data$date)
df_returns <- subset(df_returns, date >= start_test & date <= end_test)

# Reshape df_returns to wide format with dates as rows and securities as columns
# Assuming df_returns has columns: 'date', 'security', 'return1'
# Use 'xtabs' to reshape
returns_matrix <- xtabs(return1 ~ date + security, data = df_returns)
returns_matrix <- as.matrix(returns_matrix)

# Plot signals
ggplot(df_signals, aes(x = date)) + geom_line(aes(y = AAPL)) + ggtitle("AAPL Signals")
image(t(df_buys[-1])) # You might need to use other visualization for the buy signals
# Replace NA values with 0 (assuming no gain or loss if return is missing)
returns_matrix[is.na(returns_matrix)] <- 0

# Create return matrix
df_returns <- read.csv('data/returns.csv')
df_returns$date <- as.Date(df_returns$date)
df_returns <- df_returns %>% filter(date >= start_test)
df_returns <- df_returns %>% pivot_wider(names_from = security, values_from = return1)
# Ensure that the dates and companies in returns_matrix match those in prob_binary_matrix
common_dates <- intersect(rownames(prob_binary_matrix), rownames(returns_matrix))
common_companies <- intersect(colnames(prob_binary_matrix), colnames(returns_matrix))

plot_payoff <- function(df_buys) {

df <- df_buys[,-1]

if (sum(rowSums(df) == 10) != nrow(df)) {
stop("---> must have exactly 10 buys each day")
}

# df_payoff <- df[, 1, drop = FALSE]
df <- df * (1 / n_buys) # equally weighted
# Subset both matrices to the common dates and companies
prob_binary_matrix_sub <- prob_binary_matrix[common_dates, common_companies]
returns_matrix_sub <- returns_matrix[common_dates, common_companies]

df_payoff <- NULL
arr_ret <- as.matrix(df_returns[-1] + 1)
df_payoff$payoff <- diag(df %*% t(arr_ret-1))
df_payoff$tri <- cumprod(1+df_payoff$payoff)

df_payoff$date <- df_returns$date
df_payoff <- df_payoff %>% as_tibble()


payoff_result <- (tail(df_payoff$tri, 1) - 1) * 100
print(paste("---> payoff for these buys between period", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_result, 2), "%"))

return(df_payoff)
}
# Ensure matrices are aligned
prob_binary_matrix_sub <- prob_binary_matrix_sub[order(rownames(prob_binary_matrix_sub)), order(colnames(prob_binary_matrix_sub))]
returns_matrix_sub <- returns_matrix_sub[order(rownames(returns_matrix_sub)), order(colnames(returns_matrix_sub))]

# Define the number of buys per day
n_buys <- 10

# Define the initial capital
initial_capital <- 100

# Calculate the payoff

# Calculate equal weights for each buy
df_buys_weighted <- prob_binary_matrix_sub * (1 / n_buys) # Equally weighted

# Calculate daily returns
daily_returns <- rowSums(df_buys_weighted * returns_matrix_sub)

# Calculate cumulative return starting from initial capital
cumulative_return <- initial_capital * cumprod(1 + daily_returns)

# Prepare data frame for analysis
df_payoff <- data.frame(
date = as.Date(rownames(prob_binary_matrix_sub)),
daily_return = daily_returns,
cumulative_return = cumulative_return
)

# Calculate total payoff over the period
total_payoff <- cumulative_return[length(cumulative_return)] - initial_capital
payoff_percentage <- (total_payoff / initial_capital) * 100

df_payoff <- plot_payoff(df_buys)
ggplot(df_payoff, aes(x = date, y = tri)) + geom_line() + ggtitle("Payoff Over Time")
print(paste("---> Payoff for these buys between", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_percentage, 2), "%"))

# Plot cumulative return over time
plot(df_payoff$date, df_payoff$cumulative_return, type = "l", xlab = "Date", ylab = "Cumulative Return", main = "Strategy Cumulative Return Over Time")

print('---> R Script End')