diff --git a/solution.R b/solution.R index 16eb1c3..0cecc10 100644 --- a/solution.R +++ b/solution.R @@ -1,160 +1,175 @@ # Load necessary libraries -library(dplyr) -library(lubridate) -library(ggplot2) -library(randomForest) -library(tidyr) -library(pacman) -# pacman::p_load(randomForest) - -print('---> R Script Start') - -# Define parameters -start_train <- as.Date("2017-01-01") -end_train <- as.Date("2023-11-30") -start_test <- as.Date("2024-01-01") -end_test <- as.Date("2024-06-30") +# Note: Since you prefer not to use additional packages, I'll use base R functions where possible. -n_buys <- 10 -verbose <- FALSE +# Load data +data0 <- read.csv('data0.csv', stringsAsFactors = FALSE) +data1 <- read.csv('data1.csv', stringsAsFactors = FALSE) +returns <- read.csv('returns.csv', stringsAsFactors = FALSE) -print('---> initial data set up') +# Merge data +data <- merge(data1, data0, by = 'security', all.x = TRUE) +data$security <- as.factor(data$security) +data$sector <- as.factor(data$sector) -# Load sector data -df_sectors <- read.csv('data/data0.csv') +# Create the new sector variable +data$sector_grouped <- ifelse(data$sector %in% c('Information Technology', 'Staples', 'Consumer Discretionary'), + as.character(data$sector), 'other') -# Load price and financial data -df_data <- read.csv('data/data1.csv') -df_data$date <- as.Date(df_data$date) +# Convert 'sector_grouped' to a factor +data$sector_grouped <- as.factor(data$sector_grouped) -df_x <- df_data %>% select(date, security, price, return30, ratio_pe, ratio_pcf, ratio_de, ratio_roe, ratio_roa) -df_y <- df_data %>% select(date, security, label) -list_vars1 <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa') +# Convert date columns +data$date <- as.Date(data$date) +returns$date <- as.Date(returns$date) -# Create signals DataFrame -df_signals <- data.frame(date = unique(df_x$date[df_x$date >= start_test & df_x$date <= end_test])) -df_signals <- df_signals %>% arrange(date) +# Sort data +data <- data[order(data$date, data$security), ] -# Initialize an empty list for storing accuracy results -df_signals$acc_total <- NA -df_signals$acc_current <- NA +# Split data into 70:30 ratio +unique_dates <- sort(unique(data$date)) +split_index <- floor(length(unique_dates) * 0.7) +train_dates <- unique_dates[1:split_index] +test_dates <- unique_dates[(split_index + 1):length(unique_dates)] -for (i in seq_len(nrow(df_signals))) { - - if (verbose) print(paste('---> doing', df_signals$date[i])) - - # Training set - df_trainx <- df_x %>% filter(date < df_signals$date[i]) - df_trainx <- df_trainx %>% filter(date != max(date)) - - df_trainy <- df_y %>% filter(date < df_signals$date[i]) - df_trainy <- df_trainy %>% filter(date != max(date)) - - # Test set - df_testx <- df_x %>% filter(date >= df_signals$date[i]) - df_testy <- df_y %>% filter(date >= df_signals$date[i]) - - # Scale data - for (col in list_vars1) { - scaler <- function(x) (x - min(x)) / (max(x) - min(x)) * 2 - 1 - df_trainx[[col]] <- scaler(df_trainx[[col]]) - df_testx[[col]] <- scaler(df_testx[[col]]) - } - - df_trainy <- df_trainy %>% mutate(label = as.factor(label)) - - # Fit the Random Forest classifier - if (i == 1) { - clf <- randomForest(x = df_trainx[list_vars1], y = df_trainy$label, ntree = 10, mtry = sqrt(length(list_vars1)), nodesize = 1000) - } - - # Predictions and accuracy - pred_probs <- predict(clf, newdata = df_testx[list_vars1], type = "prob") - df_testy$signal <- pred_probs[, 2] - df_testy$pred <- ifelse(pred_probs[, 2] > 0.5, 1, 0) - df_testy$count <- 1 - - df_current <- df_testy %>% filter(date == df_signals$date[i]) - - acc_total <- mean(df_testy$label == df_testy$pred) - acc_current <- mean(df_current$label == df_current$pred) - - print(paste('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2))) +train_data <- subset(data, date %in% train_dates) +test_data <- subset(data, date %in% test_dates) + +# Prepare training data +train_data$label <- as.factor(train_data$label) +train_data$security <- as.factor(train_data$security) +train_data$sector <- as.factor(train_data$sector) + +# Fit the models +full_model <- glm(label ~ sector_grouped + price + return30 + ratio_pe + ratio_pcf + ratio_de + ratio_roe + ratio_roa, + data = train_data, family = binomial) +null_model <- glm(label ~ 1, data = train_data, family = binomial) + +# Stepwise selection (both directions) +stepwise_model <- step(null_model, scope = list(lower = null_model, upper = full_model), + direction = "both", trace = TRUE) + +# Predict probabilities on test_data +predicted_probabilities <- predict(stepwise_model, newdata = test_data, type = "response") + +# Add predicted probabilities to test_data +test_data$predicted_probability <- predicted_probabilities + +# Create 'prob_data' data frame with necessary columns +prob_data <- data.frame( + date = test_data$date, + company = test_data$security, + probability = test_data$predicted_probability +) + +# Ensure 'date' and 'company' are appropriate types +prob_data$date <- as.Date(prob_data$date) +prob_data$company <- as.character(prob_data$company) + +# Get list of all unique dates and companies +all_dates <- sort(unique(prob_data$date)) +all_companies <- sort(unique(prob_data$company)) + +# Create complete grid of all date-company combinations +complete_grid <- expand.grid(date = all_dates, company = all_companies) + +# Merge 'prob_data' with 'complete_grid' to ensure all combinations are present +prob_data_complete <- merge(complete_grid, prob_data, by = c("date", "company"), all.x = TRUE) + +# Reshape data into matrix with dates as rows and companies as columns using 'xtabs' +prob_matrix <- xtabs(probability ~ date + company, data = prob_data_complete) +prob_matrix <- as.matrix(prob_matrix) + +# Replace NA values with -Inf to handle missing probabilities +prob_matrix[is.na(prob_matrix)] <- 0 + +# Initialize the binary matrix +prob_binary_matrix <- matrix(0, nrow = nrow(prob_matrix), ncol = ncol(prob_matrix)) +rownames(prob_binary_matrix) <- rownames(prob_matrix) +colnames(prob_binary_matrix) <- colnames(prob_matrix) + +# Loop over each date to select top 10 companies +for (i in 1:nrow(prob_matrix)) { + # Extract probabilities for the current date + probs <- prob_matrix[i, ] - # Add accuracy and signal to dataframe - df_signals$acc_total[i] <- acc_total - df_signals$acc_current[i] <- acc_current + # Get indices of top 10 probabilities + top10_indices <- order(probs, decreasing = TRUE)[1:10] - df_signals[i, df_current$security] <- df_current$signal + # Assign 1 to the top 10 companies in the binary matrix + prob_binary_matrix[i, top10_indices] <- 1 } -# Create buy matrix for payoff plot -df_signals$`10th` <- apply(df_signals[df_sectors$security], 1, function(x) sort(x, decreasing = TRUE)[n_buys]) -df_index <- df_signals[df_sectors$security] >= df_signals$`10th` - -# Set 1 for top 10 strongest signals -df_buys <- as.data.frame(matrix(0, nrow = nrow(df_signals), ncol = length(df_sectors$security))) -colnames(df_buys) <- df_sectors$security -df_buys[df_index] <- 1 - -# keep first 10 obs -process_row <- function(row) { - ones_indices <- which(row == 1) # Get the indices of 1's - if(length(ones_indices) > 10) { - row[ones_indices[11:length(ones_indices)]] <- 0 # Set ones after the first 10 to 0 - } - return(row) +# Verify that each row has exactly 10 ones +row_sums <- rowSums(prob_binary_matrix) +if (all(row_sums == 10)) { + cat("All dates have 10 companies selected.\n") +} else { + cat("Some dates do not have exactly 10 companies selected.\n") } -df_buys <- t(apply(df_buys, 1, process_row)) -# add dates -library(zoo) -# df_buys <- cbind(date = as.Date(df_signals$date), df_buys) -df_buys <- cbind(date = df_signals$date, df_buys) +# Now, integrate the payoff calculation +# Read returns data +df_returns <- read.csv('returns.csv', stringsAsFactors = FALSE) +df_returns$date <- as.Date(df_returns$date) -# df_buys[,-1] %>% rowSums() +# Filter df_returns to include only dates in the test set +start_test <- min(test_data$date) +end_test <- max(test_data$date) +df_returns <- subset(df_returns, date >= start_test & date <= end_test) +# Reshape df_returns to wide format with dates as rows and securities as columns +# Assuming df_returns has columns: 'date', 'security', 'return1' +# Use 'xtabs' to reshape +returns_matrix <- xtabs(return1 ~ date + security, data = df_returns) +returns_matrix <- as.matrix(returns_matrix) -# Plot signals -ggplot(df_signals, aes(x = date)) + geom_line(aes(y = AAPL)) + ggtitle("AAPL Signals") -image(t(df_buys[-1])) # You might need to use other visualization for the buy signals +# Replace NA values with 0 (assuming no gain or loss if return is missing) +returns_matrix[is.na(returns_matrix)] <- 0 -# Create return matrix -df_returns <- read.csv('data/returns.csv') -df_returns$date <- as.Date(df_returns$date) -df_returns <- df_returns %>% filter(date >= start_test) -df_returns <- df_returns %>% pivot_wider(names_from = security, values_from = return1) +# Ensure that the dates and companies in returns_matrix match those in prob_binary_matrix +common_dates <- intersect(rownames(prob_binary_matrix), rownames(returns_matrix)) +common_companies <- intersect(colnames(prob_binary_matrix), colnames(returns_matrix)) -plot_payoff <- function(df_buys) { - - df <- df_buys[,-1] - - if (sum(rowSums(df) == 10) != nrow(df)) { - stop("---> must have exactly 10 buys each day") - } - - # df_payoff <- df[, 1, drop = FALSE] - df <- df * (1 / n_buys) # equally weighted +# Subset both matrices to the common dates and companies +prob_binary_matrix_sub <- prob_binary_matrix[common_dates, common_companies] +returns_matrix_sub <- returns_matrix[common_dates, common_companies] - df_payoff <- NULL - arr_ret <- as.matrix(df_returns[-1] + 1) - df_payoff$payoff <- diag(df %*% t(arr_ret-1)) - df_payoff$tri <- cumprod(1+df_payoff$payoff) - - df_payoff$date <- df_returns$date - df_payoff <- df_payoff %>% as_tibble() - - - payoff_result <- (tail(df_payoff$tri, 1) - 1) * 100 - print(paste("---> payoff for these buys between period", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_result, 2), "%")) - - return(df_payoff) -} +# Ensure matrices are aligned +prob_binary_matrix_sub <- prob_binary_matrix_sub[order(rownames(prob_binary_matrix_sub)), order(colnames(prob_binary_matrix_sub))] +returns_matrix_sub <- returns_matrix_sub[order(rownames(returns_matrix_sub)), order(colnames(returns_matrix_sub))] + +# Define the number of buys per day +n_buys <- 10 + +# Define the initial capital +initial_capital <- 100 + +# Calculate the payoff + +# Calculate equal weights for each buy +df_buys_weighted <- prob_binary_matrix_sub * (1 / n_buys) # Equally weighted + +# Calculate daily returns +daily_returns <- rowSums(df_buys_weighted * returns_matrix_sub) + +# Calculate cumulative return starting from initial capital +cumulative_return <- initial_capital * cumprod(1 + daily_returns) + +# Prepare data frame for analysis +df_payoff <- data.frame( + date = as.Date(rownames(prob_binary_matrix_sub)), + daily_return = daily_returns, + cumulative_return = cumulative_return +) + +# Calculate total payoff over the period +total_payoff <- cumulative_return[length(cumulative_return)] - initial_capital +payoff_percentage <- (total_payoff / initial_capital) * 100 -df_payoff <- plot_payoff(df_buys) -ggplot(df_payoff, aes(x = date, y = tri)) + geom_line() + ggtitle("Payoff Over Time") +print(paste("---> Payoff for these buys between", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_percentage, 2), "%")) +# Plot cumulative return over time +plot(df_payoff$date, df_payoff$cumulative_return, type = "l", xlab = "Date", ylab = "Cumulative Return", main = "Strategy Cumulative Return Over Time") -print('---> R Script End') \ No newline at end of file