PIM-Data-Science · Crowd48 · Oct 5, 2024
diff --git a/solution.R b/solution.R
@@ -1,160 +1,117 @@
-# Load necessary libraries
+set.seed(123)
+
 library(dplyr)
-library(lubridate)
 library(ggplot2)
-library(randomForest)
 library(tidyr)
-library(pacman)
-# pacman::p_load(randomForest)
-
-print('---> R Script Start')
-
-# Define parameters
-start_train <- as.Date("2017-01-01")
-end_train <- as.Date("2023-11-30")
-start_test <- as.Date("2024-01-01")
-end_test <- as.Date("2024-06-30")
-
-n_buys <- 10
-verbose <- FALSE
-
-print('---> initial data set up')
-
-# Load sector data
-df_sectors <- read.csv('data/data0.csv')
-
-# Load price and financial data
-df_data <- read.csv('data/data1.csv')
-df_data$date <- as.Date(df_data$date)
-
-df_x <- df_data %>% select(date, security, price, return30, ratio_pe, ratio_pcf, ratio_de, ratio_roe, ratio_roa)
-df_y <- df_data %>% select(date, security, label)
-
-list_vars1 <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa')
-
-# Create signals DataFrame
-df_signals <- data.frame(date = unique(df_x$date[df_x$date >= start_test & df_x$date <= end_test]))
-df_signals <- df_signals %>% arrange(date)
-
-# Initialize an empty list for storing accuracy results
-df_signals$acc_total <- NA
-df_signals$acc_current <- NA
+library(caret)  # For confusionMatrix
+
+params <- list(
+  train_start_date = as.Date("2017-01-01"),
+  train_end_date = as.Date("2023-12-31"),
+  test_start_date = as.Date("2024-01-01"),
+  test_end_date = as.Date("2024-12-31"),
+  max_buys = 10,
+  verbose = FALSE
+)
+
+sector_data <- read.csv('data0.csv')
+financial_data <- read.csv('data1.csv') %>%
+  mutate(date = as.Date(date))
+
+returns_data <- read.csv('returns.csv') %>%
+  mutate(date = as.Date(date)) %>%
+  # only use data before 2024
+  filter(date >= params$test_start_date) %>%
+  pivot_wider(names_from = security, values_from = return1)
+
+features <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa')
+features_data <- financial_data %>% select(date, security, all_of(features))
+labels_data <- financial_data %>% select(date, security, label)
+
+signal_dates <- financial_data %>%
+  filter(date >= params$test_start_date & date <= params$test_end_date) %>%
+  distinct(date) %>%
+  arrange(date) %>%
+  mutate(total_accuracy = NA, current_accuracy = NA)
+
+normalize_features <- function(df) {
+  df %>%
+    mutate(across(all_of(features), ~ (.-min(.))/(max(.)-min(.)) * 2 - 1))
+}
 
-for (i in seq_len(nrow(df_signals))) {
-
-  if (verbose) print(paste('---> doing', df_signals$date[i]))
+for (index in seq_len(nrow(signal_dates))) {
 
-  # Training set
-  df_trainx <- df_x %>% filter(date < df_signals$date[i])
-  df_trainx <- df_trainx %>% filter(date != max(date))
+  training_filter <- features_data$date < signal_dates$date[index]
 
-  df_trainy <- df_y %>% filter(date < df_signals$date[i])
-  df_trainy <- df_trainy %>% filter(date != max(date))
+  training_features <- features_data %>%
+    filter(training_filter) %>%
+    filter(date != max(date)) %>%
+    normalize_features()
 
-  # Test set
-  df_testx <- df_x %>% filter(date >= df_signals$date[i])
-  df_testy <- df_y %>% filter(date >= df_signals$date[i])
+  training_labels <- labels_data %>%
+    filter(training_filter) %>%
+    filter(date != max(date)) %>%
+    mutate(label = as.factor(label))
 
-  # Scale data
-  for (col in list_vars1) {
-    scaler <- function(x) (x - min(x)) / (max(x) - min(x)) * 2 - 1
-    df_trainx[[col]] <- scaler(df_trainx[[col]])
-    df_testx[[col]] <- scaler(df_testx[[col]])
-  }
+  testing_features <- features_data %>%
+    filter(date >= signal_dates$date[index]) %>%
+    normalize_features()
 
-  df_trainy <- df_trainy %>% mutate(label = as.factor(label)) 
+  testing_labels <- labels_data %>%
+    filter(date >= signal_dates$date[index])
 
-  # Fit the Random Forest classifier
-  if (i == 1) {
-    clf <- randomForest(x = df_trainx[list_vars1], y = df_trainy$label, ntree = 10, mtry = sqrt(length(list_vars1)), nodesize = 1000)
+  # logistic model
+  if (index == 1) {
+    model <- glm(label ~ ., data = training_features %>% mutate(label = training_labels$label), family = binomial)
   }
 
-  # Predictions and accuracy
-  pred_probs <- predict(clf, newdata = df_testx[list_vars1], type = "prob")
-  df_testy$signal <- pred_probs[, 2]
-  df_testy$pred <- ifelse(pred_probs[, 2] > 0.5, 1, 0)
-  df_testy$count <- 1
+  predicted_probabilities <- predict(model, newdata = testing_features, type = "response")
+  testing_labels <- testing_labels %>%
+    mutate(signal_strength = predicted_probabilities,
+           predicted_label = ifelse(predicted_probabilities > 0.5, 1, 0))
 
-  df_current <- df_testy %>% filter(date == df_signals$date[i])
+  current_day_labels <- testing_labels %>% filter(date == signal_dates$date[index])
 
-  acc_total <- mean(df_testy$label == df_testy$pred)
-  acc_current <- mean(df_current$label == df_current$pred)
+  signal_dates$total_accuracy[index] <- mean(testing_labels$label == testing_labels$predicted_label)
+  signal_dates$current_accuracy[index] <- mean(current_day_labels$label == current_day_labels$predicted_label)
 
-  print(paste('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2)))
 
-  # Add accuracy and signal to dataframe
-  df_signals$acc_total[i] <- acc_total
-  df_signals$acc_current[i] <- acc_current
-
-  df_signals[i, df_current$security] <- df_current$signal
+  signal_dates[index, current_day_labels$security] <- current_day_labels$signal_strength
 }
 
-# Create buy matrix for payoff plot
-df_signals$`10th` <- apply(df_signals[df_sectors$security], 1, function(x) sort(x, decreasing = TRUE)[n_buys])
-df_index <- df_signals[df_sectors$security] >= df_signals$`10th`
+buy_thresholds <- apply(signal_dates[sector_data$security], 1, function(x) sort(x, decreasing = TRUE)[params$max_buys])
+buy_indices <- signal_dates[sector_data$security] >= buy_thresholds
 
-# Set 1 for top 10 strongest signals
-df_buys <- as.data.frame(matrix(0, nrow = nrow(df_signals), ncol = length(df_sectors$security)))
-colnames(df_buys) <- df_sectors$security
-df_buys[df_index] <- 1
+buy_matrix <- as.data.frame(ifelse(buy_indices, 1, 0), stringsAsFactors = FALSE)
+colnames(buy_matrix) <- sector_data$security
 
-# keep first 10 obs
-process_row <- function(row) {
-  ones_indices <- which(row == 1)  # Get the indices of 1's
-  if(length(ones_indices) > 10) {
-    row[ones_indices[11:length(ones_indices)]] <- 0  # Set ones after the first 10 to 0
+buy_matrix <- t(apply(buy_matrix, 1, function(row) {
+  if (sum(row) > params$max_buys) {
+    row[order(row, decreasing = TRUE)[(params$max_buys + 1):length(row)]] <- 0
   }
   return(row)
-}
-df_buys <- t(apply(df_buys, 1, process_row))
-
-# add dates
-library(zoo)
-# df_buys <- cbind(date = as.Date(df_signals$date), df_buys)
-df_buys <- cbind(date = df_signals$date, df_buys)
-
-
-# df_buys[,-1] %>% rowSums()
-
-
-# Plot signals
-ggplot(df_signals, aes(x = date)) + geom_line(aes(y = AAPL)) + ggtitle("AAPL Signals")
-image(t(df_buys[-1]))  # You might need to use other visualization for the buy signals
+}))
 
-# Create return matrix
-df_returns <- read.csv('data/returns.csv')
-df_returns$date <- as.Date(df_returns$date)
-df_returns <- df_returns %>% filter(date >= start_test)
-df_returns <- df_returns %>% pivot_wider(names_from = security, values_from = return1)
+buy_matrix <- cbind(date = signal_dates$date, buy_matrix)
 
-plot_payoff <- function(df_buys) {
+calculate_payoff <- function(buy_matrix, returns_data) {
+  normalized_buys <- buy_matrix[,-1] * (1 / params$max_buys)
+  returns_array <- as.matrix(returns_data[-1] + 1)
 
-  df <- df_buys[,-1]
+  total_payoff <- diag(normalized_buys %*% t(returns_array - 1))
+  cumulative_growth <- cumprod(1 + total_payoff)
 
-  if (sum(rowSums(df) == 10) != nrow(df)) {
-    stop("---> must have exactly 10 buys each day")
-  }
-
-  # df_payoff <- df[, 1, drop = FALSE]
-  df <- df * (1 / n_buys)  # equally weighted
-
-  df_payoff <- NULL
-  arr_ret <- as.matrix(df_returns[-1] + 1)
-  df_payoff$payoff <- diag(df %*% t(arr_ret-1))
-  df_payoff$tri <- cumprod(1+df_payoff$payoff)
+  payoffs <- tibble(date = returns_data$date, total_payoff = total_payoff, cumulative_growth = cumulative_growth)
 
-  df_payoff$date <- df_returns$date
-  df_payoff <- df_payoff %>% as_tibble()
+  final_payoff_percentage <- (tail(payoffs$cumulative_growth, 1) - 1) * 100
 
-
-  payoff_result <- (tail(df_payoff$tri, 1) - 1) * 100
-  print(paste("---> payoff for these buys between period", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_result, 2), "%"))
-
-  return(df_payoff)
+  return(payoffs)
 }
 
-df_payoff <- plot_payoff(df_buys)
-ggplot(df_payoff, aes(x = date, y = tri)) + geom_line() + ggtitle("Payoff Over Time")
-
+payoff_results <- calculate_payoff(buy_matrix, returns_data)
 
-print('---> R Script End')
+ggplot(payoff_results, aes(x = date, y = cumulative_growth)) +
+  geom_line() +
+  ggtitle("Predicted Payoff Over 6 Months") +
+  xlab("Date") +
+  ylab("Predicted Payoff")