PIM-Data-Science · ClariS914 · Oct 5, 2024
diff --git a/solution.R b/solution.R
@@ -1,160 +1,175 @@
 # Load necessary libraries
-library(dplyr)
-library(lubridate)
-library(ggplot2)
-library(randomForest)
-library(tidyr)
-library(pacman)
-# pacman::p_load(randomForest)
-
-print('---> R Script Start')
-
-# Define parameters
-start_train <- as.Date("2017-01-01")
-end_train <- as.Date("2023-11-30")
-start_test <- as.Date("2024-01-01")
-end_test <- as.Date("2024-06-30")
+# Note: Since you prefer not to use additional packages, I'll use base R functions where possible.
 
-n_buys <- 10
-verbose <- FALSE
+# Load data
+data0 <- read.csv('data0.csv', stringsAsFactors = FALSE)
+data1 <- read.csv('data1.csv', stringsAsFactors = FALSE)
+returns <- read.csv('returns.csv', stringsAsFactors = FALSE)
 
-print('---> initial data set up')
+# Merge data
+data <- merge(data1, data0, by = 'security', all.x = TRUE)
+data$security <- as.factor(data$security)
+data$sector <- as.factor(data$sector)
 
-# Load sector data
-df_sectors <- read.csv('data/data0.csv')
+# Create the new sector variable
+data$sector_grouped <- ifelse(data$sector %in% c('Information Technology', 'Staples', 'Consumer Discretionary'),
+                              as.character(data$sector), 'other')
 
-# Load price and financial data
-df_data <- read.csv('data/data1.csv')
-df_data$date <- as.Date(df_data$date)
+# Convert 'sector_grouped' to a factor
+data$sector_grouped <- as.factor(data$sector_grouped)
 
-df_x <- df_data %>% select(date, security, price, return30, ratio_pe, ratio_pcf, ratio_de, ratio_roe, ratio_roa)
-df_y <- df_data %>% select(date, security, label)
 
-list_vars1 <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa')
+# Convert date columns
+data$date <- as.Date(data$date)
+returns$date <- as.Date(returns$date)
 
-# Create signals DataFrame
-df_signals <- data.frame(date = unique(df_x$date[df_x$date >= start_test & df_x$date <= end_test]))
-df_signals <- df_signals %>% arrange(date)
+# Sort data
+data <- data[order(data$date, data$security), ]
 
-# Initialize an empty list for storing accuracy results
-df_signals$acc_total <- NA
-df_signals$acc_current <- NA
+# Split data into 70:30 ratio
+unique_dates <- sort(unique(data$date))
+split_index <- floor(length(unique_dates) * 0.7)
+train_dates <- unique_dates[1:split_index]
+test_dates <- unique_dates[(split_index + 1):length(unique_dates)]
 
-for (i in seq_len(nrow(df_signals))) {
-
-  if (verbose) print(paste('---> doing', df_signals$date[i]))
-
-  # Training set
-  df_trainx <- df_x %>% filter(date < df_signals$date[i])
-  df_trainx <- df_trainx %>% filter(date != max(date))
-
-  df_trainy <- df_y %>% filter(date < df_signals$date[i])
-  df_trainy <- df_trainy %>% filter(date != max(date))
-
-  # Test set
-  df_testx <- df_x %>% filter(date >= df_signals$date[i])
-  df_testy <- df_y %>% filter(date >= df_signals$date[i])
-
-  # Scale data
-  for (col in list_vars1) {
-    scaler <- function(x) (x - min(x)) / (max(x) - min(x)) * 2 - 1
-    df_trainx[[col]] <- scaler(df_trainx[[col]])
-    df_testx[[col]] <- scaler(df_testx[[col]])
-  }
-
-  df_trainy <- df_trainy %>% mutate(label = as.factor(label)) 
-
-  # Fit the Random Forest classifier
-  if (i == 1) {
-    clf <- randomForest(x = df_trainx[list_vars1], y = df_trainy$label, ntree = 10, mtry = sqrt(length(list_vars1)), nodesize = 1000)
-  }
-
-  # Predictions and accuracy
-  pred_probs <- predict(clf, newdata = df_testx[list_vars1], type = "prob")
-  df_testy$signal <- pred_probs[, 2]
-  df_testy$pred <- ifelse(pred_probs[, 2] > 0.5, 1, 0)
-  df_testy$count <- 1
-
-  df_current <- df_testy %>% filter(date == df_signals$date[i])
-
-  acc_total <- mean(df_testy$label == df_testy$pred)
-  acc_current <- mean(df_current$label == df_current$pred)
-
-  print(paste('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2)))
+train_data <- subset(data, date %in% train_dates)
+test_data <- subset(data, date %in% test_dates)
+
+# Prepare training data
+train_data$label <- as.factor(train_data$label)
+train_data$security <- as.factor(train_data$security)
+train_data$sector <- as.factor(train_data$sector)
+
+# Fit the models
+full_model <- glm(label ~ sector_grouped + price + return30 + ratio_pe + ratio_pcf + ratio_de + ratio_roe + ratio_roa,
+                  data = train_data, family = binomial)
+null_model <- glm(label ~ 1, data = train_data, family = binomial)
+
+# Stepwise selection (both directions)
+stepwise_model <- step(null_model, scope = list(lower = null_model, upper = full_model),
+                       direction = "both", trace = TRUE)
+
+# Predict probabilities on test_data
+predicted_probabilities <- predict(stepwise_model, newdata = test_data, type = "response")
+
+# Add predicted probabilities to test_data
+test_data$predicted_probability <- predicted_probabilities
+
+# Create 'prob_data' data frame with necessary columns
+prob_data <- data.frame(
+  date = test_data$date,
+  company = test_data$security,
+  probability = test_data$predicted_probability
+)
+
+# Ensure 'date' and 'company' are appropriate types
+prob_data$date <- as.Date(prob_data$date)
+prob_data$company <- as.character(prob_data$company)
+
+# Get list of all unique dates and companies
+all_dates <- sort(unique(prob_data$date))
+all_companies <- sort(unique(prob_data$company))
+
+# Create complete grid of all date-company combinations
+complete_grid <- expand.grid(date = all_dates, company = all_companies)
+
+# Merge 'prob_data' with 'complete_grid' to ensure all combinations are present
+prob_data_complete <- merge(complete_grid, prob_data, by = c("date", "company"), all.x = TRUE)
+
+# Reshape data into matrix with dates as rows and companies as columns using 'xtabs'
+prob_matrix <- xtabs(probability ~ date + company, data = prob_data_complete)
+prob_matrix <- as.matrix(prob_matrix)
+
+# Replace NA values with -Inf to handle missing probabilities
+prob_matrix[is.na(prob_matrix)] <- 0
+
+# Initialize the binary matrix
+prob_binary_matrix <- matrix(0, nrow = nrow(prob_matrix), ncol = ncol(prob_matrix))
+rownames(prob_binary_matrix) <- rownames(prob_matrix)
+colnames(prob_binary_matrix) <- colnames(prob_matrix)
+
+# Loop over each date to select top 10 companies
+for (i in 1:nrow(prob_matrix)) {
+  # Extract probabilities for the current date
+  probs <- prob_matrix[i, ]
 
-  # Add accuracy and signal to dataframe
-  df_signals$acc_total[i] <- acc_total
-  df_signals$acc_current[i] <- acc_current
+  # Get indices of top 10 probabilities
+  top10_indices <- order(probs, decreasing = TRUE)[1:10]
 
-  df_signals[i, df_current$security] <- df_current$signal
+  # Assign 1 to the top 10 companies in the binary matrix
+  prob_binary_matrix[i, top10_indices] <- 1
 }
 
-# Create buy matrix for payoff plot
-df_signals$`10th` <- apply(df_signals[df_sectors$security], 1, function(x) sort(x, decreasing = TRUE)[n_buys])
-df_index <- df_signals[df_sectors$security] >= df_signals$`10th`
-
-# Set 1 for top 10 strongest signals
-df_buys <- as.data.frame(matrix(0, nrow = nrow(df_signals), ncol = length(df_sectors$security)))
-colnames(df_buys) <- df_sectors$security
-df_buys[df_index] <- 1
-
-# keep first 10 obs
-process_row <- function(row) {
-  ones_indices <- which(row == 1)  # Get the indices of 1's
-  if(length(ones_indices) > 10) {
-    row[ones_indices[11:length(ones_indices)]] <- 0  # Set ones after the first 10 to 0
-  }
-  return(row)
+# Verify that each row has exactly 10 ones
+row_sums <- rowSums(prob_binary_matrix)
+if (all(row_sums == 10)) {
+  cat("All dates have 10 companies selected.\n")
+} else {
+  cat("Some dates do not have exactly 10 companies selected.\n")
 }
-df_buys <- t(apply(df_buys, 1, process_row))
 
-# add dates
-library(zoo)
-# df_buys <- cbind(date = as.Date(df_signals$date), df_buys)
-df_buys <- cbind(date = df_signals$date, df_buys)
+# Now, integrate the payoff calculation
 
+# Read returns data
+df_returns <- read.csv('returns.csv', stringsAsFactors = FALSE)
+df_returns$date <- as.Date(df_returns$date)
 
-# df_buys[,-1] %>% rowSums()
+# Filter df_returns to include only dates in the test set
+start_test <- min(test_data$date)
+end_test <- max(test_data$date)
+df_returns <- subset(df_returns, date >= start_test & date <= end_test)
 
+# Reshape df_returns to wide format with dates as rows and securities as columns
+# Assuming df_returns has columns: 'date', 'security', 'return1'
+# Use 'xtabs' to reshape
+returns_matrix <- xtabs(return1 ~ date + security, data = df_returns)
+returns_matrix <- as.matrix(returns_matrix)
 
-# Plot signals
-ggplot(df_signals, aes(x = date)) + geom_line(aes(y = AAPL)) + ggtitle("AAPL Signals")
-image(t(df_buys[-1]))  # You might need to use other visualization for the buy signals
+# Replace NA values with 0 (assuming no gain or loss if return is missing)
+returns_matrix[is.na(returns_matrix)] <- 0
 
-# Create return matrix
-df_returns <- read.csv('data/returns.csv')
-df_returns$date <- as.Date(df_returns$date)
-df_returns <- df_returns %>% filter(date >= start_test)
-df_returns <- df_returns %>% pivot_wider(names_from = security, values_from = return1)
+# Ensure that the dates and companies in returns_matrix match those in prob_binary_matrix
+common_dates <- intersect(rownames(prob_binary_matrix), rownames(returns_matrix))
+common_companies <- intersect(colnames(prob_binary_matrix), colnames(returns_matrix))
 
-plot_payoff <- function(df_buys) {
-
-  df <- df_buys[,-1]
-
-  if (sum(rowSums(df) == 10) != nrow(df)) {
-    stop("---> must have exactly 10 buys each day")
-  }
-
-  # df_payoff <- df[, 1, drop = FALSE]
-  df <- df * (1 / n_buys)  # equally weighted
+# Subset both matrices to the common dates and companies
+prob_binary_matrix_sub <- prob_binary_matrix[common_dates, common_companies]
+returns_matrix_sub <- returns_matrix[common_dates, common_companies]
 
-  df_payoff <- NULL
-  arr_ret <- as.matrix(df_returns[-1] + 1)
-  df_payoff$payoff <- diag(df %*% t(arr_ret-1))
-  df_payoff$tri <- cumprod(1+df_payoff$payoff)
-
-  df_payoff$date <- df_returns$date
-  df_payoff <- df_payoff %>% as_tibble()
-
-
-  payoff_result <- (tail(df_payoff$tri, 1) - 1) * 100
-  print(paste("---> payoff for these buys between period", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_result, 2), "%"))
-
-  return(df_payoff)
-}
+# Ensure matrices are aligned
+prob_binary_matrix_sub <- prob_binary_matrix_sub[order(rownames(prob_binary_matrix_sub)), order(colnames(prob_binary_matrix_sub))]
+returns_matrix_sub <- returns_matrix_sub[order(rownames(returns_matrix_sub)), order(colnames(returns_matrix_sub))]
+
+# Define the number of buys per day
+n_buys <- 10
+
+# Define the initial capital
+initial_capital <- 100
+
+# Calculate the payoff
+
+# Calculate equal weights for each buy
+df_buys_weighted <- prob_binary_matrix_sub * (1 / n_buys)  # Equally weighted
+
+# Calculate daily returns
+daily_returns <- rowSums(df_buys_weighted * returns_matrix_sub)
+
+# Calculate cumulative return starting from initial capital
+cumulative_return <- initial_capital * cumprod(1 + daily_returns)
+
+# Prepare data frame for analysis
+df_payoff <- data.frame(
+  date = as.Date(rownames(prob_binary_matrix_sub)),
+  daily_return = daily_returns,
+  cumulative_return = cumulative_return
+)
+
+# Calculate total payoff over the period
+total_payoff <- cumulative_return[length(cumulative_return)] - initial_capital
+payoff_percentage <- (total_payoff / initial_capital) * 100
 
-df_payoff <- plot_payoff(df_buys)
-ggplot(df_payoff, aes(x = date, y = tri)) + geom_line() + ggtitle("Payoff Over Time")
+print(paste("---> Payoff for these buys between", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_percentage, 2), "%"))
 
+# Plot cumulative return over time
+plot(df_payoff$date, df_payoff$cumulative_return, type = "l", xlab = "Date", ylab = "Cumulative Return", main = "Strategy Cumulative Return Over Time")
 
-print('---> R Script End')