diff --git a/solution.R b/solution.R index 16eb1c3..0cf69ed 100644 --- a/solution.R +++ b/solution.R @@ -1,14 +1,60 @@ # Load necessary libraries +rm(list=ls()) library(dplyr) library(lubridate) library(ggplot2) -library(randomForest) library(tidyr) -library(pacman) -# pacman::p_load(randomForest) +library(h2o) + + +# Initialize the H2O cluster +h2o.init(nthreads = -1) + +# Define parameters +start_train <- as.Date("2017-01-01") +end_train <- as.Date("2023-11-30") +start_test <- as.Date("2024-01-01") +end_test <- as.Date("2024-06-30") + +n_buys <- 10 +verbose <- FALSE + + + +# Load sector data +df_sectors <- read.csv('data/data0.csv') + +# Load price and financial data +df_data <- read.csv('data/data1.csv') +df_data$date <- as.Date(df_data$date) + +df_x <- df_data %>% select(date, security, price, return30, ratio_pe, ratio_pcf, ratio_de, ratio_roe, ratio_roa) +df_y <- df_data %>% select(date, security, label) + +list_vars1 <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa') + +# Create signals DataFrame +df_signals <- data.frame(date = unique(df_x$date[df_x$date >= start_test & df_x$date <= end_test])) +df_signals <- df_signals %>% arrange(date) + +# Initialize an empty list for storing accuracy results +df_signals$acc_total <- NA +df_signals$acc_current <- NA + + + +# Load necessary libraries +library(dplyr) +library(lubridate) +library(ggplot2) +library(tidyr) +library(h2o) print('---> R Script Start') +# Initialize the H2O cluster +h2o.init(nthreads = -1) + # Define parameters start_train <- as.Date("2017-01-01") end_train <- as.Date("2023-11-30") @@ -55,7 +101,7 @@ for (i in seq_len(nrow(df_signals))) { df_testx <- df_x %>% filter(date >= df_signals$date[i]) df_testy <- df_y %>% filter(date >= df_signals$date[i]) - # Scale data + # Scale data (optional, h2o handles scaling internally) for (col in list_vars1) { scaler <- function(x) (x - min(x)) / (max(x) - min(x)) * 2 - 1 df_trainx[[col]] <- scaler(df_trainx[[col]]) @@ -64,15 +110,33 @@ for (i in seq_len(nrow(df_signals))) { df_trainy <- df_trainy %>% mutate(label = as.factor(label)) - # Fit the Random Forest classifier - if (i == 1) { - clf <- randomForest(x = df_trainx[list_vars1], y = df_trainy$label, ntree = 10, mtry = sqrt(length(list_vars1)), nodesize = 1000) + # Convert to h2o frames + train_h2o <- as.h2o(cbind(df_trainx[list_vars1], label = df_trainy$label)) + test_h2o <- as.h2o(df_testx[list_vars1]) + + # Build and train the H2O deep learning model trains every 10 days + if (i %in% seq(1, nrow(df_signals), 10)) { + clf <- h2o.deeplearning( + x = list_vars1, + y = "label", + training_frame = train_h2o, + activation = "RectifierWithDropout", + hidden = c(64, 32), + epochs = 20, + input_dropout_ratio = 0.2, + hidden_dropout_ratios = c(0.5, 0.5), + balance_classes = TRUE, + stopping_metric = "misclassification", + stopping_rounds = 5, + verbose = verbose + ) } # Predictions and accuracy - pred_probs <- predict(clf, newdata = df_testx[list_vars1], type = "prob") - df_testy$signal <- pred_probs[, 2] - df_testy$pred <- ifelse(pred_probs[, 2] > 0.5, 1, 0) + pred_probs <- h2o.predict(clf, newdata = test_h2o) + pred_probs <- as.data.frame(pred_probs) + df_testy$signal <- pred_probs$p1 + df_testy$pred <- ifelse(pred_probs$p1 > 0.5, 1, 0) df_testy$count <- 1 df_current <- df_testy %>% filter(date == df_signals$date[i]) @@ -89,6 +153,19 @@ for (i in seq_len(nrow(df_signals))) { df_signals[i, df_current$security] <- df_current$signal } +# Continue with your signal plotting, buy matrix, and payoff calculations + +# Shutdown H2O cluster after completion +h2o.shutdown(prompt = FALSE) + +print('---> R Script End') + +# Continue with your signal plotting, buy matrix, and payoff calculations + +# Shutdown H2O cluster after completion +h2o.shutdown(prompt = FALSE) + + # Create buy matrix for payoff plot df_signals$`10th` <- apply(df_signals[df_sectors$security], 1, function(x) sort(x, decreasing = TRUE)[n_buys]) df_index <- df_signals[df_sectors$security] >= df_signals$`10th` @@ -137,7 +214,7 @@ plot_payoff <- function(df_buys) { # df_payoff <- df[, 1, drop = FALSE] df <- df * (1 / n_buys) # equally weighted - + df_payoff <- NULL arr_ret <- as.matrix(df_returns[-1] + 1) df_payoff$payoff <- diag(df %*% t(arr_ret-1)) @@ -157,4 +234,3 @@ df_payoff <- plot_payoff(df_buys) ggplot(df_payoff, aes(x = date, y = tri)) + geom_line() + ggtitle("Payoff Over Time") -print('---> R Script End') \ No newline at end of file