-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: rcpm_comp.R
More file actions
executable file
·86 lines (76 loc) · 2.93 KB
/
rcpm_comp.R
File metadata and controls
executable file
·86 lines (76 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#################################################################
# Ridge-regression CPM (connectome-based predictive modeling)
# fitted on subjects with a complete (non-missing) phenotype.
#
# Args:
#   all_edges: numeric matrix, subjects x edges (features).
#   all_behav: data frame / matrix of phenotypes, subjects x measures.
#   id:        column (index or name) of the phenotype to predict.
#   threshold: p-value cutoff for univariate edge selection.
#   alpha:     glmnet mixing parameter (0 = ridge regression).
#   lambda:    candidate regularization path. NOTE(review): currently
#              unused -- cv.glmnet builds its own path; kept in the
#              signature for backward compatibility.
#   nfold:     number of outer cross-validation folds.
#   seed:      RNG seed for the CV split.
#
# Returns:
#   Cross-validated accuracy: mean over folds of sqrt(max(R^2, 0)),
#   where R^2 = 1 - MSE/var(test phenotype). Also printed via cat().
rcpm_comp <- function(all_edges, all_behav,
id, threshold = 0.01, alpha = 0,
lambda = 0.5*10^seq(-2, 10, length.out = 100), nfold = 10,
seed = 123){
  # Hard dependencies: fail loudly if missing (require() would only warn).
  library(glmnet)
  library(cvTools)
  # NOTE(review): these helpers are not used by this function; the source()
  # calls are kept because callers may rely on their side effects.
  source("impute_missing.R")
  source("nrmse.R")

  # Keep only subjects with an observed phenotype in column `id`.
  compIndex <- which(!is.na(all_behav[, id]))
  edge_comp <- all_edges[compIndex, , drop = FALSE]
  behav_comp <- all_behav[compIndex, , drop = FALSE]

  # Preallocate per-fold results instead of growing vectors.
  y.predict <- numeric(length(compIndex))
  mse_fold <- numeric(nfold)
  r_2_fold <- numeric(nfold)

  # Reproducible outer CV split.
  set.seed(seed)
  folds <- cvFolds(length(compIndex), K = nfold)

  for (i_fold in seq_len(nfold)) {
    testIndexes <- folds$subsets[folds$which == i_fold]
    # drop = FALSE keeps matrix shape even for a single-row fold or a
    # single selected edge (otherwise %*% / cv.glmnet would fail).
    test_mat <- edge_comp[testIndexes, , drop = FALSE]
    train_mat <- edge_comp[-testIndexes, , drop = FALSE]
    test_behav <- behav_comp[testIndexes, id]
    train_behav <- behav_comp[-testIndexes, id]

    # Univariate feature selection: keep edges whose Pearson correlation
    # with the training phenotype reaches p <= threshold.
    cor_pvalue <- apply(train_mat, 2, function(x) {
      cor.test(x, train_behav, method = "pearson")$p.value
    })
    edges_1 <- which(cor_pvalue <= threshold)

    # Inner CV picks the regularization strength (lambda.1se rule).
    model.cv <- cv.glmnet(x = train_mat[, edges_1, drop = FALSE],
                          y = train_behav,
                          family = "gaussian", alpha = alpha, nfolds = 10)
    lambda_idx <- which(model.cv$lambda == model.cv$lambda.1se)
    beta_fold <- as.vector(model.cv$glmnet.fit$beta[, lambda_idx])
    intercept_fold <- model.cv$glmnet.fit$a0[lambda_idx]

    # Predict the held-out fold with the selected-edge ridge model.
    y.predict[testIndexes] <- test_mat[, edges_1, drop = FALSE] %*%
      matrix(beta_fold) + intercept_fold

    mse_fold[i_fold] <- mean((y.predict[testIndexes] - test_behav)^2)
    r_2_fold[i_fold] <- 1 - mse_fold[i_fold] / var(test_behav)
  }

  # Clip negative out-of-sample R^2 to 0 so sqrt() cannot produce NaN
  # for folds that predict worse than the mean.
  r_2_fold[r_2_fold < 0] <- 0
  r_2 <- mean(sqrt(r_2_fold))
  cat("seed:", seed, "/r:", r_2, "\n")
  return(r_2)
}