caret_on_headers_nona.R

# coding: utf-8
############################################################################
# stratify by BT column: those are 100% sure digital, others can be either
############################################################################
rm(list=ls())
setwd(dir = "~/repos/mammo/learn_spotmag_from_dicom_headers")
#cell#
library(caret)
library(data.table)

library(pROC)
# install.packages(c("pROC"))
library(ggplot2)
library(fastmatch)

read.gz <- function(filename, ...){
  as.data.frame(fread(paste("zcat < ",filename),
                      header=TRUE,  fill = TRUE, ...))
}


fn_ids = "../tables/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz"
ids = read.gz(fn_ids, select="id")$id

fn_features = "../tables/mammo_dicom_headers/df_all_mammos_dicom_headers_selected_nona.tab.gz"
dffeatures = read.gz(fn_features, sep='\t')

# rownames(dffeatures) <- dffeatures$filename
print(nrow(dffeatures))
print(length(ids))

dffeatures <- dffeatures[fmatch(unique(ids), dffeatures$filename),]
dffeatures <- dffeatures[!is.na(dffeatures$filename),]

rm(ids)

collist = c("BodyPartThickness", "XRayTubeCurrentInuA",  "ContentTime",
            "DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
for (cc in collist){
  dffeatures[,cc] <- as.numeric(dffeatures[,cc])
}


# (head(as.numeric(dffeatures$BodyPartThickness)))
dtypes = sapply(dffeatures, class)

row.names(dffeatures) = dffeatures$filename
excludeCols <- c("filename",
                 "CollimatorLeftVerticalEdge",
                 "CollimatorLowerHorizontalEdge",
                 "DistanceSourceToEntrance",
                 "ExposuresOnDetectorSinceLastCalibration",
                 "ExposuresOnDetectorSinceManufactured",
                 "ShutterLowerHorizontalEdge",     
                 "ShutterRightVerticalEdge",
                 "XRayTubeCurrentInuA"
                 # "ManufacturerModelName"
)
dffeatures <- (dffeatures[, !(colnames(dffeatures) %in% excludeCols)])


catcols <- c('ViewModifierCodeMeaning',
             'ViewCodeValue',
             'DetectorActiveDimensionsMissing',
             'FieldOfViewOriginMissing',
             'Grid',
             'Manufacturer',
             'ManufacturerModelName')

for (cc in catcols){
  dffeatures[,cc] = paste0("=", dffeatures[,cc])
  dffeatures[,cc] = as.factor(dffeatures[,cc])
}

dffeatures[,"HighBit"] <- as.numeric(dffeatures[,"HighBit"])

colSums(sapply(dffeatures, is.na))

# Read labels ---------------------------------

fn.labelledset = "../tables/spotmag_predictions/train_test_split-2018-02-15-within7e5.csv"
# filelist.labelled = read.table(fn.labelledset, )
df.labelled = as.data.frame(fread(fn.labelledset))
rownames(df.labelled) <- df.labelled$id
vec.labelled = df.labelled$id
df.labelled$label <- as.factor(df.labelled$label)

#cell#

vec.labelled.valset = rownames(df.labelled[df.labelled$set == 'val',])
vec.labelled.tr_set = rownames(df.labelled[df.labelled$set == 'train',])
vec.labelled.ts_set = rownames(df.labelled[df.labelled$set == 'test',])
############################################################
dffeatures.labelled <- dffeatures[vec.labelled,]
dffeatures.labelled$label  <- df.labelled$label

dffeatures.labelled.devset <- dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset),]
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set,]
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set,]

table(dffeatures.labelled.tr_set$label)


goodrows <- 1 - colSums(sapply(dffeatures.labelled.tr_set, is.na)) / nrow(dffeatures.labelled.tr_set)

names(goodrows[goodrows<0.1])


for (cc in colnames(dffeatures.labelled.tr_set)){
  if (is.factor(dffeatures.labelled.tr_set[,cc]) ){
    setdiff_ = setdiff(dffeatures.labelled.ts_set[,cc], dffeatures.labelled.tr_set[,cc])
    if (length(setdiff_)>0){
      print(cc)
      print(setdiff_)
    }
  }
}


# GLMNET ---------------------------------------------------------------------

library(glmnet)
# Using glmnet to directly perform CV
set.seed(0)

x_train <- model.matrix( ~ .-1, dffeatures.labelled.tr_set[,!(colnames(dffeatures.labelled.tr_set) %in% c("label"))])
dim(x_train)

cvob1=cv.glmnet(x=x_train,
                y=dffeatures.labelled.tr_set[,"label"],
                family="binomial",alpha=1, 
                type.measure="auc", nfolds = 5, lambda = seq(0.001,0.1,by = 0.001),
                standardize=FALSE)
plot(cvob1)

control <- trainControl(method="cv", number=5, returnResamp="all",
                        classProbs=TRUE, summaryFunction=twoClassSummary)
#classProbs = TRUE

tuneGrid <- expand.grid(alpha=c(0.00, 0.25, 0.50, 0.75, 0.99, 1.00), lambda = 10^seq(-5,-2,0.5))
tune = list()
fits = list()
rocs = list()
for (ii in 1:5){
    glmnetFit <- train(label ~ ., data = dffeatures.labelled.tr_set, 
                       method = "glmnet",
                       na.action = na.pass,
                       tuneGrid=tuneGrid,
                       metric = "ROC",
                       trControl = control)
    fits[[ii]] <- glmnetFit
    tune[[ii]] <- glmnetFit$bestTune
    rocs[[ii]] <- max(glmnetFit$results$ROC)
}

tune

varImp(glmnetFit, scale=T)
as.data.frame(glmnetFit$bestTune)

saveRDS(glmnetFit, sprintf("glmnet.rds", Sys.Date()))

## Save predictions  ---------------------------------------------------------

dffeatures[,"predictions_glmnet"] = predict(glmnetFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special

write.table(dffeatures[,c("predictions_glmnet"), drop=F],
            file="all_predictions_glmnet.tab", quote=F, sep='\t')