WGS metrics

ndbrown6 · ndbrown6 · commit 5c6b7cb15d29 · 2023-09-13T12:11:17.000-04:00
diff --git a/Makefile b/Makefile
@@ -437,6 +437,10 @@ TARGETS += bam_interval_metrics
 bam_interval_metrics :
 	$(call RUN_MAKE,modules/qc/bam_interval_metrics.mk)
 
+TARGETS += wgs_metrics
+wgs_metrics : # delegates to the whole-genome QC module through RUN_MAKE
+	$(call RUN_MAKE,modules/qc/wgs_metrics.mk)
+
 TARGETS += rnaseq_metrics
 rnaseq_metrics :
 	$(call RUN_MAKE,modules/qc/rnaseqMetrics.mk)
diff --git a/Makefile.inc b/Makefile.inc
@@ -103,6 +103,8 @@ COLLECT_ALIGNMENT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectAlignmentSummaryM
 COLLECT_INSERT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectInsertSizeMetrics $(PICARD_OPTS)
 COLLECT_OXOG_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectOxoGMetrics $(PICAD_OPTS)
 COLLECT_GC_BIAS = $(PICARD) -Xmx$(PICARD_MEM) CollectGcBiasMetrics $(PICARD_OPTS)
+COLLECT_WGS_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectWgsMetrics $(PICARD_OPTS)# Picard CollectWgsMetrics launcher (comment abuts the value so no trailing blank is stored)
+COLLECT_DUP_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectDuplicateMetrics $(PICARD_OPTS)# Picard CollectDuplicateMetrics launcher
 BAM_INDEX = $(PICARD) -Xmx$(PICARD_MEM) BamIndexStats $(PICARD_OPTS)
 FIX_MATE = $(call FIX_MATE_MEM,$(PICARD_MEM))
 FIX_MATE_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/FixMateInformation.jar $(PICARD_OPTS) TMP_DIR=$(TMPDIR)
diff --git a/qc/wgs_metrics.mk b/qc/wgs_metrics.mk
@@ -0,0 +1,116 @@
+include modules/Makefile.inc
+
+LOGDIR ?= log/wgs_metrics.$(NOW)
+# Default goal: every per-sample metrics file, then the cohort-level summary tables built from them.
+wgs_metrics : $(SAMPLES:%=metrics/%.idx_stats.txt) \
+	      $(SAMPLES:%=metrics/%.aln_metrics.txt) \
+	      $(SAMPLES:%=metrics/%.insert_metrics.txt) \
+	      $(SAMPLES:%=metrics/%.oxog_metrics.txt) \
+	      $(SAMPLES:%=metrics/%.gc_metrics_summary.txt) \
+	      $(SAMPLES:%=metrics/%.wgs_metrics.txt) \
+	      $(SAMPLES:%=metrics/%.duplicate_metrics.txt) \
+	      $(addprefix summary/,idx_metrics.txt \
+				   aln_metrics.txt \
+				   insert_metrics.txt \
+				   oxog_metrics.txt \
+				   gc_metrics.txt \
+				   wgs_metrics.txt \
+				   duplicate_metrics.txt)
+	    
+SAMTOOLS_THREADS = 4
+SAMTOOLS_MEM_THREAD = 1G
+# NOTE(review): no rule visible in this file reads the SAMTOOLS_* settings above or the GATK_* settings below -- confirm they are still needed.
+GATK_THREADS = 4
+GATK_MEM_THREAD = 2G
+# picard-metrics(sample): rule template, each rule turning bam/<sample>.bam into one metrics/<sample>.*.txt file; $$ defers expansion until the template is eval'd. The first rule redirects BAM_INDEX stdout into its target.
+define picard-metrics
+metrics/$1.idx_stats.txt : bam/$1.bam
+	$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
+							$$(BAM_INDEX) \
+							INPUT=$$(<) \
+							> $$(@)")
+# Alignment summary metrics, computed against REF_FASTA.
+metrics/$1.aln_metrics.txt : bam/$1.bam
+	$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
+							$$(COLLECT_ALIGNMENT_METRICS) \
+							REFERENCE_SEQUENCE=$$(REF_FASTA) \
+							INPUT=$$(<) \
+							OUTPUT=$$(@)")
+# Insert-size metrics; the histogram is additionally written to metrics/<sample>.insert_metrics.pdf.
+metrics/$1.insert_metrics.txt : bam/$1.bam
+	$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
+							$$(COLLECT_INSERT_METRICS) \
+							INPUT=$$(<) \
+							OUTPUT=$$(@) \
+							HISTOGRAM_FILE=metrics/$1.insert_metrics.pdf \
+							MINIMUM_PCT=0.05")
+# OxoG metrics. NOTE(review): Makefile.inc defines COLLECT_OXOG_METRICS with PICAD_OPTS, apparently a misspelling of PICARD_OPTS -- confirm.
+metrics/$1.oxog_metrics.txt : bam/$1.bam
+	$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
+							$$(COLLECT_OXOG_METRICS) \
+							REFERENCE_SEQUENCE=$$(REF_FASTA) \
+							INPUT=$$(<) \
+							OUTPUT=$$(@)")
+# GC-bias metrics: the target is the SUMMARY_OUTPUT; the detail table (gc_metrics.txt) and chart (gc_metrics.pdf) are side outputs of the same recipe.
+metrics/$1.gc_metrics_summary.txt : bam/$1.bam
+	$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
+							$$(COLLECT_GC_BIAS) \
+							INPUT=$$(<) \
+							OUTPUT=metrics/$1.gc_metrics.txt \
+							CHART_OUTPUT=metrics/$1.gc_metrics.pdf \
+							REFERENCE_SEQUENCE=$$(REF_FASTA) \
+							SUMMARY_OUTPUT=$$(@)")
+# Whole-genome metrics via COLLECT_WGS_METRICS, computed against REF_FASTA.
+metrics/$1.wgs_metrics.txt : bam/$1.bam
+	$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
+							$$(COLLECT_WGS_METRICS) \
+							INPUT=$$(<) \
+							OUTPUT=$$(@) \
+							REFERENCE_SEQUENCE=$$(REF_FASTA)")
+# Duplicate metrics via COLLECT_DUP_METRICS; the output key here is METRICS_FILE rather than OUTPUT.
+metrics/$1.duplicate_metrics.txt : bam/$1.bam
+	$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
+							$$(COLLECT_DUP_METRICS) \
+							INPUT=$$(<) \
+							METRICS_FILE=$$(@)")
+
+endef
+$(foreach sample,$(SAMPLES),\
+	$(eval $(call picard-metrics,$(sample))))
+# Merge the per-sample index statistics; wgs_metrics.R (option 1) writes the target itself, as no output path is passed.
+summary/idx_metrics.txt : $(SAMPLES:%=metrics/%.idx_stats.txt)
+	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
+					  $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 1 --sample_names '$(SAMPLES)'")
+# Merge the per-sample alignment summary metrics; wgs_metrics.R (option 2) writes the target itself.
+summary/aln_metrics.txt : $(SAMPLES:%=metrics/%.aln_metrics.txt)
+	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
+					  $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 2 --sample_names '$(SAMPLES)'")
+# Merge the per-sample insert-size metrics; wgs_metrics.R (option 3) writes the target itself.
+summary/insert_metrics.txt : $(SAMPLES:%=metrics/%.insert_metrics.txt)
+	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
+					  $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 3 --sample_names '$(SAMPLES)'")
+# Merge the per-sample OxoG metrics; wgs_metrics.R (option 4) writes the target itself.
+summary/oxog_metrics.txt : $(SAMPLES:%=metrics/%.oxog_metrics.txt)
+	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
+					  $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 4 --sample_names '$(SAMPLES)'")
+# Merge GC-bias metrics; wgs_metrics.R (option 5) reads metrics/<sample>.gc_metrics.txt, a side output of the gc_metrics_summary recipe, and writes the target itself.
+summary/gc_metrics.txt : $(SAMPLES:%=metrics/%.gc_metrics_summary.txt)
+	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
+					  $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 5 --sample_names '$(SAMPLES)'")
+# Merge the per-sample whole-genome metrics; wgs_metrics.R (option 6) writes the target itself.
+summary/wgs_metrics.txt : $(SAMPLES:%=metrics/%.wgs_metrics.txt)
+	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
+					  $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 6 --sample_names '$(SAMPLES)'")
+# Merge the per-sample duplicate metrics; wgs_metrics.R (option 7) writes the target itself.
+summary/duplicate_metrics.txt : $(SAMPLES:%=metrics/%.duplicate_metrics.txt)
+	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
+					  $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 7 --sample_names '$(SAMPLES)'")
+# Record tool versions at parse time; the first write truncates so repeated runs do not pile up duplicate entries. Only the label "picard" is written, no Picard version is queried.
+..DUMMY := $(shell mkdir -p version; \
+	     $(SAMTOOLS) --version > version/wgs_metrics.txt; \
+	     echo "gatk3" >> version/wgs_metrics.txt; \
+	     $(GATK) --version >> version/wgs_metrics.txt; \
+	     echo "picard" >> version/wgs_metrics.txt)
+.SECONDARY: # no prerequisites: all targets are treated as secondary, so intermediate files are not auto-deleted
+.DELETE_ON_ERROR: # delete a target whose recipe fails, so a partial file never looks up to date
+.PHONY: wgs_metrics # a goal name, not a file on disk
diff --git a/scripts/wgs_metrics.R b/scripts/wgs_metrics.R
@@ -0,0 +1,109 @@
+#!/usr/bin/env Rscript
+
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library("readr"))
+suppressPackageStartupMessages(library("dplyr"))
+suppressPackageStartupMessages(library("magrittr"))
+# Batch runs: silence warnings and, on any error, print the traceback and exit with status 1.
+if (!interactive()) {
+    options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
+}
+# --option selects which metrics table to build (1-7); --sample_names is a single space-separated string of sample names.
+optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"),
+               make_option("--sample_names", default = NA, type = 'character', help = "sample names"))
+parser = OptionParser(usage = "%prog", option_list = optList)
+arguments = parse_args(parser, positional_arguments = TRUE)
+opt = arguments$options
+if (!(suppressWarnings(as.numeric(opt$option)) %in% 1:7)) stop("--option must be an integer from 1 to 7, got: ", opt$option)
+if (as.numeric(opt$option)==1) {
+	# Option 1: stack each sample's metrics/<sample>.idx_stats.txt into summary/idx_metrics.txt, tagged with SAMPLE_NAME.
+	# Names are split on whitespace runs and empty tokens dropped, so repeated blanks cannot yield an empty sample name.
+	sample_names = Filter(nzchar, unlist(strsplit(x=as.character(opt$sample_names), split="[[:space:]]+")))
+	metrics = lapply(sample_names, function(sample_name) {
+		readr::read_tsv(file = paste0("metrics/", sample_name, ".idx_stats.txt"),
+				col_names = FALSE, col_types = cols(.default = col_character())) %>%
+			# Keep rows with a length field only: drops the field-less trailer line by content, not by the fixed row number 85.
+			dplyr::filter(!is.na(X2)) %>%
+			readr::type_convert() %>%
+			dplyr::select(CHROMOSOME = X1, LENGTH = X2, ALIGNED_READS = X3) %>%
+			dplyr::mutate(CHROMOSOME = gsub(pattern=" length=", replacement="", x=CHROMOSOME),
+				      ALIGNED_READS = gsub(pattern="Aligned= ", replacement="", x=ALIGNED_READS),
+				      SAMPLE_NAME = sample_name)
+	})
+	# The output path is positional because readr calls that argument `path` or `file` depending on its version.
+	write_tsv(do.call(rbind, metrics), "summary/idx_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
+} else if (as.numeric(opt$option)==2) {
+	# Option 2: stack each sample's metrics/<sample>.aln_metrics.txt (six lines skipped, line 7 is the header) into summary/aln_metrics.txt.
+	# Names are split on whitespace runs and empty tokens dropped, so repeated blanks cannot yield an empty sample name.
+	sample_names = Filter(nzchar, unlist(strsplit(x=as.character(opt$sample_names), split="[[:space:]]+")))
+	metrics = lapply(sample_names, function(sample_name) {
+		readr::read_tsv(file = paste0("metrics/", sample_name, ".aln_metrics.txt"),
+				skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>%
+			readr::type_convert() %>%
+			dplyr::select(-SAMPLE, -READ_GROUP) %>%
+			dplyr::mutate(SAMPLE_NAME = sample_name)
+	})
+	# The output path is positional because readr calls that argument `path` or `file` depending on its version.
+	write_tsv(do.call(rbind, metrics), "summary/aln_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
+} else if (as.numeric(opt$option)==3) {
+	# Option 3: stack the first data row (six lines skipped, line 7 is the header) of each metrics/<sample>.insert_metrics.txt into summary/insert_metrics.txt.
+	# Names are split on whitespace runs and empty tokens dropped, so repeated blanks cannot yield an empty sample name.
+	sample_names = Filter(nzchar, unlist(strsplit(x=as.character(opt$sample_names), split="[[:space:]]+")))
+	metrics = lapply(sample_names, function(sample_name) {
+		readr::read_tsv(file = paste0("metrics/", sample_name, ".insert_metrics.txt"),
+				skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>%
+			readr::type_convert() %>%
+			dplyr::select(-SAMPLE, -READ_GROUP) %>%
+			dplyr::mutate(SAMPLE_NAME = sample_name)
+	})
+	# The output path is positional because readr calls that argument `path` or `file` depending on its version.
+	write_tsv(do.call(rbind, metrics), "summary/insert_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
+} else if (as.numeric(opt$option)==4) {
+	# Option 4: stack each metrics/<sample>.oxog_metrics.txt (six lines skipped, line 7 is the header) into summary/oxog_metrics.txt; SAMPLE_ALIAS becomes SAMPLE_NAME.
+	# Names are split on whitespace runs and empty tokens dropped, so repeated blanks cannot yield an empty sample name.
+	sample_names = Filter(nzchar, unlist(strsplit(x=as.character(opt$sample_names), split="[[:space:]]+")))
+	metrics = lapply(sample_names, function(sample_name) {
+		readr::read_tsv(file = paste0("metrics/", sample_name, ".oxog_metrics.txt"),
+				skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>%
+			readr::type_convert() %>%
+			dplyr::rename(SAMPLE_NAME = SAMPLE_ALIAS)
+	})
+	# The output path is positional because readr calls that argument `path` or `file` depending on its version.
+	write_tsv(do.call(rbind, metrics), "summary/oxog_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
+} else if (as.numeric(opt$option)==5) {
+	# Option 5: stack each metrics/<sample>.gc_metrics.txt (the detail table, not the *_summary file; six lines skipped) into summary/gc_metrics.txt.
+	# Names are split on whitespace runs and empty tokens dropped, so repeated blanks cannot yield an empty sample name.
+	sample_names = Filter(nzchar, unlist(strsplit(x=as.character(opt$sample_names), split="[[:space:]]+")))
+	metrics = lapply(sample_names, function(sample_name) {
+		readr::read_tsv(file = paste0("metrics/", sample_name, ".gc_metrics.txt"),
+				skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>%
+			readr::type_convert() %>%
+			dplyr::mutate(SAMPLE_NAME = sample_name)
+	})
+	# The output path is positional because readr calls that argument `path` or `file` depending on its version.
+	write_tsv(do.call(rbind, metrics), "summary/gc_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
+} else if (as.numeric(opt$option)==6) {
+	# Option 6: stack the first data row (six lines skipped, line 7 is the header) of each metrics/<sample>.wgs_metrics.txt into summary/wgs_metrics.txt.
+	# Names are split on whitespace runs and empty tokens dropped, so repeated blanks cannot yield an empty sample name.
+	sample_names = Filter(nzchar, unlist(strsplit(x=as.character(opt$sample_names), split="[[:space:]]+")))
+	metrics = lapply(sample_names, function(sample_name) {
+		readr::read_tsv(file = paste0("metrics/", sample_name, ".wgs_metrics.txt"),
+				skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>%
+			readr::type_convert() %>%
+			dplyr::mutate(SAMPLE_NAME = sample_name)
+	})
+	# The output path is positional because readr calls that argument `path` or `file` depending on its version.
+	write_tsv(do.call(rbind, metrics), "summary/wgs_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
+} else if (as.numeric(opt$option)==7) {
+	# Option 7: stack the first data row (six lines skipped, line 7 is the header) of each metrics/<sample>.duplicate_metrics.txt into summary/duplicate_metrics.txt.
+	# Names are split on whitespace runs and empty tokens dropped, so repeated blanks cannot yield an empty sample name.
+	sample_names = Filter(nzchar, unlist(strsplit(x=as.character(opt$sample_names), split="[[:space:]]+")))
+	metrics = lapply(sample_names, function(sample_name) {
+		readr::read_tsv(file = paste0("metrics/", sample_name, ".duplicate_metrics.txt"),
+				skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>%
+			readr::type_convert() %>%
+			dplyr::mutate(SAMPLE_NAME = sample_name)
+	})
+	# The output path is positional because readr calls that argument `path` or `file` depending on its version.
+	write_tsv(do.call(rbind, metrics), "summary/duplicate_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
+}