Skip to content

Commit 5c6b7cb

Browse files
committed
WGS metrics
1 parent 1bb9b05 commit 5c6b7cb

File tree

4 files changed

+231
-0
lines changed

4 files changed

+231
-0
lines changed

Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,10 @@ TARGETS += bam_interval_metrics
437437
bam_interval_metrics :
438438
$(call RUN_MAKE,modules/qc/bam_interval_metrics.mk)
439439

440+
TARGETS += wgs_metrics
441+
wgs_metrics :
442+
$(call RUN_MAKE,modules/qc/wgs_metrics.mk)
443+
440444
TARGETS += rnaseq_metrics
441445
rnaseq_metrics :
442446
$(call RUN_MAKE,modules/qc/rnaseqMetrics.mk)

Makefile.inc

+2
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ COLLECT_ALIGNMENT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectAlignmentSummaryM
103103
COLLECT_INSERT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectInsertSizeMetrics $(PICARD_OPTS)
104104
COLLECT_OXOG_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectOxoGMetrics $(PICAD_OPTS)
105105
COLLECT_GC_BIAS = $(PICARD) -Xmx$(PICARD_MEM) CollectGcBiasMetrics $(PICARD_OPTS)
106+
COLLECT_WGS_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectWgsMetrics $(PICARD_OPTS)
107+
COLLECT_DUP_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectDuplicateMetrics $(PICARD_OPTS)
106108
BAM_INDEX = $(PICARD) -Xmx$(PICARD_MEM) BamIndexStats $(PICARD_OPTS)
107109
FIX_MATE = $(call FIX_MATE_MEM,$(PICARD_MEM))
108110
FIX_MATE_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/FixMateInformation.jar $(PICARD_OPTS) TMP_DIR=$(TMPDIR)

qc/wgs_metrics.mk

+116
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
include modules/Makefile.inc
2+
3+
LOGDIR ?= log/wgs_metrics.$(NOW)
4+
5+
wgs_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt) \
6+
$(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt) \
7+
$(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt) \
8+
$(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt) \
9+
$(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics_summary.txt) \
10+
$(foreach sample,$(SAMPLES),metrics/$(sample).wgs_metrics.txt) \
11+
$(foreach sample,$(SAMPLES),metrics/$(sample).duplicate_metrics.txt) \
12+
summary/idx_metrics.txt \
13+
summary/aln_metrics.txt \
14+
summary/insert_metrics.txt \
15+
summary/oxog_metrics.txt \
16+
summary/gc_metrics.txt \
17+
summary/wgs_metrics.txt \
18+
summary/duplicate_metrics.txt
19+
20+
SAMTOOLS_THREADS = 4
21+
SAMTOOLS_MEM_THREAD = 1G
22+
23+
GATK_THREADS = 4
24+
GATK_MEM_THREAD = 2G
25+
26+
define picard-metrics
27+
metrics/$1.idx_stats.txt : bam/$1.bam
28+
$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
29+
$$(BAM_INDEX) \
30+
INPUT=$$(<) \
31+
> $$(@)")
32+
33+
metrics/$1.aln_metrics.txt : bam/$1.bam
34+
$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
35+
$$(COLLECT_ALIGNMENT_METRICS) \
36+
REFERENCE_SEQUENCE=$$(REF_FASTA) \
37+
INPUT=$$(<) \
38+
OUTPUT=$$(@)")
39+
40+
metrics/$1.insert_metrics.txt : bam/$1.bam
41+
$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
42+
$$(COLLECT_INSERT_METRICS) \
43+
INPUT=$$(<) \
44+
OUTPUT=$$(@) \
45+
HISTOGRAM_FILE=metrics/$1.insert_metrics.pdf \
46+
MINIMUM_PCT=0.05")
47+
48+
metrics/$1.oxog_metrics.txt : bam/$1.bam
49+
$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
50+
$$(COLLECT_OXOG_METRICS) \
51+
REFERENCE_SEQUENCE=$$(REF_FASTA) \
52+
INPUT=$$(<) \
53+
OUTPUT=$$(@)")
54+
55+
metrics/$1.gc_metrics_summary.txt : bam/$1.bam
56+
$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
57+
$$(COLLECT_GC_BIAS) \
58+
INPUT=$$(<) \
59+
OUTPUT=metrics/$1.gc_metrics.txt \
60+
CHART_OUTPUT=metrics/$1.gc_metrics.pdf \
61+
REFERENCE_SEQUENCE=$$(REF_FASTA) \
62+
SUMMARY_OUTPUT=$$(@)")
63+
64+
metrics/$1.wgs_metrics.txt : bam/$1.bam
65+
$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
66+
$$(COLLECT_WGS_METRICS) \
67+
INPUT=$$(<) \
68+
OUTPUT=$$(@) \
69+
REFERENCE_SEQUENCE=$$(REF_FASTA)")
70+
71+
metrics/$1.duplicate_metrics.txt : bam/$1.bam
72+
$$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \
73+
$$(COLLECT_DUP_METRICS) \
74+
INPUT=$$(<) \
75+
METRICS_FILE=$$(@)")
76+
77+
endef
78+
$(foreach sample,$(SAMPLES),\
79+
$(eval $(call picard-metrics,$(sample))))
80+
81+
summary/idx_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt)
82+
$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
83+
$(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 1 --sample_names '$(SAMPLES)'")
84+
85+
summary/aln_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt)
86+
$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
87+
$(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 2 --sample_names '$(SAMPLES)'")
88+
89+
summary/insert_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt)
90+
$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
91+
$(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 3 --sample_names '$(SAMPLES)'")
92+
93+
summary/oxog_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt)
94+
$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
95+
$(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 4 --sample_names '$(SAMPLES)'")
96+
97+
summary/gc_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics_summary.txt)
98+
$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
99+
$(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 5 --sample_names '$(SAMPLES)'")
100+
101+
summary/wgs_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).wgs_metrics.txt)
102+
$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
103+
$(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 6 --sample_names '$(SAMPLES)'")
104+
105+
summary/duplicate_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).duplicate_metrics.txt)
106+
$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
107+
$(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 7 --sample_names '$(SAMPLES)'")
108+
109+
..DUMMY := $(shell mkdir -p version; \
110+
$(SAMTOOLS) --version >> version/wgs_metrics.txt; \
111+
echo "gatk3" >> version/wgs_metrics.txt; \
112+
$(GATK) --version >> version/wgs_metrics.txt; \
113+
echo "picard" >> version/wgs_metrics.txt)
114+
.SECONDARY:
115+
.DELETE_ON_ERROR:
116+
.PHONY: wgs_metrics

scripts/wgs_metrics.R

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env Rscript
2+
3+
suppressPackageStartupMessages(library("optparse"))
4+
suppressPackageStartupMessages(library("readr"))
5+
suppressPackageStartupMessages(library("dplyr"))
6+
suppressPackageStartupMessages(library("magrittr"))
7+
8+
if (!interactive()) {
9+
options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
10+
}
11+
12+
optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"),
13+
make_option("--sample_names", default = NA, type = 'character', help = "sample names"))
14+
parser = OptionParser(usage = "%prog", option_list = optList)
15+
arguments = parse_args(parser, positional_arguments = T)
16+
opt = arguments$options
17+
18+
if (as.numeric(opt$option)==1) {
19+
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
20+
metrics = list()
21+
for (i in 1:length(sample_names)) {
22+
metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".idx_stats.txt"),
23+
col_names = FALSE, col_types = cols(.default = col_character()))[-85,,drop=FALSE] %>%
24+
readr::type_convert() %>%
25+
dplyr::select(CHROMOSOME = X1,
26+
LENGTH = X2,
27+
ALIGNED_READS = X3) %>%
28+
dplyr::mutate(CHROMOSOME = gsub(pattern=" length=", replacement="", x=CHROMOSOME),
29+
ALIGNED_READS = gsub(pattern="Aligned= ", replacement="", x=ALIGNED_READS),
30+
SAMPLE_NAME = sample_names[i])
31+
}
32+
metrics = do.call(rbind, metrics)
33+
write_tsv(metrics, path="summary/idx_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
34+
35+
} else if (as.numeric(opt$option)==2) {
36+
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
37+
metrics = list()
38+
for (i in 1:length(sample_names)) {
39+
metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".aln_metrics.txt"),
40+
skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>%
41+
readr::type_convert() %>%
42+
dplyr::select(-SAMPLE, -READ_GROUP) %>%
43+
dplyr::mutate(SAMPLE_NAME = sample_names[i])
44+
}
45+
metrics = do.call(rbind, metrics)
46+
write_tsv(metrics, path="summary/aln_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
47+
48+
} else if (as.numeric(opt$option)==3) {
49+
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
50+
metrics = list()
51+
for (i in 1:length(sample_names)) {
52+
metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".insert_metrics.txt"),
53+
skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>%
54+
readr::type_convert() %>%
55+
dplyr::select(-SAMPLE, -READ_GROUP) %>%
56+
dplyr::mutate(SAMPLE_NAME = sample_names[i])
57+
}
58+
metrics = do.call(rbind, metrics)
59+
write_tsv(metrics, path="summary/insert_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
60+
61+
} else if (as.numeric(opt$option)==4) {
62+
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
63+
metrics = list()
64+
for (i in 1:length(sample_names)) {
65+
metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".oxog_metrics.txt"),
66+
skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>%
67+
readr::type_convert() %>%
68+
dplyr::rename(SAMPLE_NAME = SAMPLE_ALIAS)
69+
}
70+
metrics = do.call(rbind, metrics)
71+
write_tsv(metrics, path="summary/oxog_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
72+
73+
} else if (as.numeric(opt$option)==5) {
74+
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
75+
metrics = list()
76+
for (i in 1:length(sample_names)) {
77+
metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".gc_metrics.txt"),
78+
skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>%
79+
readr::type_convert() %>%
80+
dplyr::mutate(SAMPLE_NAME = sample_names[i])
81+
}
82+
metrics = do.call(rbind, metrics)
83+
write_tsv(metrics, path="summary/gc_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
84+
85+
} else if (as.numeric(opt$option)==6) {
86+
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
87+
metrics = list()
88+
for (i in 1:length(sample_names)) {
89+
metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".wgs_metrics.txt"),
90+
skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>%
91+
readr::type_convert() %>%
92+
dplyr::mutate(SAMPLE_NAME = sample_names[i])
93+
}
94+
metrics = do.call(rbind, metrics)
95+
write_tsv(metrics, path="summary/wgs_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
96+
97+
} else if (as.numeric(opt$option)==7) {
98+
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
99+
metrics = list()
100+
for (i in 1:length(sample_names)) {
101+
metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".duplicate_metrics.txt"),
102+
skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>%
103+
readr::type_convert() %>%
104+
dplyr::mutate(SAMPLE_NAME = sample_names[i])
105+
}
106+
metrics = do.call(rbind, metrics)
107+
write_tsv(metrics, path="summary/duplicate_metrics.txt", na = "NA", append = FALSE, col_names = TRUE)
108+
109+
}

0 commit comments

Comments
 (0)