diff --git a/docker/lr-metrics/environment.yml b/docker/lr-metrics/environment.yml index 1414228d8..fdc08c739 100644 --- a/docker/lr-metrics/environment.yml +++ b/docker/lr-metrics/environment.yml @@ -5,6 +5,7 @@ channels: - bioconda - r dependencies: + - conda-forge::ncurses - samtools - bedtools - java-jdk diff --git a/docker/lr-mini/Dockerfile b/docker/lr-mini/Dockerfile new file mode 100644 index 000000000..096dab081 --- /dev/null +++ b/docker/lr-mini/Dockerfile @@ -0,0 +1,14 @@ +FROM us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2023-07-28-v0.28.1-beta-e70dfbd7 + +#install some small, commonly used tools +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends bsdmainutils datamash tree && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* + diff --git a/docker/lr-mini/Makefile b/docker/lr-mini/Makefile new file mode 100644 index 000000000..7d4253440 --- /dev/null +++ b/docker/lr-mini/Makefile @@ -0,0 +1,17 @@ +IMAGE_NAME = lr-mini +VERSION = 0.1.1 + +TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest + +all: | build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +build_no_cache: + docker build --no-cache -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-mosdepth/Dockerfile b/docker/lr-mosdepth/Dockerfile index 9a5acff9d..77232063e 100644 --- a/docker/lr-mosdepth/Dockerfile +++ b/docker/lr-mosdepth/Dockerfile @@ -10,7 +10,7 @@ RUN conda env create -f /environment.yml && conda clean -a ENV PATH=/opt/conda/envs/lr-mosdepth/bin/:/root/google-cloud-sdk/bin/:${PATH} RUN apt-get -y update && \ - apt-get -y install curl zlib1g-dev libcurl4-openssl-dev libbz2-dev liblzma-dev && \ + apt-get -y install curl zlib1g-dev libcurl4-openssl-dev libbz2-dev liblzma-dev datamash bsdmainutils && \ apt-get clean # install gsutil diff --git a/docker/lr-mosdepth/Makefile b/docker/lr-mosdepth/Makefile index 9b2693d2b..0186202bd 100644 --- a/docker/lr-mosdepth/Makefile +++ b/docker/lr-mosdepth/Makefile @@ -1,5 +1,5 @@ IMAGE_NAME = lr-mosdepth -VERSION = 0.3.1 # This should match the MosDepth version number +VERSION = 0.3.2 # This should match the MosDepth version number TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest diff --git a/wdl/deprecated/ONT10x.wdl b/wdl/deprecated/ONT10x.wdl index 8ce3068ea..f9a260b5d 100644 --- a/wdl/deprecated/ONT10x.wdl +++ b/wdl/deprecated/ONT10x.wdl @@ -1,10 +1,10 @@ version 1.0 +import "../structs/Structs.wdl" import "../tasks/Utility/Utils.wdl" as Utils import "../tasks/Utility/ONTUtils.wdl" as ONT import "tasks/C3POa.wdl" as C3 import "../tasks/Alignment/AlignReads.wdl" as AR -import "../tasks/QC/AlignedMetrics.wdl" as AM import "../tasks/Utility/Finalize.wdl" as FF workflow ONT10x { diff --git a/wdl/deprecated/PBCCS10x.wdl b/wdl/deprecated/PBCCS10x.wdl index 7b54cf212..d1658c393 100644 --- a/wdl/deprecated/PBCCS10x.wdl +++ b/wdl/deprecated/PBCCS10x.wdl @@ -2,7 +2,6 @@ version 1.0 import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils -import "../tasks/Alignment/AlignReads.wdl" as AR import "../tasks/QC/AlignedMetrics.wdl" as AM import "tasks/AnnotateAdapters.wdl" as AA import "../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl b/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl index 05a41636e..edf05806d 100644 --- a/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl +++ b/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl @@ -10,7 +10,6 @@ version 1.0 import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils -import "../tasks/Alignment/AlignReads.wdl" as AR import "../tasks/QC/AlignedMetrics.wdl" as AM import "../tasks/VariantCalling/CallVariantsPBCCS.wdl" as VAR import "../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl b/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl index 764a2ac55..429628a0d 100644 --- a/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl +++ b/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl @@ -8,7 +8,6 @@ version 1.0 import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils -import "../tasks/Alignment/AlignReads.wdl" as AR import "../tasks/QC/AlignedMetrics.wdl" as AM import "../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/deprecated/tasks/C3POa.wdl b/wdl/deprecated/tasks/C3POa.wdl index 1d82029a5..467c80d37 100644 --- a/wdl/deprecated/tasks/C3POa.wdl +++ b/wdl/deprecated/tasks/C3POa.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../../tasks/Utility/Utils.wdl" as Utils workflow C3POa { input { diff --git a/wdl/deprecated/tasks/GATKBestPractice.wdl b/wdl/deprecated/tasks/GATKBestPractice.wdl index 412ef5fdc..e38d24633 100644 --- a/wdl/deprecated/tasks/GATKBestPractice.wdl +++ b/wdl/deprecated/tasks/GATKBestPractice.wdl @@ -8,6 +8,7 @@ version 1.0 # "https://github.com/PacificBiosciences/hg002-ccs/" ########################################################################################## +import "../../structs/Structs.wdl" import "dsde_pipelines_tasks/GermlineVariantDiscovery.wdl" as Calling import "dsde_pipelines_tasks/Qc.wdl" as QC import "dsde_pipelines_tasks/Utilities.wdl" as DSDEPipelinesUtils diff --git a/wdl/deprecated/tasks/Scrapbook.wdl b/wdl/deprecated/tasks/Scrapbook.wdl new file mode 100644 index 000000000..c186e875b --- /dev/null +++ b/wdl/deprecated/tasks/Scrapbook.wdl @@ -0,0 +1,185 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task CoverageTrack { + input { + File bam + File bai + String chr + String start + String end + + RuntimeAttr? runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) + + command <<< + set -euxo pipefail + + samtools depth -a ~{bam} -r ~{chr}:~{start}-~{end} | bgzip > ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz + tabix -p bed ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz + >>> + + output { + File coverage = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz" + File coverage_tbi = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterMQ0Reads { + input { + File bam + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB")) + String prefix = basename(bam, ".bam") + + command <<< + set -euxo pipefail + + samtools view -q 1 -b ~{bam} > ~{prefix}.no_mq0.bam + samtools index ~{prefix}.no_mq0.bam + >>> + + output { + File no_mq0_bam = "~{prefix}.no_mq0.bam" + File no_mq0_bai = "~{prefix}.no_mq0.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ComputeBedCoverage { + input { + File bam + File bai + File bed + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB") + size(bed, "GB")) + + command <<< + set -euxo pipefail + + bedtools coverage -b ~{bed} -a ~{bam} -nobuf | gzip > ~{prefix}.txt.gz + zcat ~{prefix}.txt.gz | awk '{ sum += sprintf("%f", $15*$16) } END { printf("%f\n", sum) }' > ~{prefix}.count.txt + >>> + + output { + File coverage = "~{prefix}.txt.gz" + Float counts = read_float("~{prefix}.count.txt") + File counts_file = "~{prefix}.count.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task BamToBed { + input { + File bam + File bai + + RuntimeAttr? runtime_attr_override + } + + String bed = basename(bam, ".bam") + ".bed" + Int disk_size = 4*ceil(size(bam, "GB") + size(bai, "GB")) + + command <<< + set -euxo pipefail + + bedtools bamtobed -i ~{bam} > ~{bed} + >>> + + output { + File bedfile = bed + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl b/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl index c7cff9978..d7b4237bc 100644 --- a/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl +++ b/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl @@ -1,9 +1,8 @@ version 1.0 +import "../../../structs/Structs.wdl" import "../../../tasks/Utility/Utils.wdl" as Utils -import "../../../tasks/Utility/ONTUtils.wdl" as ONTUtils import "../../../tasks/Utility/VariantUtils.wdl" -import "../../../tasks/Preprocessing/Guppy.wdl" as Guppy import "../../../tasks/Utility/Finalize.wdl" as FF workflow ONTMethylation { @@ -733,4 +732,4 @@ task CallHaploidVariants { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) } -} \ No newline at end of file +} diff --git a/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl b/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl index ed387effa..d4d49ac48 100644 --- a/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl +++ b/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../../structs/Structs.wdl" -import "../../../tasks/Utility/Finalize.wdl" as FF workflow ONTPfHrp2Hrp3Status { @@ -86,7 +85,7 @@ task IsLocusDeleted { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.1" + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.2" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { diff --git a/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl b/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl index 8173fcf4e..8d09fe9fb 100644 --- a/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl +++ b/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../../tasks/Preprocessing/Guppy.wdl" as Guppy -import "../../../tasks/Utility/Finalize.wdl" as FF workflow ONTBasecall { diff --git a/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl b/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl index 7e0470bfb..be29a30b9 100644 --- a/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl +++ b/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl @@ -1,6 +1,5 @@ version 1.0 -import "../../../tasks/Utility/ONTUtils.wdl" as ONT import "../../../tasks/Utility/Utils.wdl" as Utils import "../../../tasks/VariantCalling/CallVariantsONT.wdl" as VAR import "../../../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl b/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl index 23da6a63f..fc73ed400 100644 --- a/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl +++ b/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl @@ -7,10 +7,13 @@ import "../../../tasks/Preprocessing/Longbow.wdl" as Longbow import "../../../tasks/QC/AlignedMetrics.wdl" as AM import "../../../tasks/Visualization/NanoPlot.wdl" as NP import "../../../tasks/Utility/Finalize.wdl" as FF - import "../../../tasks/Transcriptomics/MASSeq.wdl" as MAS -import "../../../tasks/Utility/JupyterNotebooks.wdl" as JUPYTER + +struct DataTypeParameters { + Int num_shards + String map_preset +} workflow PBFlowcell { diff --git a/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl b/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl index 84273ce53..c4a320245 100644 --- a/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl +++ b/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl @@ -2,7 +2,6 @@ version 1.0 import "../../../tasks/Utility/PBUtils.wdl" as PB import "../../../tasks/Utility/Utils.wdl" as Utils -import "../../../tasks/Alignment/AlignReads.wdl" as AR import "../../../tasks/Utility/Finalize.wdl" as FF workflow PBCCSIsoSeq { diff --git a/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl b/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl index c4050b63a..d29e94f83 100644 --- a/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl +++ b/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl @@ -2,10 +2,8 @@ version 1.0 import "../../../tasks/Utility/Utils.wdl" as Utils import "../../../tasks/Utility/PBUtils.wdl" as PB -import "../../../tasks/Alignment/AlignReads.wdl" as AR import "../../../tasks/Utility/StringTie2.wdl" as ST2 import "../../../tasks/Transcriptomics/MASSeq.wdl" as MAS -import "../../../tasks/Transcriptomics/UMI_Tools.wdl" as UMI_TOOLS import "../../../tasks/Transcriptomics/Preprocessing_Tasks.wdl" as TX_PRE import "../../../tasks/Transcriptomics/Postprocessing_Tasks.wdl" as TX_POST import "../../../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl b/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl index 14d54fa49..d6306030c 100644 --- a/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl +++ b/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl @@ -1,7 +1,5 @@ version 1.0 -import "../../../tasks/Utility/PBUtils.wdl" as PB -import "../../../tasks/Utility/Utils.wdl" as Utils import "../../../tasks/Utility/StringTie2.wdl" import "../../../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl b/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl index 310782a83..a0071cab7 100644 --- a/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl @@ -1,8 +1,6 @@ version 1.0 -import "../../../tasks/VariantCalling/GLNexus.wdl" as GLNexus import "../../../tasks/Utility/Hail.wdl" as Hail -import "../../../tasks/Utility/Finalize.wdl" as FF workflow ConvertToHailMT { diff --git a/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl b/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl index fb076fdd1..f411ef583 100644 --- a/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl @@ -1,5 +1,6 @@ version 1.0 +import "../../../structs/Structs.wdl" import "../../../tasks/Utility/Utils.wdl" as Utils workflow DownloadFromSRA { diff --git a/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl b/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl index 7d021e562..224b26ac1 100644 --- a/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl @@ -6,6 +6,7 @@ version 1.0 # reimagining of the Nextflow/AWS downloading pipeline from @alaincoletta (see: http://broad.io/aws_dl). ########################################################################################## +import "../../../structs/Structs.wdl" import "../../../tasks/Utility/Utils.wdl" as Utils workflow DownloadFromWeb { diff --git a/wdl/structs/Structs.wdl b/wdl/structs/Structs.wdl index eef96b9ce..4bed884f5 100644 --- a/wdl/structs/Structs.wdl +++ b/wdl/structs/Structs.wdl @@ -9,8 +9,3 @@ struct RuntimeAttr { Int? max_retries String? docker } - -struct DataTypeParameters { - Int num_shards - String map_preset -} diff --git a/wdl/tasks/Assembly/Hifiasm.wdl b/wdl/tasks/Assembly/Hifiasm.wdl index 878daa73b..3f0e8bcd5 100644 --- a/wdl/tasks/Assembly/Hifiasm.wdl +++ b/wdl/tasks/Assembly/Hifiasm.wdl @@ -2,8 +2,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../Utility/Utils.wdl" - workflow Hifiasm { meta { diff --git a/wdl/tasks/Preprocessing/Guppy.wdl b/wdl/tasks/Preprocessing/Guppy.wdl index 4ce7f51e5..7a3b57c03 100644 --- a/wdl/tasks/Preprocessing/Guppy.wdl +++ b/wdl/tasks/Preprocessing/Guppy.wdl @@ -9,9 +9,9 @@ version 1.0 # linearly but untested. ########################################################################################## +import "../../structs/Structs.wdl" import "../Utility/Utils.wdl" as Utils import "../Utility/ONTUtils.wdl" as ONT -import "../../structs/Structs.wdl" workflow Guppy { diff --git a/wdl/tasks/Preprocessing/Longbow.wdl b/wdl/tasks/Preprocessing/Longbow.wdl index d98561d32..f6f62283c 100644 --- a/wdl/tasks/Preprocessing/Longbow.wdl +++ b/wdl/tasks/Preprocessing/Longbow.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../Utility/Utils.wdl" import "../Transcriptomics/MASSeq.wdl" as MAS struct LongbowModelParams { diff --git a/wdl/tasks/QC/AlignedMetrics.wdl b/wdl/tasks/QC/AlignedMetrics.wdl index 7ad2ec933..6e60abd64 100644 --- a/wdl/tasks/QC/AlignedMetrics.wdl +++ b/wdl/tasks/QC/AlignedMetrics.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../Utility/Finalize.wdl" as FF workflow AlignedMetrics { @@ -11,7 +10,7 @@ workflow AlignedMetrics { parameter_meta { aligned_bam: "Aligned BAM file" aligned_bai: "Index for aligned BAM file" - ref_fasta: "Reference FASTA file" + ref_fasta: "Not used" ref_dict: "Reference dictionary file" gcs_output_dir: "GCS output directory" } @@ -20,13 +19,17 @@ workflow AlignedMetrics { File aligned_bam File aligned_bai - File ref_fasta + File? ref_fasta File ref_dict String? gcs_output_dir } - call ReadMetrics as AlignedReadMetrics { input: bam = aligned_bam } + call ReadMetrics { + input: + bam = aligned_bam, + gcs_output_dir = gcs_output_dir + } call MakeChrIntervalList { input: ref_dict = ref_dict } @@ -35,51 +38,12 @@ workflow AlignedMetrics { input: bam = aligned_bam, bai = aligned_bai, - chr = chr_info[0] - } - - call SummarizeDepth { input: regions = MosDepth.regions } - } - - call FlagStats as AlignedFlagStats { input: bam = aligned_bam } - - if (defined(gcs_output_dir)) { - String outdir = sub(gcs_output_dir + "", "/$", "") - - call FF.FinalizeToDir as FFYieldAligned { - input: - outdir = outdir + "/yield_aligned/", - files = [ - AlignedFlagStats.flag_stats, - AlignedReadMetrics.np_hist, - AlignedReadMetrics.range_gap_hist, - AlignedReadMetrics.zmw_hist, - AlignedReadMetrics.prl_counts, - AlignedReadMetrics.prl_hist, - AlignedReadMetrics.prl_nx, - AlignedReadMetrics.prl_yield_hist, - AlignedReadMetrics.rl_counts, - AlignedReadMetrics.rl_hist, - AlignedReadMetrics.rl_nx, - AlignedReadMetrics.rl_yield_hist - ] + chr = chr_info[0], + gcs_output_dir = gcs_output_dir } - - call FF.FinalizeToDir as FFCoverageFullDist { input: outdir = outdir + "/coverage/", files = MosDepth.full_dist } - call FF.FinalizeToDir as FFCoverageGlobalDist { input: outdir = outdir + "/coverage/", files = MosDepth.global_dist } - call FF.FinalizeToDir as FFCoverageRegionDist { input: outdir = outdir + "/coverage/", files = MosDepth.region_dist } - call FF.FinalizeToDir as FFCoverageRegions { input: outdir = outdir + "/coverage/", files = MosDepth.regions } - call FF.FinalizeToDir as FFCoverageRegionsCsi { input: outdir = outdir + "/coverage/", files = MosDepth.regions_csi } - call FF.FinalizeToDir as FFCoverageQuantizedDist { input: outdir = outdir + "/coverage/", files = MosDepth.quantized_dist } - call FF.FinalizeToDir as FFCoverageQuantized { input: outdir = outdir + "/coverage/", files = MosDepth.quantized } - call FF.FinalizeToDir as FFCoverageQuantizedCsi { input: outdir = outdir + "/coverage/", files = MosDepth.quantized_csi } - - call FF.FinalizeToDir as FFDepthSummaries { input: outdir = outdir + "/coverage_summaries/", files = SummarizeDepth.cov_summary } } output { - File aligned_flag_stats = AlignedFlagStats.flag_stats - Array[File] coverage_full_dist = MosDepth.full_dist Array[File] coverage_global_dist = MosDepth.global_dist Array[File] coverage_region_dist = MosDepth.region_dist @@ -88,18 +52,20 @@ workflow AlignedMetrics { Array[File] coverage_quantized_dist = MosDepth.quantized_dist Array[File] coverage_quantized = MosDepth.quantized Array[File] coverage_quantized_csi = MosDepth.quantized_csi - - File aligned_np_hist = AlignedReadMetrics.np_hist - File aligned_range_gap_hist = AlignedReadMetrics.range_gap_hist - File aligned_zmw_hist = AlignedReadMetrics.zmw_hist - File aligned_prl_counts = AlignedReadMetrics.prl_counts - File aligned_prl_hist = AlignedReadMetrics.prl_hist - File aligned_prl_nx = AlignedReadMetrics.prl_nx - File aligned_prl_yield_hist = AlignedReadMetrics.prl_yield_hist - File aligned_rl_counts = AlignedReadMetrics.rl_counts - File aligned_rl_hist = AlignedReadMetrics.rl_hist - File aligned_rl_nx = AlignedReadMetrics.rl_nx - File aligned_rl_yield_hist = AlignedReadMetrics.rl_yield_hist + Array[File] coverage_summary = MosDepth.cov_summary + + File aligned_np_hist = ReadMetrics.np_hist + File aligned_range_gap_hist = ReadMetrics.range_gap_hist + File aligned_zmw_hist = ReadMetrics.zmw_hist + File aligned_prl_counts = ReadMetrics.prl_counts + File aligned_prl_hist = ReadMetrics.prl_hist + File aligned_prl_nx = ReadMetrics.prl_nx + File aligned_prl_yield_hist = ReadMetrics.prl_yield_hist + File aligned_rl_counts = ReadMetrics.rl_counts + File aligned_rl_hist = ReadMetrics.rl_hist + File aligned_rl_nx = ReadMetrics.rl_nx + File aligned_rl_yield_hist = ReadMetrics.rl_yield_hist + File aligned_flag_stats = ReadMetrics.flag_stats File raw_chr_intervals = MakeChrIntervalList.raw_chrs } @@ -133,7 +99,7 @@ task MakeChrIntervalList { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + docker: "ubuntu:21.04" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -152,13 +118,15 @@ task MosDepth { File bam File bai String chr - Int? window_size + Int window_size = 500 + String? gcs_output_dir RuntimeAttr? runtime_attr_override } + String? coverage_dir = if defined(gcs_output_dir) then sub(select_first([gcs_output_dir]), "/?$", "/coverage/") else gcs_output_dir + String? summary_dir = if defined(gcs_output_dir) then sub(select_first([gcs_output_dir]), "/?$", "/coverage_summaries/") else gcs_output_dir Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) - Int ws = select_first([window_size, 500]) String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage.~{chr}" @@ -166,7 +134,7 @@ task MosDepth { set -euxo pipefail mosdepth -t 4 -c "~{chr}" -n -x -Q 1 ~{prefix}.full ~{bam} - mosdepth -t 4 -c "~{chr}" -n -x -Q 1 -b ~{ws} ~{prefix} ~{bam} + mosdepth -t 4 -c "~{chr}" -n -x -Q 1 -b ~{window_size} ~{prefix} ~{bam} export MOSDEPTH_Q0=NO_COVERAGE # 0 -- defined by the arguments to --quantize export MOSDEPTH_Q1=LOW_COVERAGE # 1..4 @@ -174,6 +142,25 @@ task MosDepth { export MOSDEPTH_Q3=HIGH_COVERAGE # 150 ... mosdepth -t 4 -c "~{chr}" -n -x -Q 1 --quantize 0:1:5:150: ~{prefix}.quantized ~{bam} + + ( echo 'chr start stop cov_mean cov_sd cov_q1 cov_median cov_q3 cov_iqr' && \ + zcat ~{prefix}.regions.bed.gz | datamash first 1 first 2 last 3 mean 4 sstdev 4 q1 4 median 4 q3 4 iqr 4 ) | \ + column -t > ~{prefix}.summary.txt + + if ~{defined(coverage_dir)}; then + gsutil -m cp \ + "~{prefix}.full.mosdepth.global.dist.txt" \ + "~{prefix}.mosdepth.global.dist.txt" \ + "~{prefix}.mosdepth.region.dist.txt" \ + "~{prefix}.regions.bed.gz" \ + "~{prefix}.regions.bed.gz.csi" \ + "~{prefix}.quantized.mosdepth.global.dist.txt" \ + "~{prefix}.quantized.quantized.bed.gz" \ + "~{prefix}.quantized.quantized.bed.gz.csi" \ + "~{coverage_dir}" + gsutil cp "~{prefix}.summary.txt" "~{summary_dir}" + fi + >>> output { @@ -185,6 +172,7 @@ task MosDepth { File quantized_dist = "~{prefix}.quantized.mosdepth.global.dist.txt" File quantized = "~{prefix}.quantized.quantized.bed.gz" File quantized_csi = "~{prefix}.quantized.quantized.bed.gz.csi" + File cov_summary = "~{prefix}.summary.txt" } ######################### @@ -195,56 +183,7 @@ task MosDepth { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.1" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task MosDepthOverBed { - input { - File bam - File bai - File bed - - RuntimeAttr? runtime_attr_override - } - - Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) - String basename = basename(bam, ".bam") - String bedname = basename(bed, ".bed") - String prefix = "~{basename}.coverage_over_bed.~{bedname}" - - command <<< - set -euxo pipefail - - mosdepth -t 4 -b ~{bed} -n -x -Q 1 ~{prefix} ~{bam} - >>> - - output { - File global_dist = "~{prefix}.mosdepth.global.dist.txt" - File region_dist = "~{prefix}.mosdepth.region.dist.txt" - File regions = "~{prefix}.regions.bed.gz" - File regions_csi = "~{prefix}.regions.bed.gz.csi" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 4, - mem_gb: 8, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "quay.io/biocontainers/mosdepth:0.2.4--he527e40_0" + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.2" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -258,287 +197,162 @@ task MosDepthOverBed { } } -task SummarizeDepth { - input { - File regions - - RuntimeAttr? runtime_attr_override - } - - Int disk_size = 2*ceil(size(regions, "GB")) - String chrName = sub(basename(regions, ".regions.bed.gz"), "out.coverage.", "") - - command <<< - set -euxo pipefail - - ((echo 'chr start stop cov_mean cov_sd cov_q1 cov_median cov_q3 cov_iqr') && \ - (zcat ~{regions} | datamash first 1 first 2 last 3 mean 4 sstdev 4 q1 4 median 4 q3 4 iqr 4)) | \ - column -t > ~{chrName}.summary.txt - >>> - - output { - File cov_summary = "~{chrName}.summary.txt" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 2, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task CoverageTrack { - input { - File bam - File bai - String chr - String start - String end - - RuntimeAttr? runtime_attr_override - } - - String basename = basename(bam, ".bam") - Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) - - command <<< - set -euxo pipefail - - samtools depth -a ~{bam} -r ~{chr}:~{start}-~{end} | bgzip > ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz - tabix -p bed ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz - >>> - - output { - File coverage = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz" - File coverage_tbi = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz.tbi" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task FlagStats { - input { - File bam - - RuntimeAttr? runtime_attr_override - } - - String basename = basename(bam, ".bam") - Int disk_size = 2*ceil(size(bam, "GB")) - - command <<< - set -euxo pipefail - - samtools flagstat ~{bam} > ~{basename}.flag_stats.txt - >>> - - output { - File flag_stats = "~{basename}.flag_stats.txt" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task ReadNamesAndLengths { +task ReadMetrics { input { File bam + String? gcs_output_dir RuntimeAttr? runtime_attr_override } + String? output_dir = if defined(gcs_output_dir) then sub(select_first([gcs_output_dir]), "/?$", "/yield_aligned/") else gcs_output_dir String basename = basename(bam, ".bam") Int disk_size = 2*ceil(size(bam, "GB")) command <<< set -euxo pipefail - samtools view ~{bam} | awk '{ print $1, length($10) }' | gzip -1 > ~{basename}.read_names_and_lengths.txt.gz - >>> - - output { - File read_names_and_lengths = "~{basename}.read_names_and_lengths.txt.gz" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task FilterMQ0Reads { - input { - File bam - - RuntimeAttr? runtime_attr_override - } - - Int disk_size = 2*ceil(size(bam, "GB")) - String prefix = basename(bam, ".bam") - - command <<< - set -euxo pipefail - - samtools view -q 1 -b ~{bam} > ~{prefix}.no_mq0.bam - samtools index ~{prefix}.no_mq0.bam - >>> - - output { - File no_mq0_bam = "~{prefix}.no_mq0.bam" - File no_mq0_bai = "~{prefix}.no_mq0.bam.bai" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 2, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task ComputeBedCoverage { - input { - File bam - File bai - File bed - String prefix - - RuntimeAttr? runtime_attr_override - } - - Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB") + size(bed, "GB")) - - command <<< - set -euxo pipefail - - bedtools coverage -b ~{bed} -a ~{bam} -nobuf | gzip > ~{prefix}.txt.gz - zcat ~{prefix}.txt.gz | awk '{ sum += sprintf("%f", $15*$16) } END { printf("%f\n", sum) }' > ~{prefix}.count.txt - >>> - - output { - File coverage = "~{prefix}.txt.gz" - Float counts = read_float("~{prefix}.count.txt") - File counts_file = "~{prefix}.count.txt" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 2, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task ReadMetrics { - input { - File bam - - RuntimeAttr? runtime_attr_override - } - - String basename = basename(bam, ".bam") - Int disk_size = 2*ceil(size(bam, "GB")) + cat << "EOF" > summarizeBAM.awk + BEGIN { FS=OFS="\t"; lastZMWid =-1; } + { + # retrieve the number of passes as the value of the "np" tag (default = 1) + nPasses=1; + for( fld=12;fld<=NF;++fld ) + if( substr($fld,1,5)=="np:i:" ) { + nPasses=substr($fld,6); + break; + } + + # get the read length and its bin for histograms + readLength=length($10); + + # default the zero-mode waveguide id as last value + 1 + zmwId=lastZMWid+1; + + #default the range info as 0:0 + rangeStart=0; + rangeStop=0; + + # parse the read name + if( split($1,namePieces,"/")==3 ) { + zmwId = namePieces[2]; + if( split(namePieces[3],range,"_")==2 ) { + rangeStart = range[0]; + rangeStop = range[1]; + } + } + + # calculate value to report for range gap (or -1 to indicate no report for this read) + reportedRangeGap=-1; + if( lastRangeStop!=0 && lastZMWid==zmwId ) { + reportedRangeGap=rangeStart-lastRangeStop; + } + + # calculate value to report for polymerase read length (or -1 to indicate no report for this read) + reportedPRL = -1; + if( lastZMWid!=-1 && lastZMWid!=zmwId ) { + reportedPRL=polymeraseRL; + } + + # track polymerase read length across reads with the same zmwId + if( zmwId!=lastZMWid ) polymeraseRL=0; + polymeraseRL+=readLength; + + # update values maintained across reads + lastZMWid=zmwId; + lastRangeStop=rangeStop; + + # print the parsed data + print readLength,nPasses,readLength*nPasses,zmwId,reportedRangeGap,reportedPRL; + } + EOF + + cat << "EOF" > calculateNX.awk + BEGIN { FS=OFS="\t" } + { + sum+=$1; + cent=int(100.*sum/tot); + if( cent!=lastCent ) + print cent,lastVal; + lastCent=cent; + lastVal=$1 + } + EOF + + # awkOut.tsv file is a tab delimited file with these columns: + # fld: 1 2 3 4 5 6 + # datum: readLength nPasses yield(len*passes) zmwId reportedRangeGap reportedPolymeraseReadLength + samtools view -F0x900 ~{bam} | awk -f summarizeBAM.awk > awkOut.tsv + + # extract the nPasses column for each read, sort numerically, count each group, add a header line + cut -f2 awkOut.tsv | sort -n | datamash -g1 count 1 | \ + awk 'BEGIN{print "np\tcount"}{print}' > "~{basename}.read_metrics.np_hist.txt" + + # extract the reportedRangeGap column, ignore -1 values, sort numerically, count each group, add header line + cut -f5 awkOut.tsv | sed '/^-1$/d' | sort -n | datamash -g1 count 1 | \ + awk 'BEGIN{print "range_gap\tcount"}{print}' > "~{basename}.read_metrics.range_gap_hist.txt" + + # extract the zmwId column, sort numerically, count each group, extract counts and sort numerically, + # count each group of counts, add header line + cut -f4 awkOut.tsv | sort -n | datamash -g1 count 1 | cut -f2 | sort -n | datamash -g1 count 1 | \ + awk 'BEGIN{print "zmw_count\tcount"}{print}' > "~{basename}.read_metrics.zmw_hist.txt" + + # extract yield and prl, ignore -1 prls, sort numerically, mash some summary stats, add header line + cut -f3,6 awkOut.tsv | sed '/ -1$/d' | sort -n | datamash count 2 sum 2 sum 1 min 2 max 2 mean 2 sstdev 2 | \ + awk 'BEGIN{print "num_polymerase_reads\tpolymerase_read_yield_bp\tpolymerase_read_yield_bp_times_passes\tpolymerase_read_min_length_bp\tpolymerase_read_max_length_bp\tpolymerase_read_mean_length_bp\tpolymerase_read_sd_length_bp"}{print}' > "~{basename}.read_metrics.prl_counts.txt" + + # extract prl rounded to 100s ignoring -1 values, sort numerically, count each group, add header line + awk 'BEGIN{FS=OFS="\t"}{if($6>=0)print int(($6+99)/100)*100}' awkOut.tsv | sort -n | datamash -g1 count 1 | \ + awk 'BEGIN{print "polymerase_read_length\tlength_bp"}{print}' > "~{basename}.read_metrics.prl_hist.txt" + + # calculate total of prl column, ignoring -1 values + # then extract the prl column, ignore -1 values, reverse sort numerically, print the centile divisions, + # forget about the 100th percentile, add header line + tot=$(cut -f6 < awkOut.tsv | sed '/^-1$/d' | datamash sum 1) + cut -f6 < awkOut.tsv | sed '/^-1$/d' | sort -rn | awk -v "tot=$tot" -f calculateNX.awk | sed '/^100 /d' | \ + awk 'BEGIN{print "NX\tvalue"}{print}' > "~{basename}.read_metrics.prl_nx.txt" + + # extract prl rounded to 100s and the original prl, sort numerically, sum prls for each bin, add header line + awk 'BEGIN{FS=OFS="\t"}{if($6>=0)print int(($6+99)/100)*100,$6}' awkOut.tsv | sort -n | datamash -g1 sum 2 | \ + awk 'BEGIN{print "polymerase_read_length\tyield_bp"}{print}' > "~{basename}.read_metrics.prl_yield_hist.txt" + + # extract readLength and yield columns, sort numerically, mash some summary stats, add header line + cut -f1,3 awkOut.tsv | sort -n | datamash count 1 sum 1 sum 2 min 1 max 1 mean 1 sstdev 1 | \ + awk 'BEGIN{print "num_reads\tread_yield_bp\tread_yield_bp_times_passes\tread_min_length_bp\tread_max_length_bp\tread_mean_length_bp\tread_sd_length_bp"}{print}' > "~{basename}.read_metrics.rl_counts.txt" + + # extract readLength rounded to 100s, sort numerically, count each group, add header line + awk 'BEGIN{FS=OFS="\t"}{print int(($1+99)/100)*100}' awkOut.tsv | sort -n | datamash -g1 count 1 | \ + awk 'BEGIN{print "read_length_bp\tcount"}{print}' > "~{basename}.read_metrics.rl_hist.txt" + + # calculate total of readLength column, + # then extract the readLength column, reverse sort numerically, print the centile divisions, + # forget about the 100th percentile, add header line + tot=$(cut -f1 < awkOut.tsv | datamash sum 1) + cut -f1 < awkOut.tsv | sort -rn | awk -v "tot=$tot" -f calculateNX.awk | sed '/^100 /d' | \ + awk 'BEGIN{print "NX\tvalue"}{print}' > "~{basename}.read_metrics.rl_nx.txt" + + # extract readLength rounded to 100s and the original readLength, sort numerically, sum each group, add header line + awk 'BEGIN{FS=OFS="\t"}{print int(($1+99)/100)*100,$1}' awkOut.tsv | sort -n | datamash -g1 sum 2 | \ + awk 'BEGIN{print "read_length\tyield_bp"}{print}' > "~{basename}.read_metrics.rl_yield_hist.txt" - command <<< - set -euxo pipefail + samtools flagstat ~{bam} > ~{basename}.flag_stats.txt - java -jar /usr/local/bin/gatk.jar ComputeLongReadMetrics -I ~{bam} -O ~{basename}.read_metrics -DF WellformedReadFilter + if ~{defined(output_dir)}; then + gsutil -m cp \ + "~{basename}.read_metrics.np_hist.txt" \ + "~{basename}.read_metrics.range_gap_hist.txt" \ + "~{basename}.read_metrics.zmw_hist.txt" \ + "~{basename}.read_metrics.prl_counts.txt" \ + "~{basename}.read_metrics.prl_hist.txt" \ + "~{basename}.read_metrics.prl_nx.txt" \ + "~{basename}.read_metrics.prl_yield_hist.txt" \ + "~{basename}.read_metrics.rl_counts.txt" \ + "~{basename}.read_metrics.rl_hist.txt" \ + "~{basename}.read_metrics.rl_nx.txt" \ + "~{basename}.read_metrics.rl_yield_hist.txt" \ + "~{basename}.flag_stats.txt" \ + "~{output_dir}" + fi >>> output { @@ -553,6 +367,7 @@ task ReadMetrics { File rl_hist = "~{basename}.read_metrics.rl_hist.txt" File rl_nx = "~{basename}.read_metrics.rl_nx.txt" File rl_yield_hist = "~{basename}.read_metrics.rl_yield_hist.txt" + File flag_stats = "~{basename}.flag_stats.txt" } ######################### @@ -563,50 +378,7 @@ task ReadMetrics { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task BamToBed { - input { - File bam - File bai - - RuntimeAttr? runtime_attr_override - } - - String bed = basename(bam, ".bam") + ".bed" - Int disk_size = 4*ceil(size(bam, "GB") + size(bai, "GB")) - - command <<< - set -euxo pipefail - - bedtools bamtobed -i ~{bam} > ~{bed} - >>> - - output { - File bedfile = bed - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 2, - mem_gb: 8, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + docker: "us.gcr.io/broad-dsp-lrma/lr-mini:0.1.1" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { diff --git a/wdl/tasks/QC/FPCheckAoU.wdl b/wdl/tasks/QC/FPCheckAoU.wdl index 3c6d63107..90699a186 100644 --- a/wdl/tasks/QC/FPCheckAoU.wdl +++ b/wdl/tasks/QC/FPCheckAoU.wdl @@ -1,5 +1,6 @@ version 1.0 +import "../../structs/Structs.wdl" import "../QC/Fingerprinting.wdl" as FPUtils import "../Utility/VariantUtils.wdl" diff --git a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl b/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl index 009eb747f..2e0e595ac 100644 --- a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl +++ b/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl @@ -1,8 +1,8 @@ version 1.0 +import "../../structs/Structs.wdl" import "../Utility/Utils.wdl" import "../Visualization/NanoPlot.wdl" as NP -import "../QC/AlignedMetrics.wdl" as AM workflow SampleLevelAlignedMetrics { @@ -30,7 +30,7 @@ workflow SampleLevelAlignedMetrics { call NP.NanoPlotFromBam { input: bam = aligned_bam, bai = aligned_bai } if (defined(bed_to_compute_coverage)) { - call AM.MosDepthOverBed { + call MosDepthOverBed { input: bam = aligned_bam, bai = aligned_bai, @@ -64,6 +64,55 @@ workflow SampleLevelAlignedMetrics { } } +task MosDepthOverBed { + input { + File bam + File bai + File bed + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) + String basename = basename(bam, ".bam") + String bedname = basename(bed, ".bed") + String prefix = "~{basename}.coverage_over_bed.~{bedname}" + + command <<< + set -euxo pipefail + + mosdepth -t 4 -b ~{bed} -n -x -Q 1 ~{prefix} ~{bam} + >>> + + output { + File global_dist = "~{prefix}.mosdepth.global.dist.txt" + File region_dist = "~{prefix}.mosdepth.region.dist.txt" + File regions = "~{prefix}.regions.bed.gz" + File regions_csi = "~{prefix}.regions.bed.gz.csi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "quay.io/biocontainers/mosdepth:0.2.4--he527e40_0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + task SummarizeDepthOverWholeBed { meta { diff --git a/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl b/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl index e2c99f82b..ae5a26d69 100644 --- a/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl +++ b/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../Utility/Utils.wdl" as Utils workflow CallAssemblyVariants { diff --git a/wdl/tasks/VariantCalling/GLNexus.wdl b/wdl/tasks/VariantCalling/GLNexus.wdl index 966e57ed1..714492df5 100644 --- a/wdl/tasks/VariantCalling/GLNexus.wdl +++ b/wdl/tasks/VariantCalling/GLNexus.wdl @@ -1,7 +1,6 @@ version 1.0 -import "../Utility/Utils.wdl" -import "../Utility/VariantUtils.wdl" +import "../../structs/Structs.wdl" workflow JointCall {