From 934f1cae0ea1c00a59177552c9c311b142b917f2 Mon Sep 17 00:00:00 2001 From: Ted Sharpe Date: Wed, 6 Sep 2023 10:25:18 -0400 Subject: [PATCH 1/8] a little cleanup --- wdl/deprecated/ONT10x.wdl | 2 +- wdl/deprecated/PBCCS10x.wdl | 1 - .../PBCCSDemultiplexWholeGenome.wdl | 1 - .../PBCLRDemultiplexWholeGenome.wdl | 1 - wdl/deprecated/tasks/C3POa.wdl | 1 - wdl/deprecated/tasks/GATKBestPractice.wdl | 1 + .../ONT/Epigenomics/ONTMethylation.wdl | 5 +- .../ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl | 1 - .../ONT/Preprocessing/ONTBasecall.wdl | 1 - .../ONT/VariantCalling/ONTWholeGenome.wdl | 1 - wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl | 7 +- wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl | 1 - .../PacBio/Utility/PBMASIsoSeqQuantify.wdl | 2 - .../AnnotateTranscriptomeWithGuide.wdl | 2 - .../TechAgnostic/Utility/ConvertToHailMT.wdl | 2 - .../TechAgnostic/Utility/DownloadFromSRA.wdl | 1 + .../TechAgnostic/Utility/DownloadFromWeb.wdl | 1 + wdl/structs/Structs.wdl | 5 - wdl/tasks/Assembly/Hifiasm.wdl | 2 - wdl/tasks/Preprocessing/Guppy.wdl | 2 +- wdl/tasks/Preprocessing/Longbow.wdl | 1 - wdl/tasks/QC/FPCheckAoU.wdl | 1 + wdl/tasks/QC/SampleLevelAlignedMetrics.wdl | 1 + wdl/tasks/Utility/MergeVCFs.wdl | 159 ++++++++++++++++++ .../VariantCalling/CallAssemblyVariants.wdl | 1 - wdl/tasks/VariantCalling/GLNexus.wdl | 3 +- 26 files changed, 174 insertions(+), 32 deletions(-) create mode 100644 wdl/tasks/Utility/MergeVCFs.wdl diff --git a/wdl/deprecated/ONT10x.wdl b/wdl/deprecated/ONT10x.wdl index 8ce3068ea..f9a260b5d 100644 --- a/wdl/deprecated/ONT10x.wdl +++ b/wdl/deprecated/ONT10x.wdl @@ -1,10 +1,10 @@ version 1.0 +import "../structs/Structs.wdl" import "../tasks/Utility/Utils.wdl" as Utils import "../tasks/Utility/ONTUtils.wdl" as ONT import "tasks/C3POa.wdl" as C3 import "../tasks/Alignment/AlignReads.wdl" as AR -import "../tasks/QC/AlignedMetrics.wdl" as AM import "../tasks/Utility/Finalize.wdl" as FF workflow ONT10x { diff --git a/wdl/deprecated/PBCCS10x.wdl b/wdl/deprecated/PBCCS10x.wdl index 7b54cf212..d1658c393 100644 --- a/wdl/deprecated/PBCCS10x.wdl +++ b/wdl/deprecated/PBCCS10x.wdl @@ -2,7 +2,6 @@ version 1.0 import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils -import "../tasks/Alignment/AlignReads.wdl" as AR import "../tasks/QC/AlignedMetrics.wdl" as AM import "tasks/AnnotateAdapters.wdl" as AA import "../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl b/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl index 05a41636e..edf05806d 100644 --- a/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl +++ b/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl @@ -10,7 +10,6 @@ version 1.0 import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils -import "../tasks/Alignment/AlignReads.wdl" as AR import "../tasks/QC/AlignedMetrics.wdl" as AM import "../tasks/VariantCalling/CallVariantsPBCCS.wdl" as VAR import "../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl b/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl index 764a2ac55..429628a0d 100644 --- a/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl +++ b/wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl @@ -8,7 +8,6 @@ version 1.0 import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils -import "../tasks/Alignment/AlignReads.wdl" as AR import "../tasks/QC/AlignedMetrics.wdl" as AM import "../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/deprecated/tasks/C3POa.wdl 
b/wdl/deprecated/tasks/C3POa.wdl index 1d82029a5..467c80d37 100644 --- a/wdl/deprecated/tasks/C3POa.wdl +++ b/wdl/deprecated/tasks/C3POa.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../../tasks/Utility/Utils.wdl" as Utils workflow C3POa { input { diff --git a/wdl/deprecated/tasks/GATKBestPractice.wdl b/wdl/deprecated/tasks/GATKBestPractice.wdl index 412ef5fdc..e38d24633 100644 --- a/wdl/deprecated/tasks/GATKBestPractice.wdl +++ b/wdl/deprecated/tasks/GATKBestPractice.wdl @@ -8,6 +8,7 @@ version 1.0 # "https://github.com/PacificBiosciences/hg002-ccs/" ########################################################################################## +import "../../structs/Structs.wdl" import "dsde_pipelines_tasks/GermlineVariantDiscovery.wdl" as Calling import "dsde_pipelines_tasks/Qc.wdl" as QC import "dsde_pipelines_tasks/Utilities.wdl" as DSDEPipelinesUtils diff --git a/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl b/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl index c7cff9978..d7b4237bc 100644 --- a/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl +++ b/wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl @@ -1,9 +1,8 @@ version 1.0 +import "../../../structs/Structs.wdl" import "../../../tasks/Utility/Utils.wdl" as Utils -import "../../../tasks/Utility/ONTUtils.wdl" as ONTUtils import "../../../tasks/Utility/VariantUtils.wdl" -import "../../../tasks/Preprocessing/Guppy.wdl" as Guppy import "../../../tasks/Utility/Finalize.wdl" as FF workflow ONTMethylation { @@ -733,4 +732,4 @@ task CallHaploidVariants { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) } -} \ No newline at end of file +} diff --git a/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl b/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl index ed387effa..4f13fc945 100644 --- a/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl +++ b/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../../structs/Structs.wdl" -import "../../../tasks/Utility/Finalize.wdl" as FF workflow ONTPfHrp2Hrp3Status { diff --git a/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl b/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl index 8173fcf4e..8d09fe9fb 100644 --- a/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl +++ b/wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../../tasks/Preprocessing/Guppy.wdl" as Guppy -import "../../../tasks/Utility/Finalize.wdl" as FF workflow ONTBasecall { diff --git a/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl b/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl index 7e0470bfb..be29a30b9 100644 --- a/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl +++ b/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl @@ -1,6 +1,5 @@ version 1.0 -import "../../../tasks/Utility/ONTUtils.wdl" as ONT import "../../../tasks/Utility/Utils.wdl" as Utils import "../../../tasks/VariantCalling/CallVariantsONT.wdl" as VAR import "../../../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl b/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl index 23da6a63f..fc73ed400 100644 --- a/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl +++ b/wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl @@ -7,10 +7,13 @@ import "../../../tasks/Preprocessing/Longbow.wdl" as Longbow import "../../../tasks/QC/AlignedMetrics.wdl" as AM import "../../../tasks/Visualization/NanoPlot.wdl" as NP import 
"../../../tasks/Utility/Finalize.wdl" as FF - import "../../../tasks/Transcriptomics/MASSeq.wdl" as MAS -import "../../../tasks/Utility/JupyterNotebooks.wdl" as JUPYTER + +struct DataTypeParameters { + Int num_shards + String map_preset +} workflow PBFlowcell { diff --git a/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl b/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl index 84273ce53..c4a320245 100644 --- a/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl +++ b/wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl @@ -2,7 +2,6 @@ version 1.0 import "../../../tasks/Utility/PBUtils.wdl" as PB import "../../../tasks/Utility/Utils.wdl" as Utils -import "../../../tasks/Alignment/AlignReads.wdl" as AR import "../../../tasks/Utility/Finalize.wdl" as FF workflow PBCCSIsoSeq { diff --git a/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl b/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl index c4050b63a..d29e94f83 100644 --- a/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl +++ b/wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl @@ -2,10 +2,8 @@ version 1.0 import "../../../tasks/Utility/Utils.wdl" as Utils import "../../../tasks/Utility/PBUtils.wdl" as PB -import "../../../tasks/Alignment/AlignReads.wdl" as AR import "../../../tasks/Utility/StringTie2.wdl" as ST2 import "../../../tasks/Transcriptomics/MASSeq.wdl" as MAS -import "../../../tasks/Transcriptomics/UMI_Tools.wdl" as UMI_TOOLS import "../../../tasks/Transcriptomics/Preprocessing_Tasks.wdl" as TX_PRE import "../../../tasks/Transcriptomics/Postprocessing_Tasks.wdl" as TX_POST import "../../../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl b/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl index 14d54fa49..d6306030c 100644 --- a/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl +++ b/wdl/pipelines/TechAgnostic/Annotation/AnnotateTranscriptomeWithGuide.wdl @@ -1,7 +1,5 @@ version 1.0 -import "../../../tasks/Utility/PBUtils.wdl" as PB -import "../../../tasks/Utility/Utils.wdl" as Utils import "../../../tasks/Utility/StringTie2.wdl" import "../../../tasks/Utility/Finalize.wdl" as FF diff --git a/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl b/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl index 310782a83..a0071cab7 100644 --- a/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/ConvertToHailMT.wdl @@ -1,8 +1,6 @@ version 1.0 -import "../../../tasks/VariantCalling/GLNexus.wdl" as GLNexus import "../../../tasks/Utility/Hail.wdl" as Hail -import "../../../tasks/Utility/Finalize.wdl" as FF workflow ConvertToHailMT { diff --git a/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl b/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl index fb076fdd1..f411ef583 100644 --- a/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl @@ -1,5 +1,6 @@ version 1.0 +import "../../../structs/Structs.wdl" import "../../../tasks/Utility/Utils.wdl" as Utils workflow DownloadFromSRA { diff --git a/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl b/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl index 7d021e562..224b26ac1 100644 --- a/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl @@ -6,6 +6,7 @@ version 1.0 # reimagining of the Nextflow/AWS downloading pipeline from @alaincoletta (see: http://broad.io/aws_dl). 
##########################################################################################

+import "../../../structs/Structs.wdl"
 import "../../../tasks/Utility/Utils.wdl" as Utils

 workflow DownloadFromWeb {
diff --git a/wdl/structs/Structs.wdl b/wdl/structs/Structs.wdl
index eef96b9ce..4bed884f5 100644
--- a/wdl/structs/Structs.wdl
+++ b/wdl/structs/Structs.wdl
@@ -9,8 +9,3 @@ struct RuntimeAttr {
     Int? max_retries
     String? docker
 }
-
-struct DataTypeParameters {
-    Int num_shards
-    String map_preset
-}
diff --git a/wdl/tasks/Assembly/Hifiasm.wdl b/wdl/tasks/Assembly/Hifiasm.wdl
index 878daa73b..3f0e8bcd5 100644
--- a/wdl/tasks/Assembly/Hifiasm.wdl
+++ b/wdl/tasks/Assembly/Hifiasm.wdl
@@ -2,8 +2,6 @@ version 1.0

 import "../../structs/Structs.wdl"

-import "../Utility/Utils.wdl"
-
 workflow Hifiasm {

     meta {
diff --git a/wdl/tasks/Preprocessing/Guppy.wdl b/wdl/tasks/Preprocessing/Guppy.wdl
index 4ce7f51e5..7a3b57c03 100644
--- a/wdl/tasks/Preprocessing/Guppy.wdl
+++ b/wdl/tasks/Preprocessing/Guppy.wdl
@@ -9,9 +9,9 @@ version 1.0
 # linearly but untested.
 ##########################################################################################

+import "../../structs/Structs.wdl"
 import "../Utility/Utils.wdl" as Utils
 import "../Utility/ONTUtils.wdl" as ONT
-import "../../structs/Structs.wdl"

 workflow Guppy {
diff --git a/wdl/tasks/Preprocessing/Longbow.wdl b/wdl/tasks/Preprocessing/Longbow.wdl
index d98561d32..f6f62283c 100644
--- a/wdl/tasks/Preprocessing/Longbow.wdl
+++ b/wdl/tasks/Preprocessing/Longbow.wdl
@@ -1,7 +1,6 @@ version 1.0

 import "../../structs/Structs.wdl"
-import "../Utility/Utils.wdl"
 import "../Transcriptomics/MASSeq.wdl" as MAS

 struct LongbowModelParams {
diff --git a/wdl/tasks/QC/FPCheckAoU.wdl b/wdl/tasks/QC/FPCheckAoU.wdl
index 3c6d63107..90699a186 100644
--- a/wdl/tasks/QC/FPCheckAoU.wdl
+++ b/wdl/tasks/QC/FPCheckAoU.wdl
@@ -1,5 +1,6 @@ version 1.0

+import "../../structs/Structs.wdl"
 import "../QC/Fingerprinting.wdl" as FPUtils
 import "../Utility/VariantUtils.wdl"

diff --git a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl b/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl
index 009eb747f..e1c2b7d15 100644
--- a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl
+++ b/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl
@@ -1,5 +1,6 @@ version 1.0

+import "../../structs/Structs.wdl"
 import "../Utility/Utils.wdl"
 import "../Visualization/NanoPlot.wdl" as NP
 import "../QC/AlignedMetrics.wdl" as AM
diff --git a/wdl/tasks/Utility/MergeVCFs.wdl b/wdl/tasks/Utility/MergeVCFs.wdl
new file mode 100644
index 000000000..053a1d6df
--- /dev/null
+++ b/wdl/tasks/Utility/MergeVCFs.wdl
@@ -0,0 +1,159 @@
+version 1.0
+
+import "../../structs/Structs.wdl"
+
+workflow MergeVCFs {
+
+    meta {
+        description: "Merges VCFs 'horizontally' (e.g., make a single VCF from many single-sample VCFs)"
+    }
+
+    input {
+        Array[File] inputArray
+        Int chunkSize = 0
+        RuntimeAttr? runtime_attr_override_chunks
+        RuntimeAttr? runtime_attr_override_merge
+        RuntimeAttr? runtime_attr_override_final_merge
+    }
+
+    parameter_meta {
+        inputArray: "The array of VCFs to merge"
+        chunkSize: "Optional number of VCFs to merge in a single task"
+        runtime_attr_override_chunks: "Override the default runtime attributes for the chunking task"
+        runtime_attr_override_merge: "Override the default runtime attributes for the initial merging tasks"
+        runtime_attr_override_final_merge: "Override the default runtime attributes for the final merging task"
+    }
+
+
+    call CreateChunks {
+        input:
+            inputArray = inputArray,
+            chunkSize = chunkSize,
+            runtime_attr_override = runtime_attr_override_chunks
+    }
+
+    scatter ( inputVCFs in CreateChunks.outputArray ) {
+        call MergeVCFsTask {
+            input:
+                inputVCFs = inputVCFs,
+                runtime_attr_override = runtime_attr_override_merge
+        }
+    }
+
+    call MergeVCFsTask as FinalMerge {
+        input:
+            inputVCFs = MergeVCFsTask.outputVCF,
+            runtime_attr_override = runtime_attr_override_final_merge
+    }
+
+    output {
+        File outputVCF = FinalMerge.outputVCF
+    }
+}
+
+
+task CreateChunks {
+
+    meta {
+        description: "Breaks an array into chunks"
+    }
+
+    input {
+        Array[String] inputArray
+        Int chunkSize = 0
+        RuntimeAttr? runtime_attr_override
+    }
+
+    parameter_meta {
+        inputArray: "The array to chunk"
+        chunkSize: "Optional chunk size override"
+        runtime_attr_override: "Override the default runtime attributes"
+    }
+
+    command <<<
+        set -euo pipefail
+        mkdir chunks
+        cd chunks
+        # default the chunk size to sqrt(N), and avoid a trailing chunk of just 1 VCF
+        lines=$(echo "~{length(inputArray)}" | \
+                awk 'BEGIN{lines=~{chunkSize}}
+                     {if ( !lines ) lines=int(sqrt($1)+.99);
+                      while ( $1%lines==1 ) lines += 1;
+                      print lines}')
+        split -a4 -l"$lines" "~{write_lines(inputArray)}"
+        for fil in x*; do awk 'BEGIN{sep=""}{printf "%s%s",sep,$0;sep="\t"}END{printf "\n"}' "$fil" >> ../inputs.tsv; done
+    >>>
+
+    output {
+        Array[Array[String]] outputArray = read_tsv("inputs.tsv")
+    }
+
+    #########################
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 1,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1,
+        docker: "ubuntu:latest"
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: select_first([runtime_attr.docker, default_attr.docker])
+    }
+}
+
+task MergeVCFsTask {
+
+    meta {
+        description: "Merges VCFs"
+    }
+
+    input {
+        Array[File] inputVCFs
+        RuntimeAttr?
runtime_attr_override + } + + parameter_meta { + inputVCFs: {help: "The vcfs to merge -- the vcfs may be given as gs: paths", + localization_optional: true} + runtime_attr_override: "Override the default runtime attributes" + } + + command <<< + set -euo pipefail + export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + bcftools merge --no-index -0 --threads 3 -Oz "~{sep='" "' inputVCFs}" > merged.vcf.gz + >>> + + output { + File outputVCF = "merged.vcf.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 4, + disk_gb: ceil(size(inputVCFs, "GB")) + 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2023-02-01-v0.26.8-beta-9b25c72d" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl b/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl index e2c99f82b..ae5a26d69 100644 --- a/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl +++ b/wdl/tasks/VariantCalling/CallAssemblyVariants.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../Utility/Utils.wdl" as Utils workflow CallAssemblyVariants { diff --git a/wdl/tasks/VariantCalling/GLNexus.wdl b/wdl/tasks/VariantCalling/GLNexus.wdl index 966e57ed1..714492df5 100644 --- a/wdl/tasks/VariantCalling/GLNexus.wdl +++ b/wdl/tasks/VariantCalling/GLNexus.wdl @@ -1,7 +1,6 @@ version 1.0 -import "../Utility/Utils.wdl" -import "../Utility/VariantUtils.wdl" +import "../../structs/Structs.wdl" workflow JointCall { From b6224393232d21829011f74c7e1da1eefe3af213 Mon Sep 17 00:00:00 2001 From: Ted Sharpe Date: Fri, 15 Sep 2023 13:42:19 -0400 Subject: [PATCH 2/8] improved AlignedMetrics --- docker/lr-metrics/Dockerfile | 6 +- docker/lr-metrics/environment.yml | 1 + docker/lr-mosdepth/Dockerfile | 2 +- test/test_data/AlignedMetrics.json | 7 + wdl/deprecated/tasks/Scrapbook.wdl | 183 +++++++ .../ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl | 2 +- wdl/tasks/QC/AlignedMetrics.wdl | 493 +++--------------- wdl/tasks/QC/SampleLevelAlignedMetrics.wdl | 52 +- 8 files changed, 319 insertions(+), 427 deletions(-) create mode 100644 test/test_data/AlignedMetrics.json create mode 100644 wdl/deprecated/tasks/Scrapbook.wdl diff --git a/docker/lr-metrics/Dockerfile b/docker/lr-metrics/Dockerfile index 6048d83f4..557b4a37f 100644 --- a/docker/lr-metrics/Dockerfile +++ b/docker/lr-metrics/Dockerfile @@ -16,9 +16,10 @@ ENV PATH=/miniconda/bin/:/miniconda/envs/lr-metrics/bin/:/root/google-cloud-sdk/ # install conda packages COPY ./environment.yml / +RUN conda install -n base conda-libmamba-solver && conda config --set solver libmamba RUN conda env create -f /environment.yml && conda clean -a -# install gatk +# install super-special version of gatk RUN git clone https://github.com/broadinstitute/gatk.git -b kvg_pbeap \ && cd gatk \ && git checkout 
c9497220ef13beb05da7c7a820c181be00b9b817 \ @@ -30,6 +31,9 @@ RUN git clone https://github.com/broadinstitute/gatk.git -b kvg_pbeap \ # install picard RUN wget -O /usr/local/bin/picard.jar https://github.com/broadinstitute/picard/releases/download/2.22.1/picard.jar +# install gsutil +RUN curl -sSL https://sdk.cloud.google.com | bash + # install various metric and visualization scripts COPY lima_report_detail.R / COPY lima_report_summary.R / diff --git a/docker/lr-metrics/environment.yml b/docker/lr-metrics/environment.yml index 1414228d8..fdc08c739 100644 --- a/docker/lr-metrics/environment.yml +++ b/docker/lr-metrics/environment.yml @@ -5,6 +5,7 @@ channels: - bioconda - r dependencies: + - conda-forge::ncurses - samtools - bedtools - java-jdk diff --git a/docker/lr-mosdepth/Dockerfile b/docker/lr-mosdepth/Dockerfile index 9a5acff9d..77232063e 100644 --- a/docker/lr-mosdepth/Dockerfile +++ b/docker/lr-mosdepth/Dockerfile @@ -10,7 +10,7 @@ RUN conda env create -f /environment.yml && conda clean -a ENV PATH=/opt/conda/envs/lr-mosdepth/bin/:/root/google-cloud-sdk/bin/:${PATH} RUN apt-get -y update && \ - apt-get -y install curl zlib1g-dev libcurl4-openssl-dev libbz2-dev liblzma-dev && \ + apt-get -y install curl zlib1g-dev libcurl4-openssl-dev libbz2-dev liblzma-dev datamash bsdmainutils && \ apt-get clean # install gsutil diff --git a/test/test_data/AlignedMetrics.json b/test/test_data/AlignedMetrics.json new file mode 100644 index 000000000..fd39c47a6 --- /dev/null +++ b/test/test_data/AlignedMetrics.json @@ -0,0 +1,7 @@ +{ + "AlignedMetrics.aligned_bam": "gs://broad-dsp-lrma-cromwell/test_data/aligned-metrics/NA24385.bam", + "AlignedMetrics.aligned_bai": "gs://broad-dsp-lrma-cromwell/test_data/aligned-metrics/NA24385.bam.bai", + "AlignedMetrics.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "AlignedMetrics.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "AlignedMetrics.gcs_output_dir": "gs://broad-dsp-lrma-ci/test-outputs" +} diff --git a/wdl/deprecated/tasks/Scrapbook.wdl b/wdl/deprecated/tasks/Scrapbook.wdl new file mode 100644 index 000000000..9c21161c6 --- /dev/null +++ b/wdl/deprecated/tasks/Scrapbook.wdl @@ -0,0 +1,183 @@ +version 1.0 + +task CoverageTrack { + input { + File bam + File bai + String chr + String start + String end + + RuntimeAttr? 
runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) + + command <<< + set -euxo pipefail + + samtools depth -a ~{bam} -r ~{chr}:~{start}-~{end} | bgzip > ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz + tabix -p bed ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz + >>> + + output { + File coverage = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz" + File coverage_tbi = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterMQ0Reads { + input { + File bam + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB")) + String prefix = basename(bam, ".bam") + + command <<< + set -euxo pipefail + + samtools view -q 1 -b ~{bam} > ~{prefix}.no_mq0.bam + samtools index ~{prefix}.no_mq0.bam + >>> + + output { + File no_mq0_bam = "~{prefix}.no_mq0.bam" + File no_mq0_bai = "~{prefix}.no_mq0.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ComputeBedCoverage { + input { + File bam + File bai + File bed + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB") + size(bed, "GB")) + + command <<< + set -euxo pipefail + + bedtools coverage -b ~{bed} -a ~{bam} -nobuf | gzip > ~{prefix}.txt.gz + zcat ~{prefix}.txt.gz | awk '{ sum += sprintf("%f", $15*$16) } END { printf("%f\n", sum) }' > ~{prefix}.count.txt + >>> + + output { + File coverage = "~{prefix}.txt.gz" + Float counts = read_float("~{prefix}.count.txt") + File counts_file = "~{prefix}.count.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task BamToBed { + input { + File bam + File bai + + RuntimeAttr? runtime_attr_override + } + + String bed = basename(bam, ".bam") + ".bed" + Int disk_size = 4*ceil(size(bam, "GB") + size(bai, "GB")) + + command <<< + set -euxo pipefail + + bedtools bamtobed -i ~{bam} > ~{bed} + >>> + + output { + File bedfile = bed + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl b/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl index 4f13fc945..d4d49ac48 100644 --- a/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl +++ b/wdl/pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl @@ -85,7 +85,7 @@ task IsLocusDeleted { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.1" + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.2" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { diff --git a/wdl/tasks/QC/AlignedMetrics.wdl b/wdl/tasks/QC/AlignedMetrics.wdl index 7ad2ec933..5c75d4e93 100644 --- a/wdl/tasks/QC/AlignedMetrics.wdl +++ b/wdl/tasks/QC/AlignedMetrics.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../structs/Structs.wdl" -import "../Utility/Finalize.wdl" as FF workflow AlignedMetrics { @@ -11,7 +10,7 @@ workflow 
AlignedMetrics { parameter_meta { aligned_bam: "Aligned BAM file" aligned_bai: "Index for aligned BAM file" - ref_fasta: "Reference FASTA file" + ref_fasta: "Not used" ref_dict: "Reference dictionary file" gcs_output_dir: "GCS output directory" } @@ -20,13 +19,17 @@ workflow AlignedMetrics { File aligned_bam File aligned_bai - File ref_fasta + File? ref_fasta File ref_dict String? gcs_output_dir } - call ReadMetrics as AlignedReadMetrics { input: bam = aligned_bam } + call ReadMetrics { + input: + bam = aligned_bam, + gcs_output_dir = gcs_output_dir + } call MakeChrIntervalList { input: ref_dict = ref_dict } @@ -35,51 +38,12 @@ workflow AlignedMetrics { input: bam = aligned_bam, bai = aligned_bai, - chr = chr_info[0] - } - - call SummarizeDepth { input: regions = MosDepth.regions } - } - - call FlagStats as AlignedFlagStats { input: bam = aligned_bam } - - if (defined(gcs_output_dir)) { - String outdir = sub(gcs_output_dir + "", "/$", "") - - call FF.FinalizeToDir as FFYieldAligned { - input: - outdir = outdir + "/yield_aligned/", - files = [ - AlignedFlagStats.flag_stats, - AlignedReadMetrics.np_hist, - AlignedReadMetrics.range_gap_hist, - AlignedReadMetrics.zmw_hist, - AlignedReadMetrics.prl_counts, - AlignedReadMetrics.prl_hist, - AlignedReadMetrics.prl_nx, - AlignedReadMetrics.prl_yield_hist, - AlignedReadMetrics.rl_counts, - AlignedReadMetrics.rl_hist, - AlignedReadMetrics.rl_nx, - AlignedReadMetrics.rl_yield_hist - ] + chr = chr_info[0], + gcs_output_dir = gcs_output_dir } - - call FF.FinalizeToDir as FFCoverageFullDist { input: outdir = outdir + "/coverage/", files = MosDepth.full_dist } - call FF.FinalizeToDir as FFCoverageGlobalDist { input: outdir = outdir + "/coverage/", files = MosDepth.global_dist } - call FF.FinalizeToDir as FFCoverageRegionDist { input: outdir = outdir + "/coverage/", files = MosDepth.region_dist } - call FF.FinalizeToDir as FFCoverageRegions { input: outdir = outdir + "/coverage/", files = MosDepth.regions } - call FF.FinalizeToDir as FFCoverageRegionsCsi { input: outdir = outdir + "/coverage/", files = MosDepth.regions_csi } - call FF.FinalizeToDir as FFCoverageQuantizedDist { input: outdir = outdir + "/coverage/", files = MosDepth.quantized_dist } - call FF.FinalizeToDir as FFCoverageQuantized { input: outdir = outdir + "/coverage/", files = MosDepth.quantized } - call FF.FinalizeToDir as FFCoverageQuantizedCsi { input: outdir = outdir + "/coverage/", files = MosDepth.quantized_csi } - - call FF.FinalizeToDir as FFDepthSummaries { input: outdir = outdir + "/coverage_summaries/", files = SummarizeDepth.cov_summary } } output { - File aligned_flag_stats = AlignedFlagStats.flag_stats - Array[File] coverage_full_dist = MosDepth.full_dist Array[File] coverage_global_dist = MosDepth.global_dist Array[File] coverage_region_dist = MosDepth.region_dist @@ -88,18 +52,20 @@ workflow AlignedMetrics { Array[File] coverage_quantized_dist = MosDepth.quantized_dist Array[File] coverage_quantized = MosDepth.quantized Array[File] coverage_quantized_csi = MosDepth.quantized_csi - - File aligned_np_hist = AlignedReadMetrics.np_hist - File aligned_range_gap_hist = AlignedReadMetrics.range_gap_hist - File aligned_zmw_hist = AlignedReadMetrics.zmw_hist - File aligned_prl_counts = AlignedReadMetrics.prl_counts - File aligned_prl_hist = AlignedReadMetrics.prl_hist - File aligned_prl_nx = AlignedReadMetrics.prl_nx - File aligned_prl_yield_hist = AlignedReadMetrics.prl_yield_hist - File aligned_rl_counts = AlignedReadMetrics.rl_counts - File aligned_rl_hist = 
AlignedReadMetrics.rl_hist - File aligned_rl_nx = AlignedReadMetrics.rl_nx - File aligned_rl_yield_hist = AlignedReadMetrics.rl_yield_hist + Array[File] coverage_summary = MosDepth.cov_summary + + File aligned_np_hist = ReadMetrics.np_hist + File aligned_range_gap_hist = ReadMetrics.range_gap_hist + File aligned_zmw_hist = ReadMetrics.zmw_hist + File aligned_prl_counts = ReadMetrics.prl_counts + File aligned_prl_hist = ReadMetrics.prl_hist + File aligned_prl_nx = ReadMetrics.prl_nx + File aligned_prl_yield_hist = ReadMetrics.prl_yield_hist + File aligned_rl_counts = ReadMetrics.rl_counts + File aligned_rl_hist = ReadMetrics.rl_hist + File aligned_rl_nx = ReadMetrics.rl_nx + File aligned_rl_yield_hist = ReadMetrics.rl_yield_hist + File aligned_flag_stats = ReadMetrics.flag_stats File raw_chr_intervals = MakeChrIntervalList.raw_chrs } @@ -133,7 +99,7 @@ task MakeChrIntervalList { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + docker: "ubuntu:21.04" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -152,13 +118,15 @@ task MosDepth { File bam File bai String chr - Int? window_size + Int window_size = 500 + String? gcs_output_dir RuntimeAttr? runtime_attr_override } + String? coverage_dir = if defined(gcs_output_dir) then sub(select_first([gcs_output_dir]), "/?$", "/coverage/") else gcs_output_dir + String? summary_dir = if defined(gcs_output_dir) then sub(select_first([gcs_output_dir]), "/?$", "/coverage_summaries/") else gcs_output_dir Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) - Int ws = select_first([window_size, 500]) String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage.~{chr}" @@ -166,7 +134,7 @@ task MosDepth { set -euxo pipefail mosdepth -t 4 -c "~{chr}" -n -x -Q 1 ~{prefix}.full ~{bam} - mosdepth -t 4 -c "~{chr}" -n -x -Q 1 -b ~{ws} ~{prefix} ~{bam} + mosdepth -t 4 -c "~{chr}" -n -x -Q 1 -b ~{window_size} ~{prefix} ~{bam} export MOSDEPTH_Q0=NO_COVERAGE # 0 -- defined by the arguments to --quantize export MOSDEPTH_Q1=LOW_COVERAGE # 1..4 @@ -174,6 +142,25 @@ task MosDepth { export MOSDEPTH_Q3=HIGH_COVERAGE # 150 ... 
mosdepth -t 4 -c "~{chr}" -n -x -Q 1 --quantize 0:1:5:150: ~{prefix}.quantized ~{bam} + + ( echo 'chr start stop cov_mean cov_sd cov_q1 cov_median cov_q3 cov_iqr' && \ + zcat ~{prefix}.regions.bed.gz | datamash first 1 first 2 last 3 mean 4 sstdev 4 q1 4 median 4 q3 4 iqr 4 ) | \ + column -t > ~{prefix}.summary.txt + + if ~{defined(coverage_dir)}; then + gsutil -m cp \ + "~{prefix}.full.mosdepth.global.dist.txt" \ + "~{prefix}.mosdepth.global.dist.txt" \ + "~{prefix}.mosdepth.region.dist.txt" \ + "~{prefix}.regions.bed.gz" \ + "~{prefix}.regions.bed.gz.csi" \ + "~{prefix}.quantized.mosdepth.global.dist.txt" \ + "~{prefix}.quantized.quantized.bed.gz" \ + "~{prefix}.quantized.quantized.bed.gz.csi" \ + "~{coverage_dir}" + gsutil cp "~{prefix}.summary.txt" "~{summary_dir}" + fi + >>> output { @@ -185,6 +172,7 @@ task MosDepth { File quantized_dist = "~{prefix}.quantized.mosdepth.global.dist.txt" File quantized = "~{prefix}.quantized.quantized.bed.gz" File quantized_csi = "~{prefix}.quantized.quantized.bed.gz.csi" + File cov_summary = "~{prefix}.summary.txt" } ######################### @@ -195,323 +183,7 @@ task MosDepth { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.1" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task MosDepthOverBed { - input { - File bam - File bai - File bed - - RuntimeAttr? runtime_attr_override - } - - Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) - String basename = basename(bam, ".bam") - String bedname = basename(bed, ".bed") - String prefix = "~{basename}.coverage_over_bed.~{bedname}" - - command <<< - set -euxo pipefail - - mosdepth -t 4 -b ~{bed} -n -x -Q 1 ~{prefix} ~{bam} - >>> - - output { - File global_dist = "~{prefix}.mosdepth.global.dist.txt" - File region_dist = "~{prefix}.mosdepth.region.dist.txt" - File regions = "~{prefix}.regions.bed.gz" - File regions_csi = "~{prefix}.regions.bed.gz.csi" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 4, - mem_gb: 8, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "quay.io/biocontainers/mosdepth:0.2.4--he527e40_0" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task SummarizeDepth { - input { - File regions - - RuntimeAttr? 
runtime_attr_override - } - - Int disk_size = 2*ceil(size(regions, "GB")) - String chrName = sub(basename(regions, ".regions.bed.gz"), "out.coverage.", "") - - command <<< - set -euxo pipefail - - ((echo 'chr start stop cov_mean cov_sd cov_q1 cov_median cov_q3 cov_iqr') && \ - (zcat ~{regions} | datamash first 1 first 2 last 3 mean 4 sstdev 4 q1 4 median 4 q3 4 iqr 4)) | \ - column -t > ~{chrName}.summary.txt - >>> - - output { - File cov_summary = "~{chrName}.summary.txt" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 2, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task CoverageTrack { - input { - File bam - File bai - String chr - String start - String end - - RuntimeAttr? runtime_attr_override - } - - String basename = basename(bam, ".bam") - Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) - - command <<< - set -euxo pipefail - - samtools depth -a ~{bam} -r ~{chr}:~{start}-~{end} | bgzip > ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz - tabix -p bed ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz - >>> - - output { - File coverage = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz" - File coverage_tbi = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz.tbi" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task FlagStats { - input { - File bam - - RuntimeAttr? 
runtime_attr_override - } - - String basename = basename(bam, ".bam") - Int disk_size = 2*ceil(size(bam, "GB")) - - command <<< - set -euxo pipefail - - samtools flagstat ~{bam} > ~{basename}.flag_stats.txt - >>> - - output { - File flag_stats = "~{basename}.flag_stats.txt" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task ReadNamesAndLengths { - input { - File bam - - RuntimeAttr? runtime_attr_override - } - - String basename = basename(bam, ".bam") - Int disk_size = 2*ceil(size(bam, "GB")) - - command <<< - set -euxo pipefail - - samtools view ~{bam} | awk '{ print $1, length($10) }' | gzip -1 > ~{basename}.read_names_and_lengths.txt.gz - >>> - - output { - File read_names_and_lengths = "~{basename}.read_names_and_lengths.txt.gz" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task FilterMQ0Reads { - input { - File bam - - RuntimeAttr? 
runtime_attr_override - } - - Int disk_size = 2*ceil(size(bam, "GB")) - String prefix = basename(bam, ".bam") - - command <<< - set -euxo pipefail - - samtools view -q 1 -b ~{bam} > ~{prefix}.no_mq0.bam - samtools index ~{prefix}.no_mq0.bam - >>> - - output { - File no_mq0_bam = "~{prefix}.no_mq0.bam" - File no_mq0_bai = "~{prefix}.no_mq0.bam.bai" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 2, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task ComputeBedCoverage { - input { - File bam - File bai - File bed - String prefix - - RuntimeAttr? runtime_attr_override - } - - Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB") + size(bed, "GB")) - - command <<< - set -euxo pipefail - - bedtools coverage -b ~{bed} -a ~{bam} -nobuf | gzip > ~{prefix}.txt.gz - zcat ~{prefix}.txt.gz | awk '{ sum += sprintf("%f", $15*$16) } END { printf("%f\n", sum) }' > ~{prefix}.count.txt - >>> - - output { - File coverage = "~{prefix}.txt.gz" - Float counts = read_float("~{prefix}.count.txt") - File counts_file = "~{prefix}.count.txt" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 2, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.2" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -528,10 +200,12 @@ task ComputeBedCoverage { task ReadMetrics { input { File bam + String? gcs_output_dir RuntimeAttr? runtime_attr_override } + String? 
output_dir = if defined(gcs_output_dir) then sub(select_first([gcs_output_dir]), "/?$", "/yield_aligned/") else gcs_output_dir String basename = basename(bam, ".bam") Int disk_size = 2*ceil(size(bam, "GB")) @@ -539,6 +213,23 @@ task ReadMetrics { set -euxo pipefail java -jar /usr/local/bin/gatk.jar ComputeLongReadMetrics -I ~{bam} -O ~{basename}.read_metrics -DF WellformedReadFilter + samtools flagstat ~{bam} > ~{basename}.flag_stats.txt + if ~{defined(output_dir)}; then + gsutil -m cp \ + "~{basename}.read_metrics.np_hist.txt" \ + "~{basename}.read_metrics.range_gap_hist.txt" \ + "~{basename}.read_metrics.zmw_hist.txt" \ + "~{basename}.read_metrics.prl_counts.txt" \ + "~{basename}.read_metrics.prl_hist.txt" \ + "~{basename}.read_metrics.prl_nx.txt" \ + "~{basename}.read_metrics.prl_yield_hist.txt" \ + "~{basename}.read_metrics.rl_counts.txt" \ + "~{basename}.read_metrics.rl_hist.txt" \ + "~{basename}.read_metrics.rl_nx.txt" \ + "~{basename}.read_metrics.rl_yield_hist.txt" \ + "~{basename}.flag_stats.txt" \ + "~{output_dir}" + fi >>> output { @@ -553,6 +244,7 @@ task ReadMetrics { File rl_hist = "~{basename}.read_metrics.rl_hist.txt" File rl_nx = "~{basename}.read_metrics.rl_nx.txt" File rl_yield_hist = "~{basename}.read_metrics.rl_yield_hist.txt" + File flag_stats = "~{basename}.flag_stats.txt" } ######################### @@ -563,50 +255,7 @@ task ReadMetrics { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task BamToBed { - input { - File bam - File bai - - RuntimeAttr? 
runtime_attr_override - } - - String bed = basename(bam, ".bam") + ".bed" - Int disk_size = 4*ceil(size(bam, "GB") + size(bai, "GB")) - - command <<< - set -euxo pipefail - - bedtools bamtobed -i ~{bam} > ~{bed} - >>> - - output { - File bedfile = bed - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 2, - mem_gb: 8, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 2, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.12" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { diff --git a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl b/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl index e1c2b7d15..2e0e595ac 100644 --- a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl +++ b/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl @@ -3,7 +3,6 @@ version 1.0 import "../../structs/Structs.wdl" import "../Utility/Utils.wdl" import "../Visualization/NanoPlot.wdl" as NP -import "../QC/AlignedMetrics.wdl" as AM workflow SampleLevelAlignedMetrics { @@ -31,7 +30,7 @@ workflow SampleLevelAlignedMetrics { call NP.NanoPlotFromBam { input: bam = aligned_bam, bai = aligned_bai } if (defined(bed_to_compute_coverage)) { - call AM.MosDepthOverBed { + call MosDepthOverBed { input: bam = aligned_bam, bai = aligned_bai, @@ -65,6 +64,55 @@ workflow SampleLevelAlignedMetrics { } } +task MosDepthOverBed { + input { + File bam + File bai + File bed + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) + String basename = basename(bam, ".bam") + String bedname = basename(bed, ".bed") + String prefix = "~{basename}.coverage_over_bed.~{bedname}" + + command <<< + set -euxo pipefail + + mosdepth -t 4 -b ~{bed} -n -x -Q 1 ~{prefix} ~{bam} + >>> + + output { + File global_dist = "~{prefix}.mosdepth.global.dist.txt" + File region_dist = "~{prefix}.mosdepth.region.dist.txt" + File regions = "~{prefix}.regions.bed.gz" + File regions_csi = "~{prefix}.regions.bed.gz.csi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "quay.io/biocontainers/mosdepth:0.2.4--he527e40_0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + task SummarizeDepthOverWholeBed { meta { From d3249019ac67782fcbbe3b373072eb091d2d123f Mon Sep 17 00:00:00 2001 From: Ted Sharpe Date: Fri, 29 Sep 2023 14:43:45 -0400 Subject: [PATCH 3/8] get rid of weird gatk version in ReadMetrics task --- docker/lr-metrics/Dockerfile | 6 +- docker/lr-mini/Dockerfile | 14 ++++ docker/lr-mini/Makefile | 17 +++++ wdl/tasks/QC/AlignedMetrics.wdl | 127 +++++++++++++++++++++++++++++++- 4 files changed, 157 insertions(+), 7 deletions(-) create mode 100644 docker/lr-mini/Dockerfile create mode 100644 docker/lr-mini/Makefile diff --git 
a/docker/lr-metrics/Dockerfile b/docker/lr-metrics/Dockerfile
index 557b4a37f..6048d83f4 100644
--- a/docker/lr-metrics/Dockerfile
+++ b/docker/lr-metrics/Dockerfile
@@ -16,10 +16,9 @@ ENV PATH=/miniconda/bin/:/miniconda/envs/lr-metrics/bin/:/root/google-cloud-sdk/

 # install conda packages
 COPY ./environment.yml /
-RUN conda install -n base conda-libmamba-solver && conda config --set solver libmamba
 RUN conda env create -f /environment.yml && conda clean -a

-# install super-special version of gatk
+# install gatk
 RUN git clone https://github.com/broadinstitute/gatk.git -b kvg_pbeap \
     && cd gatk \
     && git checkout c9497220ef13beb05da7c7a820c181be00b9b817 \
@@ -31,9 +30,6 @@ RUN git clone https://github.com/broadinstitute/gatk.git -b kvg_pbeap \
 # install picard
 RUN wget -O /usr/local/bin/picard.jar https://github.com/broadinstitute/picard/releases/download/2.22.1/picard.jar

-# install gsutil
-RUN curl -sSL https://sdk.cloud.google.com | bash
-
 # install various metric and visualization scripts
 COPY lima_report_detail.R /
 COPY lima_report_summary.R /
diff --git a/docker/lr-mini/Dockerfile b/docker/lr-mini/Dockerfile
new file mode 100644
index 000000000..096dab081
--- /dev/null
+++ b/docker/lr-mini/Dockerfile
@@ -0,0 +1,14 @@
+FROM us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2023-07-28-v0.28.1-beta-e70dfbd7
+
+# install some small, commonly used tools
+RUN apt-get -qqy update --fix-missing && \
+    apt-get -qqy dist-upgrade && \
+    apt-get -qqy install --no-install-recommends bsdmainutils datamash tree && \
+    apt-get -qqy clean && \
+    rm -rf /tmp/* \
+           /var/tmp/* \
+           /var/cache/apt/* \
+           /var/lib/apt/lists/* \
+           /usr/share/man/?? \
+           /usr/share/man/??_*
+
diff --git a/docker/lr-mini/Makefile b/docker/lr-mini/Makefile
new file mode 100644
index 000000000..7d4253440
--- /dev/null
+++ b/docker/lr-mini/Makefile
@@ -0,0 +1,17 @@
+IMAGE_NAME = lr-mini
+VERSION = 0.1.1
+
+TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION)
+TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest
+
+all: | build push
+
+build:
+	docker build -t $(TAG1) -t $(TAG2) .
+
+build_no_cache:
+	docker build --no-cache -t $(TAG1) -t $(TAG2) .
+
+push:
+	docker push $(TAG1)
+	docker push $(TAG2)
diff --git a/wdl/tasks/QC/AlignedMetrics.wdl b/wdl/tasks/QC/AlignedMetrics.wdl
index 5c75d4e93..6e60abd64 100644
--- a/wdl/tasks/QC/AlignedMetrics.wdl
+++ b/wdl/tasks/QC/AlignedMetrics.wdl
@@ -212,8 +212,131 @@ task ReadMetrics {
     command <<<
         set -euxo pipefail

-        java -jar /usr/local/bin/gatk.jar ComputeLongReadMetrics -I ~{bam} -O ~{basename}.read_metrics -DF WellformedReadFilter
+        cat << "EOF" > summarizeBAM.awk
+        BEGIN { FS=OFS="\t"; lastZMWid =-1; }
+        {
+            # retrieve the number of passes as the value of the "np" tag (default = 1)
+            nPasses=1;
+            for( fld=12;fld<=NF;++fld )
+                if( substr($fld,1,5)=="np:i:" ) {
+                    nPasses=substr($fld,6);
+                    break;
+                }
+
+            # get the read length and its bin for histograms
+            readLength=length($10);
+
+            # default the zero-mode waveguide id as last value + 1
+            zmwId=lastZMWid+1;
+
+            # default the range info as 0:0
+            rangeStart=0;
+            rangeStop=0;
+
+            # parse the read name (awk's split() fills indices starting at 1)
+            if( split($1,namePieces,"/")==3 ) {
+                zmwId = namePieces[2];
+                if( split(namePieces[3],range,"_")==2 ) {
+                    rangeStart = range[1];
+                    rangeStop = range[2];
+                }
+            }
+
+            # calculate value to report for range gap (or -1 to indicate no report for this read)
+            reportedRangeGap=-1;
+            if( lastRangeStop!=0 && lastZMWid==zmwId ) {
+                reportedRangeGap=rangeStart-lastRangeStop;
+            }
+
+            # calculate value to report for polymerase read length (or -1 to indicate no report for this read)
+            reportedPRL = -1;
+            if( lastZMWid!=-1 && lastZMWid!=zmwId ) {
+                reportedPRL=polymeraseRL;
+            }
+
+            # track polymerase read length across reads with the same zmwId
+            if( zmwId!=lastZMWid ) polymeraseRL=0;
+            polymeraseRL+=readLength;
+
+            # update values maintained across reads
+            lastZMWid=zmwId;
+            lastRangeStop=rangeStop;
+
+            # print the parsed data
+            print readLength,nPasses,readLength*nPasses,zmwId,reportedRangeGap,reportedPRL;
+        }
+        EOF
+
+        cat << "EOF" > calculateNX.awk
+        BEGIN { FS=OFS="\t" }
+        {
+            sum+=$1;
+            cent=int(100.*sum/tot);
+            if( cent!=lastCent )
+                print cent,lastVal;
+            lastCent=cent;
+            lastVal=$1
+        }
+        EOF
+
+        # awkOut.tsv file is a tab delimited file with these columns:
+        #   fld:      1          2            3             4            5                     6
+        # datum: readLength  nPasses  yield(len*passes)  zmwId  reportedRangeGap  reportedPolymeraseReadLength
+        samtools view -F0x900 ~{bam} | awk -f summarizeBAM.awk > awkOut.tsv
+
+        # extract the nPasses column for each read, sort numerically, count each group, add a header line
+        cut -f2 awkOut.tsv | sort -n | datamash -g1 count 1 | \
+            awk 'BEGIN{print "np\tcount"}{print}' > "~{basename}.read_metrics.np_hist.txt"
+
+        # extract the reportedRangeGap column, ignore -1 values, sort numerically, count each group, add header line
+        cut -f5 awkOut.tsv | sed '/^-1$/d' | sort -n | datamash -g1 count 1 | \
+            awk 'BEGIN{print "range_gap\tcount"}{print}' > "~{basename}.read_metrics.range_gap_hist.txt"
+
+        # extract the zmwId column, sort numerically, count each group, extract counts and sort numerically,
+        # count each group of counts, add header line
+        cut -f4 awkOut.tsv | sort -n | datamash -g1 count 1 | cut -f2 | sort -n | datamash -g1 count 1 | \
+            awk 'BEGIN{print "zmw_count\tcount"}{print}' > "~{basename}.read_metrics.zmw_hist.txt"

+        # extract yield and prl, ignore -1 prls (the fields are tab-delimited), sort numerically, mash some summary stats, add header line
+        cut -f3,6 awkOut.tsv | sed '/\t-1$/d' | sort -n | datamash count 2 sum 2 sum 1 min 2 max 2 mean 2 sstdev 2 | \
+            awk 'BEGIN{print
"num_polymerase_reads\tpolymerase_read_yield_bp\tpolymerase_read_yield_bp_times_passes\tpolymerase_read_min_length_bp\tpolymerase_read_max_length_bp\tpolymerase_read_mean_length_bp\tpolymerase_read_sd_length_bp"}{print}' > "~{basename}.read_metrics.prl_counts.txt" + + # extract prl rounded to 100s ignoring -1 values, sort numerically, count each group, add header line + awk 'BEGIN{FS=OFS="\t"}{if($6>=0)print int(($6+99)/100)*100}' awkOut.tsv | sort -n | datamash -g1 count 1 | \ + awk 'BEGIN{print "polymerase_read_length\tlength_bp"}{print}' > "~{basename}.read_metrics.prl_hist.txt" + + # calculate total of prl column, ignoring -1 values + # then extract the prl column, ignore -1 values, reverse sort numerically, print the centile divisions, + # forget about the 100th percentile, add header line + tot=$(cut -f6 < awkOut.tsv | sed '/^-1$/d' | datamash sum 1) + cut -f6 < awkOut.tsv | sed '/^-1$/d' | sort -rn | awk -v "tot=$tot" -f calculateNX.awk | sed '/^100 /d' | \ + awk 'BEGIN{print "NX\tvalue"}{print}' > "~{basename}.read_metrics.prl_nx.txt" + + # extract prl rounded to 100s and the original prl, sort numerically, sum prls for each bin, add header line + awk 'BEGIN{FS=OFS="\t"}{if($6>=0)print int(($6+99)/100)*100,$6}' awkOut.tsv | sort -n | datamash -g1 sum 2 | \ + awk 'BEGIN{print "polymerase_read_length\tyield_bp"}{print}' > "~{basename}.read_metrics.prl_yield_hist.txt" + + # extract readLength and yield columns, sort numerically, mash some summary stats, add header line + cut -f1,3 awkOut.tsv | sort -n | datamash count 1 sum 1 sum 2 min 1 max 1 mean 1 sstdev 1 | \ + awk 'BEGIN{print "num_reads\tread_yield_bp\tread_yield_bp_times_passes\tread_min_length_bp\tread_max_length_bp\tread_mean_length_bp\tread_sd_length_bp"}{print}' > "~{basename}.read_metrics.rl_counts.txt" + + # extract readLength rounded to 100s, sort numerically, count each group, add header line + awk 'BEGIN{FS=OFS="\t"}{print int(($1+99)/100)*100}' awkOut.tsv | sort -n | datamash -g1 count 1 | \ + awk 'BEGIN{print "read_length_bp\tcount"}{print}' > "~{basename}.read_metrics.rl_hist.txt" + + # calculate total of readLength column, + # then extract the readLength column, reverse sort numerically, print the centile divisions, + # forget about the 100th percentile, add header line + tot=$(cut -f1 < awkOut.tsv | datamash sum 1) + cut -f1 < awkOut.tsv | sort -rn | awk -v "tot=$tot" -f calculateNX.awk | sed '/^100 /d' | \ + awk 'BEGIN{print "NX\tvalue"}{print}' > "~{basename}.read_metrics.rl_nx.txt" + + # extract readLength rounded to 100s and the original readLength, sort numerically, sum each group, add header line + awk 'BEGIN{FS=OFS="\t"}{print int(($1+99)/100)*100,$1}' awkOut.tsv | sort -n | datamash -g1 sum 2 | \ + awk 'BEGIN{print "read_length\tyield_bp"}{print}' > "~{basename}.read_metrics.rl_yield_hist.txt" + samtools flagstat ~{bam} > ~{basename}.flag_stats.txt + if ~{defined(output_dir)}; then gsutil -m cp \ "~{basename}.read_metrics.np_hist.txt" \ @@ -255,7 +378,7 @@ task ReadMetrics { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.12" + docker: "us.gcr.io/broad-dsp-lrma/lr-mini:0.1.1" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { From 6ab28a25f6d93406046dd09c898d3acb2647257e Mon Sep 17 00:00:00 2001 From: Ted Sharpe Date: Fri, 29 Sep 2023 15:04:55 -0400 Subject: [PATCH 4/8] fix missing Structs import in scrapbook --- wdl/deprecated/tasks/Scrapbook.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git 
From 6ab28a25f6d93406046dd09c898d3acb2647257e Mon Sep 17 00:00:00 2001
From: Ted Sharpe
Date: Fri, 29 Sep 2023 15:04:55 -0400
Subject: [PATCH 4/8] fix missing Structs import in scrapbook

---
 wdl/deprecated/tasks/Scrapbook.wdl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/wdl/deprecated/tasks/Scrapbook.wdl b/wdl/deprecated/tasks/Scrapbook.wdl
index 9c21161c6..110103b05 100644
--- a/wdl/deprecated/tasks/Scrapbook.wdl
+++ b/wdl/deprecated/tasks/Scrapbook.wdl
@@ -1,5 +1,7 @@
 version 1.0
 
+import "../../../structs/Structs.wdl"
+
 task CoverageTrack {
     input {
         File bam

From 3b8b9f5d436c830b7d34a3a2dac7d9726a0160ec Mon Sep 17 00:00:00 2001
From: Ted Sharpe
Date: Fri, 29 Sep 2023 16:11:57 -0400
Subject: [PATCH 5/8] delete test json for now

---
 test/test_data/AlignedMetrics.json | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 test/test_data/AlignedMetrics.json

diff --git a/test/test_data/AlignedMetrics.json b/test/test_data/AlignedMetrics.json
deleted file mode 100644
index fd39c47a6..000000000
--- a/test/test_data/AlignedMetrics.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-    "AlignedMetrics.aligned_bam": "gs://broad-dsp-lrma-cromwell/test_data/aligned-metrics/NA24385.bam",
-    "AlignedMetrics.aligned_bai": "gs://broad-dsp-lrma-cromwell/test_data/aligned-metrics/NA24385.bam.bai",
-    "AlignedMetrics.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
-    "AlignedMetrics.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
-    "AlignedMetrics.gcs_output_dir": "gs://broad-dsp-lrma-ci/test-outputs"
-}

From 9674dd2bcaccc2e9280ee79a2eec22cf26f44793 Mon Sep 17 00:00:00 2001
From: Ted Sharpe
Date: Fri, 29 Sep 2023 16:32:25 -0400
Subject: [PATCH 6/8] dammit

---
 wdl/deprecated/tasks/Scrapbook.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/deprecated/tasks/Scrapbook.wdl b/wdl/deprecated/tasks/Scrapbook.wdl
index 110103b05..c186e875b 100644
--- a/wdl/deprecated/tasks/Scrapbook.wdl
+++ b/wdl/deprecated/tasks/Scrapbook.wdl
@@ -1,6 +1,6 @@
 version 1.0
 
-import "../../../structs/Structs.wdl"
+import "../../structs/Structs.wdl"
 
 task CoverageTrack {
     input {
         File bam

From ceaf133f513195d008d030f7fa140149bfc5bf28 Mon Sep 17 00:00:00 2001
From: Ted Sharpe
Date: Sat, 30 Sep 2023 10:04:37 -0400
Subject: [PATCH 7/8] bump mosdepth version

---
 docker/lr-mosdepth/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/lr-mosdepth/Makefile b/docker/lr-mosdepth/Makefile
index 9b2693d2b..0186202bd 100644
--- a/docker/lr-mosdepth/Makefile
+++ b/docker/lr-mosdepth/Makefile
@@ -1,5 +1,5 @@
 IMAGE_NAME = lr-mosdepth
-VERSION = 0.3.1 # This should match the MosDepth version number
+VERSION = 0.3.2 # This should match the MosDepth version number
 
 TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION)
 TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest
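
One caveat on the changed line: with GNU Make, an inline comment does not strip the whitespace before the '#', so VERSION expands to "0.3.2 " with a trailing space. That is harmless here, because the unquoted $(TAG1) in the docker recipes is re-split by the shell, but it would bite any rule that quotes the tag. A quick demonstration (illustrative only, written to a throwaway file):

    # show that GNU Make keeps the space before an inline comment
    printf '%s\n' \
        'VERSION = 0.3.2 # This should match the MosDepth version number' \
        '$(info [$(VERSION)])' \
        'all: ;' > /tmp/demo.mk
    make -s -f /tmp/demo.mk   # prints: [0.3.2 ]  (note the space before the bracket)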

From f9dc53ac6f168b96e23d0e62d68b73d70acc4f5d Mon Sep 17 00:00:00 2001
From: Ted Sharpe
Date: Sat, 30 Sep 2023 10:26:20 -0400
Subject: [PATCH 8/8] remove MergeVCFs.wdl

---
 wdl/tasks/Utility/MergeVCFs.wdl | 159 --------------------------------
 1 file changed, 159 deletions(-)
 delete mode 100644 wdl/tasks/Utility/MergeVCFs.wdl

diff --git a/wdl/tasks/Utility/MergeVCFs.wdl b/wdl/tasks/Utility/MergeVCFs.wdl
deleted file mode 100644
index 053a1d6df..000000000
--- a/wdl/tasks/Utility/MergeVCFs.wdl
+++ /dev/null
@@ -1,159 +0,0 @@
-version 1.0
-
-import "../../structs/Structs.wdl"
-
-workflow MergeVCFs {
-
-    meta {
-        description: "Merges VCFs 'horizontally' (e.g., make a single VCF from many single-sample VCFs)"
-    }
-
-    input {
-        Array[File] inputArray
-        Int chunkSize = 0
-        RuntimeAttr? runtime_attr_override_chunks
-        RuntimeAttr? runtime_attr_override_merge
-        RuntimeAttr? runtime_attr_override_final_merge
-    }
-
-    parameter_meta {
-        inputArray: "The array of VCFs to merge"
-        chunkSize: "Optional number of VCFs to merge in a single task"
-        runtime_attr_override_chunks: "Override the default runtime attributes for the chunking task"
-        runtime_attr_override_merge: "Override the default runtime attributes for the initial merging tasks"
-        runtime_attr_override_final_merge: "Override the default runtime attributes for the final merging task"
-    }
-
-    call CreateChunks {
-        input:
-            inputArray = inputArray,
-            chunkSize = chunkSize,
-            runtime_attr_override = runtime_attr_override_chunks
-    }
-
-    scatter ( inputVCFs in CreateChunks.outputArray ) {
-        call MergeVCFsTask {
-            input:
-                inputVCFs = inputVCFs,
-                runtime_attr_override = runtime_attr_override_merge
-        }
-    }
-
-    call MergeVCFsTask as FinalMerge {
-        input:
-            inputVCFs = MergeVCFsTask.outputVCF,
-            runtime_attr_override = runtime_attr_override_final_merge
-    }
-
-    output {
-        File outputVCF = FinalMerge.outputVCF
-    }
-}
-
-task CreateChunks {
-
-    meta {
-        description: "Breaks an array into chunks"
-    }
-
-    input {
-        Array[String] inputArray
-        Int chunkSize = 0
-        RuntimeAttr? runtime_attr_override
-    }
-
-    parameter_meta {
-        inputArray: "The array to chunk"
-        chunkSize: "Optional chunk size override"
-        runtime_attr_override: "Override the default runtime attributes"
-    }
-
-    command <<<
-        set -euo pipefail
-        mkdir chunks
-        cd chunks
-        lines=`echo "~{length(inputArray)}" | \
-              awk 'BEGIN{lines=~{chunkSize}}
-                   {if ( !lines ) lines=int(sqrt($1)+.99);
-                    while ( $1%lines==1 ) lines += 1;
-                    print lines}'`
-        split -a4 -l$lines "~{write_lines(inputArray)}"
-        for fil in x*; do cat $fil | awk 'BEGIN{sep=""}{printf "%s%s",sep,$0;sep="\t"}END{printf "\n"}' >> ../inputs.tsv; done
-    >>>
-
-    output {
-        Array[Array[String]] outputArray = read_tsv("inputs.tsv")
-    }
-
-    #########################
-    RuntimeAttr default_attr = object {
-        cpu_cores: 1,
-        mem_gb: 1,
-        disk_gb: 10,
-        boot_disk_gb: 10,
-        preemptible_tries: 3,
-        max_retries: 1,
-        docker: "ubuntu:latest"
-    }
-    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-    runtime {
-        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-        docker: select_first([runtime_attr.docker, default_attr.docker])
-    }
-}
-
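Before the merge task itself, the backticked awk above deserves a gloss: with no chunkSize override it aims for chunks of roughly ceil(sqrt(N)) VCFs, which keeps the scatter width and the width of the final merge comparable, and the while loop widens the chunks whenever the division would strand a single VCF in the last chunk. A toy check of the same arithmetic outside WDL (illustrative only):

    # 21 inputs and no chunkSize override
    echo 21 | awk 'BEGIN{lines=0}
                   {if ( !lines ) lines=int(sqrt($1)+.99);
                    while ( $1%lines==1 ) lines += 1;
                    print lines}'
    # prints 6: a chunk size of ceil(sqrt(21)) = 5 would leave a lone VCF
    # in the last chunk (21 = 5+5+5+5+1), so it is bumped to 6 (21 = 6+6+6+3)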
-task MergeVCFsTask {
-
-    meta {
-        description: "Merges VCFs"
-    }
-
-    input {
-        Array[File] inputVCFs
-        RuntimeAttr? runtime_attr_override
-    }
-
-    parameter_meta {
-        inputVCFs: {help: "The VCFs to merge -- they may be given as gs: paths",
-                    localization_optional: true}
-        runtime_attr_override: "Override the default runtime attributes"
-    }
-
-    command <<<
-        set -euo pipefail
-        export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token`
-        bcftools merge --no-index -0 --threads 3 -Oz "~{sep='" "' inputVCFs}" > merged.vcf.gz
-    >>>
-
-    output {
-        File outputVCF = "merged.vcf.gz"
-    }
-
-    #########################
-    RuntimeAttr default_attr = object {
-        cpu_cores: 4,
-        mem_gb: 4,
-        disk_gb: ceil(size(inputVCFs, "GB")) + 10,
-        boot_disk_gb: 10,
-        preemptible_tries: 3,
-        max_retries: 1,
-        docker: "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2023-02-01-v0.26.8-beta-9b25c72d"
-    }
-    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-    runtime {
-        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-        docker: select_first([runtime_attr.docker, default_attr.docker])
-    }
-}
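
A closing note on the task just removed: because inputVCFs is declared localization_optional, Cromwell is free to hand the task gs:// URLs rather than localized files, and exporting GCS_OAUTH_TOKEN lets the htslib-backed bcftools stream those URLs directly. Outside of Cromwell the same pattern looks roughly like this (the bucket paths are invented; this assumes a bcftools built against an htslib with libcurl/GCS support, and working application-default credentials):

    export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
    # --no-index merges without requiring .tbi/.csi indexes;
    # -0 reports genotypes absent from an input as 0/0 rather than ./.
    bcftools merge --no-index -0 --threads 3 -Oz \
        gs://example-bucket/sampleA.vcf.gz \
        gs://example-bucket/sampleB.vcf.gz > merged.vcf.gz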