
Commit 41a0276

Merge pull request #74 from DataBiosphere/hotfix-assoc-agg
Hotfix assoc agg
2 parents 89d560d + 63f1df6 commit 41a0276


10 files changed: +108 -35 lines changed


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -12,4 +12,5 @@ assoc-aggregate/cromwell-68.1.jar
 *.integer
 *.zip
 cwl.output.json
-job.json
+job.json
+_test-data-and-truths_/assoc/big_silly_file_chr1.gds

README.md

Lines changed: 2 additions & 2 deletions
@@ -13,9 +13,9 @@ This project is a Workflow Description Language (WDL) implementation of several
 * Documentation of inputs, how each workflow works, and WDL-specific workarounds
 
 ## Usage
-These workflows are tested on both Terra and the local Cromwell execution engine. Example files are provided in `test-data-and-truths` and in `gs://topmed_workflow_testing/UWGAC_WDL/`.
+These workflows are tested on both Terra and the local Cromwell execution engine. Example files are provided in `test-data-and-truths` and in `gs://topmed_workflow_testing/UWGAC_WDL/`.
 
-Essentially all workflows which take in chromosome-level files share filename requirements. For these files, the chromosome must be included in the filename with the format `chr##` where `##` is the name of the chromosome (1-24 or X, Y). Chromosome can be included at any part of the filename provided they follow this format. For instance, data_subset_chr1.gds, data_chr1_subset.gds, and chr1_data_subset.gds are all valid names, while data_chromosome1_subset.gds and data_subset_c1.gds are not valid.
+Essentially all workflows which take in chromosome-level files share filename requirements. For these files, the chromosome must be included in the filename with the format `chr##` where `##` is the name of the chromosome (1-24 or X, Y). The chromosome can be included in any part of the filename provided it follows this format. For instance, data_subset_chr1.gds, data_chr1_subset.gds, and chr1_data_subset.gds are all valid names, while data_chromosome1_subset.gds and data_subset_c1.gds are not valid. Note that the association aggregate, LD prune, and null model workflows additionally require that you have greater than one input GDS file (ie, input at least chr1 and chr2).
 
 The original CWL pipelines had arguments relating to runtime such as `ncores` and `cluster_type` that do not apply to WDL. Please familiarize yourself with the [runtime attributes of WDL](https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/) if you are unsure how your settings may transfer. For more information on specific runtime attributes for specific tasks, see [the further reading section](https://github.com/DataBiosphere/analysis_pipeline_WDL/main/README.md#further-reading).
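A note for readers new to this naming rule: the `chr##` convention described above is easy to check with a regular expression. The sketch below is purely illustrative; the `has_valid_chr_name` helper and the exact pattern are not part of this repository.

    import re

    # Hypothetical check mirroring the documented rule: the filename must contain
    # "chr" immediately followed by 1-24, X, or Y (e.g. data_subset_chr1.gds).
    CHR_PATTERN = re.compile(r"chr(1[0-9]|2[0-4]|[1-9]|X|Y)(?=[^0-9]|$)")

    def has_valid_chr_name(filename):
        return CHR_PATTERN.search(filename) is not None

    for name in ["data_subset_chr1.gds", "chr1_data_subset.gds",
                 "data_chromosome1_subset.gds", "data_subset_c1.gds"]:
        print(name, has_valid_chr_name(name))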

assoc-aggregate/assoc-aggregate.wdl

Lines changed: 57 additions & 19 deletions
@@ -16,12 +16,20 @@ task wdl_validate_inputs {
 String? genome_build
 String? aggregate_type
 String? test
+Int? num_gds_files
 
 # no runtime attr because this is a trivial task that does not scale
 }
 
 command <<<
 set -eux -o pipefail
+
+if [[ ~{num_gds_files} = 1 ]]
+then
+echo "Invalid input - you need to put in at least two GDS files (preferably consecutive ones, like chr1 and chr2)"
+exit 1
+fi
+
 #acceptable genome builds: ("hg38" "hg19")
 #acceptable aggreg types: ("allele" "position")
 acceptable_test_values=("burden" "skat" "smmat" "fastskat" "skato")
@@ -97,6 +105,10 @@ task sbg_gds_renamer {
 input {
 File in_variant
 
+# this is ignored by the script itself, but including this stops this task from firing
+# before wdl_validate_inputs finishes
+String? noop
+
 Boolean debug = false
 
 # runtime attributes, which you shouldn't need to adjust as this is a very light task
@@ -404,7 +416,7 @@ task sbg_prepare_segments_1 {
 # Although the format of the outputs are different from the CWL, the actual contents of each
 # component (gds, segment number, and agg file) should match the CWL perfectly (barring compute
 # platform differences, etc). The format is a zip file containing each segment's components in
-# order to work around WDL's limitations. Essentially, CWL easily scatters on the dot-product
+# order to work around WDL's limitations. Essentially, CWL easily scatters on the dot-product of
 # multiple arrays, but trying to that in WDL is painful. See cwl-vs-wdl-dev.md for more info.
 
 input {
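The comment in the hunk above is the heart of this task's workaround: instead of scattering on the dot-product of several parallel arrays (easy in CWL, painful in WDL), each segment's components are bundled into one zip so downstream tasks can scatter over a single array of files. A loose, standalone Python sketch of that idea follows; the `bundle_segments` helper and its names are illustrative only, not the task's actual code.

    from zipfile import ZipFile

    def bundle_segments(gds_files, segment_numbers, agg_files):
        """Bundle the i-th element of each parallel list into one zip per segment.

        The listed gds/agg paths must already exist on disk for the writes to succeed.
        """
        bundles = []
        for gds, seg, agg in zip(gds_files, segment_numbers, agg_files):
            # Record the segment number as a small marker file, much like the real
            # script's "<n>.integer" files.
            marker = "%s.integer" % seg
            with open(marker, "w") as handle:
                handle.write(str(seg))
            bundle_name = "dotprod%s.zip" % seg
            with ZipFile(bundle_name, "w", allowZip64=True) as bundle:
                bundle.write(gds)
                bundle.write(marker)
                if agg is not None:
                    bundle.write(agg)
            bundles.append(bundle_name)
        return bundles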
@@ -414,14 +426,14 @@ task sbg_prepare_segments_1 {
 Array[File]? variant_include_files
 
 # runtime attr
-Int addldisk = 10
-Int cpu = 2
-Int memory = 4
-Int preempt = 2
+Int addldisk = 100
+Int cpu = 12
+Int memory = 16
+Int preempt = 0
 }
 
 # estimate disk size required
-Int gds_size = 2 * ceil(size(input_gds_files, "GB"))
+Int gds_size = 5 * ceil(size(input_gds_files, "GB"))
 Int seg_size = 2 * ceil(size(segments_file, "GB"))
 Int agg_size = 2 * ceil(size(aggregate_files, "GB"))
 Int dsk_size = gds_size + seg_size + agg_size + addldisk
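For readers outside WDL, the hunk above simply raises the task's resource defaults and pads the disk request more aggressively: the GDS multiplier goes from 2x to 5x. A quick Python sketch of the same disk arithmetic, assuming input sizes are already known in GB; the function name is illustrative.

    import math

    def estimate_disk_gb(gds_gb, segments_gb, agg_gb, addldisk=100):
        # Mirrors the estimate above: 5x the GDS inputs, 2x the segments file and
        # aggregate files, plus a flat addldisk pad.
        return (5 * math.ceil(gds_gb)
                + 2 * math.ceil(segments_gb)
                + 2 * math.ceil(agg_gb)
                + addldisk)

    print(estimate_disk_gb(gds_gb=4.3 * 23, segments_gb=0.001, agg_gb=1.2))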
@@ -463,6 +475,7 @@ task sbg_prepare_segments_1 {
 from zipfile import ZipFile
 import os
 import shutil
+import datetime
 
 def find_chromosome(file):
 chr_array = []
@@ -516,7 +529,10 @@ task sbg_prepare_segments_1 {
 segments = segments[1:] # remove first line
 return segments
 
-# prepare GDS output
+######################
+# prepare GDS output #
+######################
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 output_gdss = []
 gds_segments = wdl_get_segments()
@@ -530,8 +546,12 @@ task sbg_prepare_segments_1 {
 gds_output_hack = open("gds_output_debug.txt", "w")
 gds_output_hack.writelines(["%s " % thing for thing in output_gdss])
 gds_output_hack.close()
+print("Info: GDS output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
-# prepare segment output
+######################
+# prepare seg output #
+######################
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 output_segments = []
 actual_segments = wdl_get_segments()
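The new `print` statements above all reuse one elapsed-time idiom: subtract two `datetime` values and keep the whole minutes from `divmod`. A minimal, self-contained sketch of just that idiom; the `time.sleep` call stands in for the real work being timed.

    import datetime
    import time

    beginning = datetime.datetime.now()
    time.sleep(1)  # placeholder for the real work being timed
    elapsed_minutes = divmod((datetime.datetime.now() - beginning).total_seconds(), 60)[0]
    print("Info: step finished in %s minutes" % elapsed_minutes)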
@@ -544,23 +564,28 @@ task sbg_prepare_segments_1 {
 seg_num = i+1
 output_segments.append(seg_num)
 output_seg_as_file = open("%s.integer" % seg_num, "w")
-
+
 # I don't know for sure if this case is actually problematic, but I suspect it will be.
 if max(output_segments) != len(output_segments):
 print("ERROR: output_segments needs to be a list of consecutive integers.")
 print("Debug: Max of list: %s. Len of list: %s." %
 [max(output_segments), len(output_segments)])
 print("Debug: List is as follows:\n\t%s" % output_segments)
 exit(1)
+
 segs_output_hack = open("segs_output_debug.txt", "w")
 segs_output_hack.writelines(["%s " % thing for thing in output_segments])
 segs_output_hack.close()
+print("Info: Segment output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
-# prepare aggregate output
+######################
+# prepare agg output #
+######################
 # The CWL accounts for there being no aggregate files as the CWL considers them an optional
 # input. We don't need to account for that because the way WDL works means it they are a
 # required output of a previous task and a required input of this task. That said, if this
 # code is reused for other WDLs, it may need some adjustments right around here.
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 agg_segments = wdl_get_segments()
 if 'chr' in os.path.basename(IIaggregate_filesII[0]):
@@ -577,8 +602,12 @@ task sbg_prepare_segments_1 {
 output_aggregate_files.append(input_aggregate_files[chr])
 elif (chr in input_gdss):
 output_aggregate_files.append(None)
+print("Info: Aggregate output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
-# prepare variant include output
+#########################
+# prepare varinc output #
+#########################
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 var_segments = wdl_get_segments()
 if IIvariant_include_filesII != [""]:
@@ -608,6 +637,7 @@ task sbg_prepare_segments_1 {
 var_output_hack = open("variant_output_debug.txt", "w")
 var_output_hack.writelines(["%s " % thing for thing in output_variant_files])
 var_output_hack.close()
+print("Info: Variant include output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
 # We can only consistently tell output files apart by their extension. If var include files
 # and agg files are both outputs, this is problematic, as they both share the RData ext.
@@ -617,9 +647,11 @@ task sbg_prepare_segments_1 {
 os.mkdir("temp")
 
 # make a bunch of zip files
+print("Preparing zip file outputs...")
 for i in range(0, max(output_segments)):
+beginning = datetime.datetime.now()
 plusone = i+1
-this_zip = ZipFile("dotprod%s.zip" % plusone, "w")
+this_zip = ZipFile("dotprod%s.zip" % plusone, "w", allowZip64=True)
 this_zip.write("%s" % output_gdss[i])
 this_zip.write("%s.integer" % output_segments[i])
 this_zip.write("%s" % output_aggregate_files[i])
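The `allowZip64=True` change above is what lets these archives grow past the classic ZIP limits: if ZIP64 were disabled, a member or archive exceeding roughly 4 GB would raise `zipfile.LargeZipFile`, whereas with it enabled `ZipFile` writes ZIP64 extensions. A minimal sketch, with a tiny placeholder file standing in for a large GDS input.

    from zipfile import ZipFile

    # Create a small placeholder so the example runs end to end; in the real task
    # this would be a multi-gigabyte GDS file.
    with open("example_chr1.gds", "w") as handle:
        handle.write("placeholder")

    # With allowZip64 disabled, members past the ~4 GB ZIP limits raise
    # zipfile.LargeZipFile; enabling it makes ZipFile use ZIP64 extensions instead.
    with ZipFile("dotprod_example.zip", "w", allowZip64=True) as archive:
        archive.write("example_chr1.gds")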
@@ -651,13 +683,15 @@ task sbg_prepare_segments_1 {
 
 this_zip.write("varinclude/%s" % output_variant_files[i])
 this_zip.close()
+print("Info: Wrote dotprod%s.zip" % plusone)
+print("Info: This took %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 CODE
 >>>
 
 runtime {
 cpu: cpu
 docker: "uwgac/topmed-master@sha256:c564d54f5a3b9daed7a7677f860155f3b8c310b0771212c2eef1d6338f5c2600" # uwgac/topmed-master:2.12.0
-disks: "local-disk " + dsk_size + " HDD"
+disks: "local-disk " + dsk_size + " SSD"
 memory: "${memory} GB"
 preemptibles: "${preempt}"
 }
@@ -692,10 +726,10 @@ task assoc_aggregate {
 String? genome_build # acts as enum
 
 # runtime attr
-Int addldisk = 1
-Int cpu = 1
-Int memory = 8
-Int preempt = 0
+Int addldisk = 50
+Int cpu = 4
+Int memory = 16
+Int preempt = 1
 
 Boolean debug = false
 }
@@ -1346,19 +1380,23 @@ workflow assoc_agg {
 String? weight_user
 }
 
+Int num_gds_files = length(input_gds_files)
+
 # In order to force this to run first, all other tasks that use these "psuedoenums"
 # (Strings that mimic type Enum from CWL) will take them in via outputs of this task
 call wdl_validate_inputs {
 input:
 genome_build = genome_build,
 aggregate_type = aggregate_type,
-test = test
+test = test,
+num_gds_files = num_gds_files
 }
 
 scatter(gds_file in input_gds_files) {
 call sbg_gds_renamer {
 input:
-in_variant = gds_file
+in_variant = gds_file,
+noop = wdl_validate_inputs.valid_genome_build
 }
 }
 
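At the workflow level, the hunk above counts the GDS inputs with `length()` and hands the count to `wdl_validate_inputs`, which now fails fast when only one file is supplied. The same pre-flight check, sketched in plain Python; the function name and message wording are illustrative, not taken from the workflow.

    def validate_gds_inputs(input_gds_files):
        # Mirrors the new fail-fast behavior: the aggregate association workflow
        # needs more than one chromosome-level GDS file (e.g. chr1 and chr2).
        if len(input_gds_files) < 2:
            raise ValueError(
                "Invalid input - you need to put in at least two GDS files "
                "(preferably consecutive ones, like chr1 and chr2)")
        return len(input_gds_files)

    print(validate_gds_inputs(["data_chr1.gds", "data_chr2.gds"]))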

assoc-aggregate/checker/assoc-aggregate-checker.wdl

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 version 1.0
 import "https://raw.githubusercontent.com/DataBiosphere/analysis_pipeline_WDL/v7.0.0/assoc-aggregate/assoc-aggregate.wdl" as assoc_agg_wf
+#import "../assoc-aggregate.wdl" as assoc_agg_wf # use this if you want to test a local version
 import "https://raw.githubusercontent.com/dockstore/checker-WDL-templates/v1.1.0/checker_tasks/arraycheck_task.wdl" as verify_array
 
 workflow aggie_checker {

assoc-aggregate/prepare_segments_1.py

Lines changed: 27 additions & 6 deletions
@@ -13,6 +13,7 @@
 from zipfile import ZipFile
 import os
 import shutil
+import datetime
 
 def find_chromosome(file):
 chr_array = []
@@ -66,7 +67,10 @@ def wdl_get_segments():
 segments = segments[1:] # remove first line
 return segments
 
-# prepare GDS output
+######################
+# prepare GDS output #
+######################
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 output_gdss = []
 gds_segments = wdl_get_segments()
@@ -80,8 +84,12 @@ def wdl_get_segments():
 gds_output_hack = open("gds_output_debug.txt", "w")
 gds_output_hack.writelines(["%s " % thing for thing in output_gdss])
 gds_output_hack.close()
+print("Info: GDS output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
-# prepare segment output
+######################
+# prepare seg output #
+######################
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 output_segments = []
 actual_segments = wdl_get_segments()
@@ -106,12 +114,16 @@ def wdl_get_segments():
 segs_output_hack = open("segs_output_debug.txt", "w")
 segs_output_hack.writelines(["%s " % thing for thing in output_segments])
 segs_output_hack.close()
+print("Info: Segment output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
-# prepare aggregate output
+######################
+# prepare agg output #
+######################
 # The CWL accounts for there being no aggregate files as the CWL considers them an optional
 # input. We don't need to account for that because the way WDL works means it they are a
 # required output of a previous task and a required input of this task. That said, if this
 # code is reused for other WDLs, it may need some adjustments right around here.
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 agg_segments = wdl_get_segments()
 if 'chr' in os.path.basename(IIaggregate_filesII[0]):
@@ -128,8 +140,12 @@ def wdl_get_segments():
 output_aggregate_files.append(input_aggregate_files[chr])
 elif (chr in input_gdss):
 output_aggregate_files.append(None)
+print("Info: Aggregate output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
-# prepare variant include output
+#########################
+# prepare varinc output #
+#########################
+beginning = datetime.datetime.now()
 input_gdss = pair_chromosome_gds(IIinput_gds_filesII)
 var_segments = wdl_get_segments()
 if IIvariant_include_filesII != [""]:
@@ -159,6 +175,7 @@ def wdl_get_segments():
 var_output_hack = open("variant_output_debug.txt", "w")
 var_output_hack.writelines(["%s " % thing for thing in output_variant_files])
 var_output_hack.close()
+print("Info: Variant include output prepared in %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])
 
 # We can only consistently tell output files apart by their extension. If var include files
 # and agg files are both outputs, this is problematic, as they both share the RData ext.
@@ -168,9 +185,11 @@ def wdl_get_segments():
 os.mkdir("temp")
 
 # make a bunch of zip files
+print("Preparing zip file outputs...")
 for i in range(0, max(output_segments)):
+beginning = datetime.datetime.now()
 plusone = i+1
-this_zip = ZipFile("dotprod%s.zip" % plusone, "w")
+this_zip = ZipFile("dotprod%s.zip" % plusone, "w", allowZip64=True)
 this_zip.write("%s" % output_gdss[i])
 this_zip.write("%s.integer" % output_segments[i])
 this_zip.write("%s" % output_aggregate_files[i])
@@ -201,4 +220,6 @@ def wdl_get_segments():
 exit(1)
 
 this_zip.write("varinclude/%s" % output_variant_files[i])
-this_zip.close()
+this_zip.close()
+print("Info: Wrote dotprod%s.zip" % plusone)
+print("Info: This took %s minutes" % divmod((datetime.datetime.now()-beginning).total_seconds(), 60)[0])

assoc-aggregate/readme.md

Lines changed: 4 additions & 0 deletions
@@ -31,6 +31,10 @@ Aggregate tests are typically used to jointly test rare variants. This workflow
 
 * Do not use this pipeline with non-consecutive chromosomes. A run containing chr1, chr2, and chr3 will work. A run containing chr1, chr20, and chr13 may not. (Non-autosomes excluded -- a run containing chr1, chr2, and chrX will not error out, but as noted above chrX will be dropped.)
 
+* Do not use this pipeline on only one chromosome.
+
+* If your data consists of GDS files that are >10 GB, it is *recommended* to set a lower number of segments in order to decrease delocalization time in the prepare segments task. For reference, it took almost two hours for a 25-segment output of 4.3 GB GDS files to finish delocalizing in prepare segments. I am currently seeking additional guidance from Cromwell and Terra devs on handling this bottleneck better.
+
 ## Sample Inputs
 * terra-allele, local-allele, and the first part of the checker workflow are based upon [assoc_aggregate_allele.config](https://github.com/UW-GAC/analysis_pipeline/blob/master/testdata/assoc_aggregate_allele.config)
 * terra-position, local-position, and the second part of the checker workflow are based upon [assoc_aggregate_position.config](https://github.com/UW-GAC/analysis_pipeline/blob/master/testdata/assoc_aggregate_position.config)

ld-pruning/ld-pruning-terra.json

Lines changed: 3 additions & 0 deletions
@@ -23,5 +23,8 @@
 "gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr21.gds",
 "gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr22.gds"
 ],
+"ldpruning.ld_pruning.addldisk": "5",
+"ldpruning.ld_pruning.cpu": "2",
+"ldpruning.ld_pruning.memory": "4",
 "ldpruning.ld_pruning.genome_build": "hg38"
 }

ld-pruning/ld-pruning.wdl

Lines changed: 4 additions & 4 deletions
@@ -18,9 +18,9 @@ task ld_pruning {
 String? out_prefix
 
 # runtime attributes
-Int addldisk = 5
-Int cpu = 2
-Int memory = 4
+Int addldisk = 10
+Int cpu = 4
+Int memory = 8
 Int preempt = 3
 }
 
@@ -201,7 +201,7 @@ task merge_gds {
 # runtime attributes
 Int addldisk = 5
 Int cpu = 2
-Int memory = 4
+Int memory = 8
 Int preempt = 3
 }
 

vcf-to-gds/vcf-to-gds-terra.json

Lines changed: 3 additions & 0 deletions
@@ -25,7 +25,10 @@
 "gs://topmed_workflow_testing/UWGAC_WDL/1KG_phase3_subset_chrX.vcf.gz"
 ],
 "vcftogds.vcf2gds.addldisk": 1,
+"vcftogds.vcf2gds.cpu": 2,
+"vcftogds.vcf2gds.memory": 4,
 "vcftogds.unique_variant_id.addldisk": 1,
+"vcftogds.unique_variant_id.memory": 4,
 "vcftogds.check_gds": true,
 "vcftogds.check_gds.addldisk": 1,
 "vcftogds.check_gds.preempt": 1
