From 59fb35b5cea237cd2868b6ffbe66245bb19f65fb Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 20 Nov 2025 18:49:22 -0500 Subject: [PATCH 01/60] wip --- .../GvsCreateVATfromVDS.wdl | 196 ++++++++++++++++++ scripts/variantstore/wdl/GvsUtils.wdl | 1 + 2 files changed, 197 insertions(+) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 467c08a3953..d56e4a6ddab 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -35,6 +35,12 @@ workflow GvsCreateVATfromVDS { String? gatk_docker String? variants_docker String? variants_nirvana_docker + String? vep_loftee_docker + + String? vep_loftee_data_table_raw + String? vep_loftee_data_table_cooked + + String loftee_references_dir = "gs://gvs-internal/loftee/" } parameter_meta { @@ -84,6 +90,7 @@ workflow GvsCreateVATfromVDS { String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker]) String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker]) String effective_variants_nirvana_docker = select_first([variants_nirvana_docker, GetToolVersions.variants_nirvana_docker]) + String effective_vep_loftee_docker = select_first([vep_loftee_docker, GetToolVersions.vep_loftee_docker]) String effective_hail_version = select_first([hail_version, GetToolVersions.hail_version]) String effective_google_project = select_first([workspace_gcs_project, GetToolVersions.google_project]) @@ -265,6 +272,27 @@ workflow GvsCreateVATfromVDS { variants_docker = effective_variants_docker, } + call GenerateVepAndLofteeAnnotations { + input: + vep_loftee_docker = effective_vep_loftee_docker, + loftee_human_ancestor_fa_gz = loftee_references_dir + "human_ancestor.fa.gz", + loftee_human_ancestor_fa_gz_fai = loftee_references_dir + "human_ancestor.fa.gz.fai", + 
loftee_human_ancestor_fa_gz_gzi = loftee_references_dir + "human_ancestor.fa.gz.gzi", + loftee_gerp_scores = loftee_references_dir + "gerp_conservation_scores.homo_sapiens.GRCh38.bw", + loftee_phylo_csf_database = loftee_references_dir + "loftee.sql", + sites_only_vcf = CopySitesOnlyVcf.output_file_path, + sites_only_vcf_index = CopySitesOnlyVcfIndex.output_file_path, + } + + call BigQueryLoadRawVepAndLofteeAnnotations { + input: + vep_loftee_raw_output = GenerateVepAndLofteeAnnotations.output_file, + project_id = project_id, + dataset_name = dataset_name, + raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), + variants_docker = effective_variants_docker, + } + ## Use Nirvana to annotate the sites-only VCF and include the AC/AN/AF calculations as custom annotations call AnnotateVCF { input: @@ -291,7 +319,15 @@ workflow GvsCreateVATfromVDS { output_path = genes_output_path, variants_docker = effective_variants_docker, } + } + call BigQueryCookVepAndLofteeRawAnnotations { + input: + project_id = project_id, + dataset_name = dataset_name, + raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), + cooked_data_table = select_first([vep_loftee_data_table_cooked, "vep_loftee_data_table_cooked"]), + variants_docker = effective_variants_docker, } call Utils.MergeTsvs { @@ -732,6 +768,166 @@ for line in sys.stdin: } } +task GenerateVepAndLofteeAnnotations { + input { + String vep_loftee_docker + # TODO make a reference disk for this stuff, that + File loftee_human_ancestor_fa_gz + File loftee_human_ancestor_fa_gz_fai + File loftee_human_ancestor_fa_gz_gzi + File loftee_gerp_scores + File loftee_phylo_csf_database + File sites_only_vcf + File sites_only_vcf_index + } + command <<< + # Prepend date, time and pwd to xtrace log entries. 
+ PS4='\D{+%F %T} \w $ ' + set -o errexit -o nounset -o pipefail -o xtrace + + LOFTEE_PATH=/opt/vep/src/loftee-1.0.4_GRCh38 + args=( + + # "Splats" out data into their own columns that otherwise would be nested (semicolon delimted) in the "Extra" column. + --tab + + # Force writing versions on Ensembl transcripts for VAT compatibility. + --transcript_version + + # Emit HGNC symbols and IDs. + --symbol + + # Basic LOFTEE plugin setup + --plugin LoF,loftee_path:$LOFTEE_PATH,gerp_bigwig:~{loftee_gerp_scores},human_ancestor_fa:~{loftee_human_ancestor_fa_gz},conservation_file:~{loftee_phylo_csf_database} + --dir_plugins $LOFTEE_PATH + + # Basic VEP/LOFTEE cache setup + --cache + --dir_cache . + + # For GERP (Genomic Evolutionary Rate Profiling) score output. + --custom file=~{loftee_gerp_scores},short_name=GERP,format=bigwig + + # Input and output files + --input_file ~{sites_only_vcf} + --output_file vep_loftee_raw_output.txt + ) + + vep "${args[@}}" + + >>> + + runtime { + docker: vep_loftee_docker + memory: "15 GB" + disks: "1000 HDD" + } + + output { + File output_file = "vep_loftee_raw_output.txt" + } +} + +task BigQueryLoadRawVepAndLofteeAnnotations { + input { + String variants_docker + File vep_loftee_raw_output + String project_id + String dataset_name + String raw_data_table + } + + command <<< + # Prepend date, time and pwd to xtrace log entries. + PS4='\D{+%F %T} \w $ ' + set -o errexit -o nounset -o pipefail -o xtrace + + # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data + # - Remove lines beginning with '##'. + # - Remove the leading '#' from the one line that should be left with a single leading '#' so the line can + # serve as a TSV header. 
+ sed -E '/^##/d' ~{vep_loftee_raw_output} | sed -E 's/^#//' > vep_loftee_load_file.txt + + # Schema autodetection doesn't seem to work with --autodetect here for reasons unknown 😭 + # Explicitly get the header and sed it into schema form + schema=$(head -1 vep_loftee_load_file.txt| sed "s/\t/:STRING,/g" | sed 's/$/:STRING/') + + bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \ + --null_marker="-" --schema ${schema} ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt + >>> + + runtime { + docker: variants_docker + memory: "7 GB" + disk: "1000 GB" + } + + output { + + } +} + +task BigQueryCookVepAndLofteeRawAnnotations { + input { + String variants_docker + String project_id + String dataset_name + String raw_data_table + String cooked_data_table + } + + command <<< + # Prepend date, time and pwd to xtrace log entries. + PS4='\D{+%F %T} \w $ ' + set -o errexit -o nounset -o pipefail -o xtrace + + bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{cooked_data_table} --replace \ + --project_id=~{project_id} ' + + SELECT + REGEXP_EXTRACT(Uploaded_variation, '^chr([^_]+)') || '-' || REGEXP_EXTRACT(Uploaded_variation, '_(\\d+)') || '-' || REGEXP_EXTRACT(Uploaded_variation, '_([ACGT]+)/') || '-' || REGEXP_EXTRACT(Uploaded_variation, '([ACGT]+)$') AS vid, + Gene, + Feature, + Feature_type, + Consequence, + cDNA_position, + CDS_position, + Protein_position, + Amino_acids, + Codons, + Existing_variation, + IMPACT, + DISTANCE, + STRAND, + SPLIT(FLAGS, ',') AS FLAGS, + SYMBOL, + SYMBOL_SOURCE, + HGNC_ID, + SOURCE, + LoF, + SPLIT(LoF_filter, ',') AS LoF_filter, + SPLIT(LoF_flags, ',') AS LoF_flags, + SPLIT(LoF_info, ',') AS LoF_info, + GERP + FROM + ~{project_id}.~{dataset_name}.~{raw_data_table} + + ' + + >>> + + runtime { + docker: variants_docker + memory: "7 GB" + disk: "1000 GB" + } + + output { + + } +} + + task AnnotateVCF { input { diff --git 
a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index 388535907de..71e99721af4 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -137,6 +137,7 @@ task GetToolVersions { String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623" String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e" + String vep_loftee_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/loftee:2025-11-18-1ea988fc4bbf" String workspace_bucket = read_string(workspace_bucket_output) String workspace_id = read_string(workspace_id_output) From 3f1ecfc977331d59edd63064c08ba4da216acbcd Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 20 Nov 2025 19:18:41 -0500 Subject: [PATCH 02/60] fixes --- .../GvsCreateVATfromVDS.wdl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index d56e4a6ddab..b0391be3f98 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -885,7 +885,10 @@ task BigQueryCookVepAndLofteeRawAnnotations { --project_id=~{project_id} ' SELECT - REGEXP_EXTRACT(Uploaded_variation, '^chr([^_]+)') || '-' || REGEXP_EXTRACT(Uploaded_variation, '_(\\d+)') || '-' || REGEXP_EXTRACT(Uploaded_variation, '_([ACGT]+)/') || '-' || REGEXP_EXTRACT(Uploaded_variation, '([ACGT]+)$') AS vid, + REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") || "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, + Uploaded_variation, + Location, 
+ Allele, Gene, Feature, Feature_type, @@ -899,16 +902,17 @@ task BigQueryCookVepAndLofteeRawAnnotations { IMPACT, DISTANCE, STRAND, - SPLIT(FLAGS, ',') AS FLAGS, + SPLIT(FLAGS, ",") AS FLAGS, SYMBOL, SYMBOL_SOURCE, HGNC_ID, SOURCE, LoF, - SPLIT(LoF_filter, ',') AS LoF_filter, - SPLIT(LoF_flags, ',') AS LoF_flags, - SPLIT(LoF_info, ',') AS LoF_info, + SPLIT(LoF_filter, ",") AS LoF_filter, + SPLIT(LoF_flags, ",") AS LoF_flags, + SPLIT(LoF_info, ",") AS LoF_info, GERP + FROM ~{project_id}.~{dataset_name}.~{raw_data_table} From 11cc37c59569b3bd7793d0471af80ad9e6c7cbda Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Fri, 21 Nov 2025 18:57:11 -0500 Subject: [PATCH 03/60] wire everything up --- .../schema/variant_transcript_schema.json | 42 +++++++++++++ .../schema/vat_schema.json | 42 +++++++++++++ .../GvsCreateVATfromVDS.wdl | 59 ++++++++++++++++--- 3 files changed, 136 insertions(+), 7 deletions(-) diff --git a/scripts/variantstore/scripts/variant_annotation_table/schema/variant_transcript_schema.json b/scripts/variantstore/scripts/variant_annotation_table/schema/variant_transcript_schema.json index 067910322b1..8a420eca440 100644 --- a/scripts/variantstore/scripts/variant_annotation_table/schema/variant_transcript_schema.json +++ b/scripts/variantstore/scripts/variant_annotation_table/schema/variant_transcript_schema.json @@ -431,6 +431,48 @@ "type": "String", "mode": "Nullable" }, + { + "description": "HGNC (HUGO Gene Nomenclature Committee) Symbol", + "name": "hgnc_symbol", + "type": "String", + "mode": "Nullable" + }, + { + "description": "HGNC_ID (HUGO Gene Nomenclature Committee) Identifier", + "name": "hgnc_id", + "type": "Integer", + "mode": "Nullable" + }, + { + "description": "LOFTEE Loss-of-function annotation (HC = High Confidence; LC = Low Confidence)", + "name": "LoF", + "type": "String", + "mode": "Nullable" + }, + { + "description": "LOFTEE Reason for LoF not being HC", + "name": "LoF_filter", + "type": "String", + "mode": "Repeated" + }, + { + 
"description": "LOFTEE Possible warning flags for LoF", + "name": "LoF_flags", + "type": "String", + "mode": "Repeated" + }, + { + "description": "LOFTEE Info used for LoF annotation", + "name": "LoF_info", + "type": "String", + "mode": "Repeated" + }, + { + "description": "GERP (Genomic Evolutionary Rate Profiling) Scores", + "name": "GERP", + "type": "Float", + "mode": "Repeated" + }, { "description": "gnomAD: 'Total' frequency", "name": "gnomad_all_af", diff --git a/scripts/variantstore/scripts/variant_annotation_table/schema/vat_schema.json b/scripts/variantstore/scripts/variant_annotation_table/schema/vat_schema.json index f8d829b6048..241f06b8529 100644 --- a/scripts/variantstore/scripts/variant_annotation_table/schema/vat_schema.json +++ b/scripts/variantstore/scripts/variant_annotation_table/schema/vat_schema.json @@ -431,6 +431,48 @@ "type": "String", "mode": "Nullable" }, + { + "description": "HGNC (HUGO Gene Nomenclature Committee) Symbol", + "name": "hgnc_symbol", + "type": "String", + "mode": "Nullable" + }, + { + "description": "HGNC_ID (HUGO Gene Nomenclature Committee) Identifier", + "name": "hgnc_id", + "type": "Integer", + "mode": "Nullable" + }, + { + "description": "LOFTEE Loss-of-function annotation (HC = High Confidence; LC = Low Confidence)", + "name": "LoF", + "type": "String", + "mode": "Nullable" + }, + { + "description": "LOFTEE Reason for LoF not being HC", + "name": "LoF_filter", + "type": "String", + "mode": "Repeated" + }, + { + "description": "LOFTEE Possible warning flags for LoF", + "name": "LoF_flags", + "type": "String", + "mode": "Repeated" + }, + { + "description": "LOFTEE Info used for LoF annotation", + "name": "LoF_info", + "type": "String", + "mode": "Repeated" + }, + { + "description": "GERP (Genomic Evolutionary Rate Profiling) Scores", + "name": "GERP", + "type": "Float", + "mode": "Repeated" + }, { "description": "gnomAD: 'Total' frequency", "name": "gnomad_all_af", diff --git 
a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index b0391be3f98..c4d9e7e625f 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -323,6 +323,7 @@ workflow GvsCreateVATfromVDS { call BigQueryCookVepAndLofteeRawAnnotations { input: + go = GenerateVepAndLofteeAnnotations.done, project_id = project_id, dataset_name = dataset_name, raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), @@ -350,6 +351,7 @@ workflow GvsCreateVATfromVDS { variant_transcript_schema = MakeSubpopulationFilesAndReadSchemaFiles.variant_transcript_schema_json_file, genes_schema = MakeSubpopulationFilesAndReadSchemaFiles.genes_schema_json_file, mane_table_name = LoadManeDataIntoBigQuery.mane_table, + vep_loftee_cooked_table_name = BigQueryCookVepAndLofteeRawAnnotations.cooked_table_name, project_id = project_id, dataset_name = dataset_name, variant_transcripts_path = variant_transcripts_output_path, @@ -379,6 +381,8 @@ workflow GvsCreateVATfromVDS { vat_table_name = DeduplicateVatInBigQuery.vat_table, output_path = effective_output_path, merge_vcfs_disk_size_override = merge_vcfs_disk_size_override, + # This precondition seems wrong / misleading. This task is actually gated on DeduplicateVatInBigQuery, + # as it should be. 
precondition_met = BigQueryLoadJson.done, cloud_sdk_docker = effective_cloud_sdk_docker, cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, @@ -825,6 +829,7 @@ task GenerateVepAndLofteeAnnotations { output { File output_file = "vep_loftee_raw_output.txt" + Boolean done = true } } @@ -869,6 +874,7 @@ task BigQueryLoadRawVepAndLofteeAnnotations { task BigQueryCookVepAndLofteeRawAnnotations { input { + Array[Boolean] go String variants_docker String project_id String dataset_name @@ -885,6 +891,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { --project_id=~{project_id} ' SELECT + -- Make a VID-compatible string from the data in Uploaded_variation. REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") || "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, Uploaded_variation, Location, @@ -902,16 +909,25 @@ task BigQueryCookVepAndLofteeRawAnnotations { IMPACT, DISTANCE, STRAND, + -- FLAGS can be multi-valued so SPLIT to make this REPEATED. SPLIT(FLAGS, ",") AS FLAGS, - SYMBOL, + SYMBOL as HGNC_SYMBOL, SYMBOL_SOURCE, - HGNC_ID, + -- HGNC IDs are formatted like HGNC:1234; we only want the number part. + CAST(SPLIT(HGNC_ID, ":")[OFFSET(1)] AS INTEGER) AS HGNC_ID, SOURCE, LoF, + -- These three appear to sometimes be multi-valued so SPLIT to make them REPEATEDs. SPLIT(LoF_filter, ",") AS LoF_filter, SPLIT(LoF_flags, ",") AS LoF_flags, SPLIT(LoF_info, ",") AS LoF_info, - GERP + -- Split and cast the GERP string to REPEATED FLOAT64s. 
+ ( + SELECT + ARRAY_AGG(SAFE_CAST(s AS FLOAT64)) + FROM + UNNEST(SPLIT(GERP, ",")) AS s + ) AS GERP FROM ~{project_id}.~{dataset_name}.~{raw_data_table} @@ -923,16 +939,16 @@ task BigQueryCookVepAndLofteeRawAnnotations { runtime { docker: variants_docker memory: "7 GB" - disk: "1000 GB" + disk: "1000 HDD" } output { - + Boolean done = true + String cooked_table_name = cooked_data_table } } - task AnnotateVCF { input { File input_vcf @@ -1249,6 +1265,7 @@ task BigQueryLoadJson { File variant_transcript_schema File genes_schema String mane_table_name + String vep_loftee_cooked_table_name String project_id String dataset_name String variant_transcripts_path @@ -1301,6 +1318,27 @@ task BigQueryLoadJson { bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \ 'UPDATE `~{dataset_name}.~{variant_transcript_table}` vtt SET vtt.mane_plus_clinical_name = mane.name FROM `~{dataset_name}.~{mane_table_name}` mane WHERE vtt.transcript = mane.Ensembl_nuc AND mane.MANE_status = "MANE Plus Clinical" AND vtt.transcript is not null;' + echo "Adding VET + LOFTEE annotation data to the pre-vat table ~{dataset_name}.~{variant_transcript_table}" + bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} ' + + UPDATE `~{dataset_name}.~{variant_transcript_table}` vtt SET + + vtt.hgnc_symbol = vep.hgnc_symbol, + vtt.hgnc_id = vep.hgnc_id, + vtt.LoF = vep.LoF, + vtt.LoF_filter = vep.LoF_filter, + vtt.LoF_flags = vep.LoF_flags, + vtt.LoF_info = vep.LoF_info, + vtt.GERP = vep.GERP + + FROM `~{dataset_name}.~{vep_loftee_cooked_table_name}` vep WHERE + + vtt.transcript = vep.transcript AND + vtt.vid = vep.vid AND + vtt.transcript is not null; + + ' + set +o errexit bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{genes_table} > /dev/null BQ_SHOW_RC=$? 
@@ -1451,7 +1489,14 @@ task BigQueryLoadJson { v.clinvar_rcv_classifications, v.clinvar_rcv_num_stars, v.mane_select_name, - v.mane_plus_clinical_name + v.mane_plus_clinical_name, + v.hgnc_symbol, + v.hgnc_id, + v.LoF, + v.LoF_filter, + v.LoF_flags, + v.LoF_info, + v.GERP FROM `~{dataset_name}.~{variant_transcript_table}` as v left join (SELECT gene_symbol, ANY_VALUE(gene_omim_id) AS gene_omim_id, ANY_VALUE(omim_phenotypes_id) AS omim_phenotypes_id, ANY_VALUE(omim_phenotypes_name) AS omim_phenotypes_name FROM `~{dataset_name}.~{genes_table}` group by gene_symbol) as g From f72283587fe379e457c95ee2c6eb70cf60e2c011 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Fri, 21 Nov 2025 19:02:59 -0500 Subject: [PATCH 04/60] dockstore --- .dockstore.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dockstore.yml b/.dockstore.yml index 263f8e54d89..d60c7a87401 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -343,7 +343,7 @@ workflows: branches: - master - ah_var_store - - vs_1739_cmrg_learnings + - vs_1520_loftee tags: - /.*/ - name: GvsIngestTieout From 9acd0be7b1409e82ea1810e5e1b7c0aff0c74ffb Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Fri, 21 Nov 2025 19:13:31 -0500 Subject: [PATCH 05/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index c4d9e7e625f..6fc93f8150b 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -775,7 +775,7 @@ for line in sys.stdin: task GenerateVepAndLofteeAnnotations { input { String vep_loftee_docker - # TODO make a reference disk for this stuff, that + # TODO make a reference disk for this stuff, some of these references are huge. 
File loftee_human_ancestor_fa_gz File loftee_human_ancestor_fa_gz_fai File loftee_human_ancestor_fa_gz_gzi @@ -817,7 +817,7 @@ task GenerateVepAndLofteeAnnotations { --output_file vep_loftee_raw_output.txt ) - vep "${args[@}}" + vep "${args[@]}" >>> From d183c11165efdb9b4414b66fd97b3f83a4417e4f Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 05:50:13 -0500 Subject: [PATCH 06/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 6fc93f8150b..7693e40ca37 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -824,7 +824,7 @@ task GenerateVepAndLofteeAnnotations { runtime { docker: vep_loftee_docker memory: "15 GB" - disks: "1000 HDD" + disks: "local-disk 1000 HDD" } output { @@ -864,11 +864,11 @@ task BigQueryLoadRawVepAndLofteeAnnotations { runtime { docker: variants_docker memory: "7 GB" - disk: "1000 GB" + disks: "local-disk 1000 HDD" } output { - + Boolean done = true } } @@ -939,7 +939,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { runtime { docker: variants_docker memory: "7 GB" - disk: "1000 HDD" + disks: "local-disk 1000 HDD" } output { @@ -1508,7 +1508,7 @@ task BigQueryLoadJson { memory: "3 GB" preemptible: 3 cpu: "1" - disks: "local-disk 100 HDD" + disks: "local-disk 1000 HDD" } output { From 3fbd1806814df082646933f5005f5f29c853d861 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 08:31:50 -0500 Subject: [PATCH 07/60] vep cache --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl 
b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 7693e40ca37..99bce5b0ff8 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -789,6 +789,9 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace + curl -O https://ftp.ensembl.org/pub/release-115/variation/indexed_vep_cache/homo_sapiens_vep_115_GRCh38.tar.gz + tar xzf homo_sapiens_vep_115_GRCh38.tar.gz + LOFTEE_PATH=/opt/vep/src/loftee-1.0.4_GRCh38 args=( From ab101805806459bcbf669c067b8de55e843ed0aa Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 11:17:23 -0500 Subject: [PATCH 08/60] stage vep reference --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 99bce5b0ff8..6fc215127ec 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -275,6 +275,7 @@ workflow GvsCreateVATfromVDS { call GenerateVepAndLofteeAnnotations { input: vep_loftee_docker = effective_vep_loftee_docker, + vep_cache = loftee_references_dir + "homo_sapiens_vep_115_GRCh38.tar.gz", loftee_human_ancestor_fa_gz = loftee_references_dir + "human_ancestor.fa.gz", loftee_human_ancestor_fa_gz_fai = loftee_references_dir + "human_ancestor.fa.gz.fai", loftee_human_ancestor_fa_gz_gzi = loftee_references_dir + "human_ancestor.fa.gz.gzi", @@ -775,7 +776,8 @@ for line in sys.stdin: task GenerateVepAndLofteeAnnotations { input { String vep_loftee_docker - # TODO make a reference disk for this stuff, some of these references are huge. 
+ # TODO make a reference disk for this stuff, some of these references are quite large. + File vep_cache File loftee_human_ancestor_fa_gz File loftee_human_ancestor_fa_gz_fai File loftee_human_ancestor_fa_gz_gzi @@ -789,13 +791,12 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - curl -O https://ftp.ensembl.org/pub/release-115/variation/indexed_vep_cache/homo_sapiens_vep_115_GRCh38.tar.gz - tar xzf homo_sapiens_vep_115_GRCh38.tar.gz + tar xzf ~{vep_cache} LOFTEE_PATH=/opt/vep/src/loftee-1.0.4_GRCh38 args=( - # "Splats" out data into their own columns that otherwise would be nested (semicolon delimted) in the "Extra" column. + # Breaks out data into their own columns that otherwise would be nested (semicolon delimited) in the "Extra" column. --tab # Force writing versions on Ensembl transcripts for VAT compatibility. @@ -808,7 +809,7 @@ task GenerateVepAndLofteeAnnotations { --plugin LoF,loftee_path:$LOFTEE_PATH,gerp_bigwig:~{loftee_gerp_scores},human_ancestor_fa:~{loftee_human_ancestor_fa_gz},conservation_file:~{loftee_phylo_csf_database} --dir_plugins $LOFTEE_PATH - # Basic VEP/LOFTEE cache setup + # Basic VEP cache setup --cache --dir_cache . From a61a20635517250d28e8fe64164b86ede860f0f2 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 13:54:18 -0500 Subject: [PATCH 09/60] try to get rid of ellipsis --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 6fc215127ec..98781446ae7 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -814,7 +814,7 @@ task GenerateVepAndLofteeAnnotations { --dir_cache . 
# For GERP (Genomic Evolutionary Rate Profiling) score output. - --custom file=~{loftee_gerp_scores},short_name=GERP,format=bigwig + --custom file=~{loftee_gerp_scores},short_name=GERP,format=bigwig,num_records=all # Input and output files --input_file ~{sites_only_vcf} From 3844cc6d6818946d6ecc24118f79ac24f8ff4681 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 13:55:07 -0500 Subject: [PATCH 10/60] try preemptible --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 98781446ae7..deee6e564ae 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -826,6 +826,7 @@ task GenerateVepAndLofteeAnnotations { >>> runtime { + preemptible: 2 docker: vep_loftee_docker memory: "15 GB" disks: "local-disk 1000 HDD" From c56cea63f904bfb98f821b04f11da48623aaef4a Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 16:08:10 -0500 Subject: [PATCH 11/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index deee6e564ae..ae026e9198a 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -1338,9 +1338,10 @@ task BigQueryLoadJson { FROM `~{dataset_name}.~{vep_loftee_cooked_table_name}` vep WHERE - vtt.transcript = vep.transcript AND - vtt.vid = vep.vid AND - vtt.transcript is not null; + vtt.transcript is not null AND + vep.Feature_type is not null AND + vtt.transcript = vep.Feature AND + vtt.vid = 
vep.vid ' From d0cb27b34d72b9af42fa0b0f24d43fe3b0553ad5 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 17:54:02 -0500 Subject: [PATCH 12/60] pull load out of loop to avoid rate limits --- .../GvsCreateVATfromVDS.wdl | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index ae026e9198a..8832034f409 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -285,15 +285,6 @@ workflow GvsCreateVATfromVDS { sites_only_vcf_index = CopySitesOnlyVcfIndex.output_file_path, } - call BigQueryLoadRawVepAndLofteeAnnotations { - input: - vep_loftee_raw_output = GenerateVepAndLofteeAnnotations.output_file, - project_id = project_id, - dataset_name = dataset_name, - raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), - variants_docker = effective_variants_docker, - } - ## Use Nirvana to annotate the sites-only VCF and include the AC/AN/AF calculations as custom annotations call AnnotateVCF { input: @@ -322,6 +313,15 @@ workflow GvsCreateVATfromVDS { } } + call BigQueryLoadRawVepAndLofteeAnnotations { + input: + vep_loftee_raw_output = GenerateVepAndLofteeAnnotations.output_file, + project_id = project_id, + dataset_name = dataset_name, + raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), + variants_docker = effective_variants_docker, + } + call BigQueryCookVepAndLofteeRawAnnotations { input: go = GenerateVepAndLofteeAnnotations.done, @@ -841,7 +841,7 @@ task GenerateVepAndLofteeAnnotations { task BigQueryLoadRawVepAndLofteeAnnotations { input { String variants_docker - File vep_loftee_raw_output + Array[File] vep_loftee_raw_output String project_id String dataset_name String raw_data_table @@ -852,18 
+852,21 @@ task BigQueryLoadRawVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data - # - Remove lines beginning with '##'. - # - Remove the leading '#' from the one line that should be left with a single leading '#' so the line can - # serve as a TSV header. - sed -E '/^##/d' ~{vep_loftee_raw_output} | sed -E 's/^#//' > vep_loftee_load_file.txt - - # Schema autodetection doesn't seem to work with --autodetect here for reasons unknown 😭 - # Explicitly get the header and sed it into schema form - schema=$(head -1 vep_loftee_load_file.txt| sed "s/\t/:STRING,/g" | sed 's/$/:STRING/') - - bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \ - --null_marker="-" --schema ${schema} ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt + for file in ~{vep_loftee_raw_output} + do + # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data + # - Remove lines beginning with '##'. + # - Remove the leading '#' from the one line that should be left with a single leading '#' so the line can + # serve as a TSV header. 
+ sed -E '/^##/d' $file | sed -E 's/^#//' > vep_loftee_load_file.txt + + # Schema autodetection doesn't seem to work with --autodetect here for reasons unknown 😭 + # Explicitly get the header and sed it into schema form + schema=$(head -1 vep_loftee_load_file.txt| sed "s/\t/:STRING,/g" | sed 's/$/:STRING/') + + bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \ + --null_marker="-" --schema ${schema} ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt + done >>> runtime { From b35a1eedb0e1149ac561fe6e19286b39913a8279 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 22 Nov 2025 19:14:15 -0500 Subject: [PATCH 13/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 8832034f409..e6b25a8f923 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -852,7 +852,7 @@ task BigQueryLoadRawVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - for file in ~{vep_loftee_raw_output} + for file in ~{sep=' ' vep_loftee_raw_output} do # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data # - Remove lines beginning with '##'. 
From e52cd1bf4c63358d06e2916626f758a2dd759a88 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 05:57:07 -0500 Subject: [PATCH 14/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index e6b25a8f923..8cc0cb44b3c 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -324,7 +324,7 @@ workflow GvsCreateVATfromVDS { call BigQueryCookVepAndLofteeRawAnnotations { input: - go = GenerateVepAndLofteeAnnotations.done, + go = BigQueryLoadRawVepAndLofteeAnnotations.done, project_id = project_id, dataset_name = dataset_name, raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), From 3e4f2247dc8c2f7c7d2dc3ab083388a19106c0b4 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 06:17:29 -0500 Subject: [PATCH 15/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 8cc0cb44b3c..372cb2621d4 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -882,7 +882,7 @@ task BigQueryLoadRawVepAndLofteeAnnotations { task BigQueryCookVepAndLofteeRawAnnotations { input { - Array[Boolean] go + Boolean go String variants_docker String project_id String dataset_name From e1ba7bbcf68cfde05b3e1f5d93f41900e05a95d3 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 08:20:26 -0500 Subject: [PATCH 16/60] update Docker --- 
scripts/variantstore/wdl/GvsUtils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index 71e99721af4..71c83c81ae7 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -131,7 +131,7 @@ task GetToolVersions { # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but # there are a handful of tasks that require the larger GNU libc-based `slim`. String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:524.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-10-28-alpine-4a74414607d9" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-11-23-alpine-631465808626" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-10-17-gatkbase-0a4709121758" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" From 3db29e310f27f9ccc6f6b82bc8d5d4e833c33a1f Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 11:36:51 -0500 Subject: [PATCH 17/60] big oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 372cb2621d4..a00734d8974 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -281,8 +281,7 @@ workflow GvsCreateVATfromVDS { loftee_human_ancestor_fa_gz_gzi = loftee_references_dir + "human_ancestor.fa.gz.gzi", loftee_gerp_scores = loftee_references_dir + 
"gerp_conservation_scores.homo_sapiens.GRCh38.bw", loftee_phylo_csf_database = loftee_references_dir + "loftee.sql", - sites_only_vcf = CopySitesOnlyVcf.output_file_path, - sites_only_vcf_index = CopySitesOnlyVcfIndex.output_file_path, + input_vcf = StripCustomAnnotationsFromSitesOnlyVCF.output_vcf, } ## Use Nirvana to annotate the sites-only VCF and include the AC/AN/AF calculations as custom annotations @@ -783,8 +782,7 @@ task GenerateVepAndLofteeAnnotations { File loftee_human_ancestor_fa_gz_gzi File loftee_gerp_scores File loftee_phylo_csf_database - File sites_only_vcf - File sites_only_vcf_index + File input_vcf } command <<< # Prepend date, time and pwd to xtrace log entries. @@ -817,7 +815,7 @@ task GenerateVepAndLofteeAnnotations { --custom file=~{loftee_gerp_scores},short_name=GERP,format=bigwig,num_records=all # Input and output files - --input_file ~{sites_only_vcf} + --input_file ~{input_vcf} --output_file vep_loftee_raw_output.txt ) From 6d896dc605a26c6656d1dd49a716bcd422366d4a Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 13:07:43 -0500 Subject: [PATCH 18/60] do not error on empty input --- .../GvsCreateVATfromVDS.wdl | 72 +++++++++++++------ 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index a00734d8974..18ec2c86486 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -784,42 +784,70 @@ task GenerateVepAndLofteeAnnotations { File loftee_phylo_csf_database File input_vcf } + + parameter_meta { + vep_cache: { + localization_optional: true + } + loftee_human_ancestor_fa_gz: { + localization_optional: true + } + loftee_human_ancestor_fa_gz_fai: { + localization_optional: true + } + loftee_human_ancestor_fa_gz_gzi: { + localization_optional: true + } + 
loftee_gerp_scores: { + localization_optional: true + } + loftee_phylo_csf_database: { + localization_optional: true + } + } command <<< # Prepend date, time and pwd to xtrace log entries. PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - tar xzf ~{vep_cache} + if [[ grep -E -v '^#' ]] + then + # only copy these references if there are actually data lines in the VCF to be processed. + gcloud storage cp ~{vep_cache} ~{vep_cache} ~{loftee_human_ancestor_fa_gz} ~{loftee_human_ancestor_fa_gz_fai} ~{loftee_human_ancestor_fa_gz_gzi} ~{loftee_gerp_scores} ~{loftee_phylo_csf_database} . + tar xzf ~{vep_cache} - LOFTEE_PATH=/opt/vep/src/loftee-1.0.4_GRCh38 - args=( + LOFTEE_PATH=/opt/vep/src/loftee-1.0.4_GRCh38 + args=( - # Breaks out data into their own columns that otherwise would be nested (semicolon delimited) in the "Extra" column. - --tab + # Breaks out data into their own columns that otherwise would be nested (semicolon delimited) in the "Extra" column. + --tab - # Force writing versions on Ensembl transcripts for VAT compatibility. - --transcript_version + # Force writing versions on Ensembl transcripts for VAT compatibility. + --transcript_version - # Emit HGNC symbols and IDs. - --symbol + # Emit HGNC symbols and IDs. + --symbol - # Basic LOFTEE plugin setup - --plugin LoF,loftee_path:$LOFTEE_PATH,gerp_bigwig:~{loftee_gerp_scores},human_ancestor_fa:~{loftee_human_ancestor_fa_gz},conservation_file:~{loftee_phylo_csf_database} - --dir_plugins $LOFTEE_PATH + # Basic LOFTEE plugin setup + --plugin LoF,loftee_path:$LOFTEE_PATH,gerp_bigwig:~{loftee_gerp_scores},human_ancestor_fa:~{loftee_human_ancestor_fa_gz},conservation_file:~{loftee_phylo_csf_database} + --dir_plugins $LOFTEE_PATH - # Basic VEP cache setup - --cache - --dir_cache . + # Basic VEP cache setup + --cache + --dir_cache . - # For GERP (Genomic Evolutionary Rate Profiling) score output. 
- --custom file=~{loftee_gerp_scores},short_name=GERP,format=bigwig,num_records=all + # For GERP (Genomic Evolutionary Rate Profiling) score output. + --custom file=~{loftee_gerp_scores},short_name=GERP,format=bigwig,num_records=all - # Input and output files - --input_file ~{input_vcf} - --output_file vep_loftee_raw_output.txt - ) + # Input and output files + --input_file ~{input_vcf} + --output_file vep_loftee_raw_output.txt + ) - vep "${args[@]}" + vep "${args[@]}" + else + echo "No data found for processing in VCF, exit 0." + fi >>> From e12f69f4f9b3157ae5234e89c1b76dde93873b30 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 13:32:02 -0500 Subject: [PATCH 19/60] fix empty file handling --- .../GvsCreateVATfromVDS.wdl | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 18ec2c86486..31a25d22a46 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -847,6 +847,7 @@ task GenerateVepAndLofteeAnnotations { vep "${args[@]}" else echo "No data found for processing in VCF, exit 0." + touch "vep_loftee_raw_output.txt" fi >>> @@ -880,18 +881,23 @@ task BigQueryLoadRawVepAndLofteeAnnotations { for file in ~{sep=' ' vep_loftee_raw_output} do - # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data - # - Remove lines beginning with '##'. - # - Remove the leading '#' from the one line that should be left with a single leading '#' so the line can - # serve as a TSV header. 
- sed -E '/^##/d' $file | sed -E 's/^#//' > vep_loftee_load_file.txt - - # Schema autodetection doesn't seem to work with --autodetect here for reasons unknown 😭 - # Explicitly get the header and sed it into schema form - schema=$(head -1 vep_loftee_load_file.txt| sed "s/\t/:STRING,/g" | sed 's/$/:STRING/') - - bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \ - --null_marker="-" --schema ${schema} ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt + if [ -s $file ] + then + # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data + # - Remove lines beginning with '##'. + # - Remove the leading '#' from the one line that should be left with a single leading '#' so the line can + # serve as a TSV header. + sed -E '/^##/d' $file | sed -E 's/^#//' > vep_loftee_load_file.txt + + # Schema autodetection doesn't seem to work with --autodetect here for reasons unknown 😭 + # Explicitly get the header and sed it into schema form + schema=$(head -1 vep_loftee_load_file.txt| sed "s/\t/:STRING,/g" | sed 's/$/:STRING/') + + bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \ + --null_marker="-" --schema ${schema} ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt + else + echo "File $file is empty, skipping." 
+ fi done >>> From fbab0fb5a508d5b6e1b825e74d636aab7ebdd4d1 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 14:52:25 -0500 Subject: [PATCH 20/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 31a25d22a46..e13dd8210db 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -810,7 +810,7 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - if [[ grep -E -v '^#' ]] + if grep -E -v '^#' 2>&1 > /dev/null then # only copy these references if there are actually data lines in the VCF to be processed. gcloud storage cp ~{vep_cache} ~{vep_cache} ~{loftee_human_ancestor_fa_gz} ~{loftee_human_ancestor_fa_gz_fai} ~{loftee_human_ancestor_fa_gz_gzi} ~{loftee_gerp_scores} ~{loftee_phylo_csf_database} . 
From 8b318f26d14ea3c83d3b141ce6698301409cf5a6 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 17:08:46 -0500 Subject: [PATCH 21/60] maybe --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index e13dd8210db..35645990d6f 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -810,7 +810,7 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - if grep -E -v '^#' 2>&1 > /dev/null + if { grep -E -v '^#' 2>&1 > /dev/null; } then # only copy these references if there are actually data lines in the VCF to be processed. gcloud storage cp ~{vep_cache} ~{vep_cache} ~{loftee_human_ancestor_fa_gz} ~{loftee_human_ancestor_fa_gz_fai} ~{loftee_human_ancestor_fa_gz_gzi} ~{loftee_gerp_scores} ~{loftee_phylo_csf_database} . 
From 54f8333021ddaff2e400e66eb5d50a3ff372fb39 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 23 Nov 2025 19:07:41 -0500 Subject: [PATCH 22/60] geez --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 35645990d6f..d00119fb02e 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -810,7 +810,7 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - if { grep -E -v '^#' 2>&1 > /dev/null; } + if { grep -E -v '^#' ~{input_vcf} 2>&1 > /dev/null; } then # only copy these references if there are actually data lines in the VCF to be processed. gcloud storage cp ~{vep_cache} ~{vep_cache} ~{loftee_human_ancestor_fa_gz} ~{loftee_human_ancestor_fa_gz_fai} ~{loftee_human_ancestor_fa_gz_gzi} ~{loftee_gerp_scores} ~{loftee_phylo_csf_database} . 
From 76a60724f105c072638e8b34f850239442ebcf07 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 24 Nov 2025 06:04:40 -0500 Subject: [PATCH 23/60] more --- .../GvsCreateVATfromVDS.wdl | 30 +++++-------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index d00119fb02e..b529974db99 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -785,26 +785,6 @@ task GenerateVepAndLofteeAnnotations { File input_vcf } - parameter_meta { - vep_cache: { - localization_optional: true - } - loftee_human_ancestor_fa_gz: { - localization_optional: true - } - loftee_human_ancestor_fa_gz_fai: { - localization_optional: true - } - loftee_human_ancestor_fa_gz_gzi: { - localization_optional: true - } - loftee_gerp_scores: { - localization_optional: true - } - loftee_phylo_csf_database: { - localization_optional: true - } - } command <<< # Prepend date, time and pwd to xtrace log entries. PS4='\D{+%F %T} \w $ ' @@ -812,8 +792,14 @@ task GenerateVepAndLofteeAnnotations { if { grep -E -v '^#' ~{input_vcf} 2>&1 > /dev/null; } then - # only copy these references if there are actually data lines in the VCF to be processed. - gcloud storage cp ~{vep_cache} ~{vep_cache} ~{loftee_human_ancestor_fa_gz} ~{loftee_human_ancestor_fa_gz_fai} ~{loftee_human_ancestor_fa_gz_gzi} ~{loftee_gerp_scores} ~{loftee_phylo_csf_database} . + # Only copy these references if there are actually data lines in the VCF to be processed, + # Most of the shards in 20/X/Y integration runs don't have any work to do and don't need + # to localize the references. 
+ # + # gcloud storage cp ~{vep_cache} ~{vep_cache} ~{loftee_human_ancestor_fa_gz} ~{loftee_human_ancestor_fa_gz_fai} ~{loftee_human_ancestor_fa_gz_gzi} ~{loftee_gerp_scores} ~{loftee_phylo_csf_database} . + # + # TODO yeah that would be nice but here's no gcloud on the VEP + LOFTEE image. These references + # *really* should be on a reference disk. tar xzf ~{vep_cache} LOFTEE_PATH=/opt/vep/src/loftee-1.0.4_GRCh38 From 58c06034f2969561b56aa13064a1964c2ecea52c Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 25 Nov 2025 12:25:37 -0500 Subject: [PATCH 24/60] My robot friend catches and fixes my bugs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index b529974db99..8c3aab57b62 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -327,7 +327,7 @@ workflow GvsCreateVATfromVDS { project_id = project_id, dataset_name = dataset_name, raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), - cooked_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_cooked"]), + cooked_data_table = select_first([vep_loftee_data_table_cooked, "vep_loftee_data_table_cooked"]), variants_docker = effective_variants_docker, } @@ -1344,7 +1344,7 @@ task BigQueryLoadJson { bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \ 'UPDATE `~{dataset_name}.~{variant_transcript_table}` vtt SET vtt.mane_plus_clinical_name = mane.name FROM `~{dataset_name}.~{mane_table_name}` mane WHERE vtt.transcript = mane.Ensembl_nuc AND mane.MANE_status = "MANE Plus Clinical" AND vtt.transcript is not 
null;' - echo "Adding VET + LOFTEE annotation data to the pre-vat table ~{dataset_name}.~{variant_transcript_table}" + echo "Adding VEP + LOFTEE annotation data to the pre-vat table ~{dataset_name}.~{variant_transcript_table}" bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} ' UPDATE `~{dataset_name}.~{variant_transcript_table}` vtt SET From 2b884f0e932761a7836fef18132248377c15851d Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 05:01:39 -0500 Subject: [PATCH 25/60] attempt to fix coordinates --- .../GvsCreateVATfromVDS.wdl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 8c3aab57b62..fcf603a3075 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -918,7 +918,17 @@ task BigQueryCookVepAndLofteeRawAnnotations { SELECT -- Make a VID-compatible string from the data in Uploaded_variation. - REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") || "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, + IF((Allele IS NOT NULL AND LENGTH(Allele) = 1), + -- VEP appears to use a different convention for the encoding of indel locations than what is used in GVS: the + -- positions are based on the first *discrepant* base, not the first base mentioned which actually agrees + -- between the reference and allele. Correct for that in the VID-building code below to substract 1 if the + -- variant in question is an indel. 
+ REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || + IF ((Allele IS NOT NULL AND LENGTH(Allele) = 1), + REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"), + CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"),) AS INT64) - 1) AS STRING) || + "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || + REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, Uploaded_variation, Location, Allele, From e1e8a5ea803659ca517fb8aac2a47369d488c2c1 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 05:22:25 -0500 Subject: [PATCH 26/60] fix parens --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index fcf603a3075..20be8ee9f60 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -926,7 +926,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || IF ((Allele IS NOT NULL AND LENGTH(Allele) = 1), REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"), - CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"),) AS INT64) - 1) AS STRING) || + CAST(((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"),) AS INT64) - 1) AS STRING)) || "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, Uploaded_variation, From 491335a5c5659c97b17e2b03fb632480fc02d0b7 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 06:02:44 -0500 Subject: [PATCH 27/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl 
b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 20be8ee9f60..a0f947c5bb4 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -926,7 +926,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || IF ((Allele IS NOT NULL AND LENGTH(Allele) = 1), REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"), - CAST(((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"),) AS INT64) - 1) AS STRING)) || + CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING)) || "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, Uploaded_variation, From d587750c7f8d44168a994a3fb7442ed852c72919 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 06:43:21 -0500 Subject: [PATCH 28/60] whoops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index a0f947c5bb4..db3efe21e1b 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -918,7 +918,6 @@ task BigQueryCookVepAndLofteeRawAnnotations { SELECT -- Make a VID-compatible string from the data in Uploaded_variation. - IF((Allele IS NOT NULL AND LENGTH(Allele) = 1), -- VEP appears to use a different convention for the encoding of indel locations than what is used in GVS: the -- positions are based on the first *discrepant* base, not the first base mentioned which actually agrees -- between the reference and allele. 
Correct for that in the VID-building code below to substract 1 if the From 9009322c90949d8fa249b6d1d0bb2b5acf93cb79 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 10:54:35 -0500 Subject: [PATCH 29/60] fix --- .../GvsCreateVATfromVDS.wdl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index db3efe21e1b..d594f433a8d 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -918,14 +918,17 @@ task BigQueryCookVepAndLofteeRawAnnotations { SELECT -- Make a VID-compatible string from the data in Uploaded_variation. - -- VEP appears to use a different convention for the encoding of indel locations than what is used in GVS: the - -- positions are based on the first *discrepant* base, not the first base mentioned which actually agrees - -- between the reference and allele. Correct for that in the VID-building code below to substract 1 if the - -- variant in question is an indel. + -- VEP appears to use a different convention for the encoding of indel positions than what is used in GVS: + -- VEP indel positions are based on the first *discrepant* base and not the first base mentioned, which in the + -- GVS convention agrees between reference and allele. Correct for that in the VID-building code below to + -- subtract 1 if the variant is an indel. REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || - IF ((Allele IS NOT NULL AND LENGTH(Allele) = 1), - REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)"), - CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING)) || + -- A Location specified with a '-' range is an indel. + IF (Location LIKE '%-%', + -- If this is an indel decrement the position by one for VAT compatibility. 
+ CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING) + -- Else SNPs use position without adjustment. + REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)")) || "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, Uploaded_variation, From 8aa3ba4d0fcbd4ab575bebf487c302bc34c7fb7f Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 11:51:40 -0500 Subject: [PATCH 30/60] argh --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index d594f433a8d..bdac41c673e 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -923,8 +923,8 @@ task BigQueryCookVepAndLofteeRawAnnotations { -- GVS convention agrees between reference and allele. Correct for that in the VID-building code below to -- subtract 1 if the variant is an indel. REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || - -- A Location specified with a '-' range is an indel. - IF (Location LIKE '%-%', + -- A Location specified with a "-" range is an indel. + IF (Location LIKE "%-%", -- If this is an indel decrement the position by one for VAT compatibility. CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING) -- Else SNPs use position without adjustment. 
From 1df94e1f63648e1ae75cf1c8fc189aadf6d321a4 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 13:11:06 -0500 Subject: [PATCH 31/60] arghhhh --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index bdac41c673e..218d0d6f81f 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -926,7 +926,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { -- A Location specified with a "-" range is an indel. IF (Location LIKE "%-%", -- If this is an indel decrement the position by one for VAT compatibility. - CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING) + CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING), -- Else SNPs use position without adjustment. REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)")) || "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || From 41a46acf142130a4fd9ad7b32fd8d5837ba2455e Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 26 Nov 2025 17:10:13 -0500 Subject: [PATCH 32/60] more --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 218d0d6f81f..eaa22c17066 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -923,8 +923,9 @@ task BigQueryCookVepAndLofteeRawAnnotations { -- GVS convention agrees between reference and allele. 
Correct for that in the VID-building code below to -- subtract 1 if the variant is an indel. REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || - -- A Location specified with a "-" range is an indel. - IF (Location LIKE "%-%", + -- A Location specified with a "-" range is an indel. Single-base deletions are a special case with a single + -- position, but like all deletions they have a NULL Allele so look for that as well. + IF ((Location LIKE "%-%") OR (Allele is NULL), -- If this is an indel decrement the position by one for VAT compatibility. CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING), -- Else SNPs use position without adjustment. From 5b61a20ba9fc6ec21428433693129a80398f6a08 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 1 Dec 2025 19:16:39 -0500 Subject: [PATCH 33/60] relax transcript version number matching --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index eaa22c17066..9a969a0cffe 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -1374,8 +1374,10 @@ task BigQueryLoadJson { vtt.transcript is not null AND vep.Feature_type is not null AND - vtt.transcript = vep.Feature AND - vtt.vid = vep.vid + vtt.vid = vep.vid AND + -- Do not consider version numbers when matching on transcripts. In Quickstart about 25% of the transcripts are + -- mismatched on version number, with VEP having newer versions. 
+ SPLIT(vtt.transcript, ".")[OFFSET(0)] = SPLIT(vep.Feature, ".")[OFFSET(0)] ' From aca846a1394f5195af976dae7df147837113ba59 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 4 Dec 2025 11:53:10 -0500 Subject: [PATCH 34/60] dockstore --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index d60c7a87401..c51407c4d95 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -240,6 +240,7 @@ workflows: branches: - master - ah_var_store + - vs_1520_loftee tags: - /.*/ - name: GvsCreateVATFilesFromBigQuery From 917e3b145c6b87634a768a695b2572288b11057a Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 4 Dec 2025 19:34:05 -0500 Subject: [PATCH 35/60] scale testing improvements: TTL on load tables, creating if necessary; explicit schemas; output after VEP run to be able to see how long vep was running. --- .../schema/vep_loftee_115_cooked.json | 137 ++++++++++++++++++ .../schema/vep_loftee_115_raw.json | 132 +++++++++++++++++ .../GvsCreateVATfromVDS.wdl | 41 +++++- 3 files changed, 305 insertions(+), 5 deletions(-) create mode 100644 scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_cooked.json create mode 100644 scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_raw.json diff --git a/scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_cooked.json b/scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_cooked.json new file mode 100644 index 00000000000..a14cefb5c77 --- /dev/null +++ b/scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_cooked.json @@ -0,0 +1,137 @@ +[ + { + "name": "vid", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Uploaded_variation", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Location", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Allele", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Gene", + "type": 
"STRING", + "mode": "NULLABLE" + }, + { + "name": "Feature", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Feature_type", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Consequence", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "cDNA_position", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "CDS_position", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Protein_position", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Amino_acids", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Codons", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Existing_variation", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "IMPACT", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "DISTANCE", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "STRAND", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "FLAGS", + "type": "STRING", + "mode": "REPEATED" + }, + { + "name": "HGNC_SYMBOL", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "SYMBOL_SOURCE", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "HGNC_ID", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "SOURCE", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "LoF", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "LoF_filter", + "type": "STRING", + "mode": "REPEATED" + }, + { + "name": "LoF_flags", + "type": "STRING", + "mode": "REPEATED" + }, + { + "name": "LoF_info", + "type": "STRING", + "mode": "REPEATED" + }, + { + "name": "GERP", + "type": "FLOAT", + "mode": "REPEATED" + } +] diff --git a/scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_raw.json b/scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_raw.json new file mode 100644 index 00000000000..33ca7eda8c3 --- /dev/null +++ b/scripts/variantstore/scripts/variant_annotation_table/schema/vep_loftee_115_raw.json @@ -0,0 
+1,132 @@ +[ + { + "name": "Uploaded_variation", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Location", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Allele", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Gene", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Feature", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Feature_type", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Consequence", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "cDNA_position", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "CDS_position", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Protein_position", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Amino_acids", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Codons", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "Existing_variation", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "IMPACT", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "DISTANCE", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "STRAND", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "FLAGS", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "SYMBOL", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "SYMBOL_SOURCE", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "HGNC_ID", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "SOURCE", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "LoF", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "LoF_filter", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "LoF_flags", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "LoF_info", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "GERP", + "type": "STRING", + "mode": "NULLABLE" + } +] \ No newline at end of file diff --git 
a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 9a969a0cffe..a30ad2449ec 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -318,6 +318,7 @@ workflow GvsCreateVATfromVDS { project_id = project_id, dataset_name = dataset_name, raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), + raw_data_table_schema = MakeSubpopulationFilesAndReadSchemaFiles.vep_loftee_raw_schema_json_file, variants_docker = effective_variants_docker, } @@ -328,6 +329,7 @@ workflow GvsCreateVATfromVDS { dataset_name = dataset_name, raw_data_table = select_first([vep_loftee_data_table_raw, "vep_loftee_data_table_raw"]), cooked_data_table = select_first([vep_loftee_data_table_cooked, "vep_loftee_data_table_cooked"]), + cooked_data_table_schema = MakeSubpopulationFilesAndReadSchemaFiles.vep_loftee_cooked_schema_json_file, variants_docker = effective_variants_docker, } @@ -548,6 +550,8 @@ task MakeSubpopulationFilesAndReadSchemaFiles { String vat_schema_json_filename = "vat_schema.json" String variant_transcript_schema_json_filename = "variant_transcript_schema.json" String genes_schema_json_filename = "genes_schema.json" + String vep_loftee_115_raw_schema_json_filename = "vep_loftee_115_raw.json" + String vep_loftee_115_cooked_schema_json_filename = "vep_loftee_115_cooked.json" String variants_docker } String output_ancestry_filename = "ancestry_mapping.tsv" @@ -579,6 +583,8 @@ task MakeSubpopulationFilesAndReadSchemaFiles { File vat_schema_json_file = vat_schema_json_filename File variant_transcript_schema_json_file = variant_transcript_schema_json_filename File genes_schema_json_file = genes_schema_json_filename + File vep_loftee_raw_schema_json_file = vep_loftee_115_raw_schema_json_filename + File vep_loftee_cooked_schema_json_file = 
vep_loftee_115_cooked_schema_json_filename File ancestry_mapping_list = output_ancestry_filename File custom_annotations_template_file = custom_annotations_template_filename @@ -831,6 +837,7 @@ task GenerateVepAndLofteeAnnotations { ) vep "${args[@]}" + echo "VEP + LOFTEE run complete." else echo "No data found for processing in VCF, exit 0." touch "vep_loftee_raw_output.txt" @@ -858,6 +865,7 @@ task BigQueryLoadRawVepAndLofteeAnnotations { String project_id String dataset_name String raw_data_table + File raw_data_table_schema } command <<< @@ -865,6 +873,19 @@ task BigQueryLoadRawVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace + set +o errexit + bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} > /dev/null + BQ_SHOW_RC=$? + set -o errexit + + if [ $BQ_SHOW_RC -ne 0 ]; then + echo "Creating raw VEP + LOFTEE table ~{dataset_name}.~{raw_data_table}" + + # 24 TTL for this table + DATE=$((24 * 60 * 60)) + bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{raw_data_table_schema} + fi + for file in ~{sep=' ' vep_loftee_raw_output} do if [ -s $file ] @@ -875,12 +896,8 @@ task BigQueryLoadRawVepAndLofteeAnnotations { # serve as a TSV header. sed -E '/^##/d' $file | sed -E 's/^#//' > vep_loftee_load_file.txt - # Schema autodetection doesn't seem to work with --autodetect here for reasons unknown 😭 - # Explicitly get the header and sed it into schema form - schema=$(head -1 vep_loftee_load_file.txt| sed "s/\t/:STRING,/g" | sed 's/$/:STRING/') - bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \ - --null_marker="-" --schema ${schema} ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt + --null_marker="-" ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt else echo "File $file is empty, skipping." 
fi @@ -906,6 +923,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { String dataset_name String raw_data_table String cooked_data_table + File cooked_data_table_schema } command <<< @@ -913,6 +931,19 @@ task BigQueryCookVepAndLofteeRawAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace + set +o errexit + bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} > /dev/null + BQ_SHOW_RC=$? + set -o errexit + + if [ $BQ_SHOW_RC -ne 0 ]; then + echo 'Creating "cooked" VEP + LOFTEE table ~{dataset_name}.~{cooked_data_table}' + + # 24 TTL for this table + DATE=$((24 * 60 * 60)) + bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{cooked_data_table_schema} + fi + bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{cooked_data_table} --replace \ --project_id=~{project_id} ' From b9786c22ad0b8d979ba515e79068b8dea4fe1153 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 4 Dec 2025 19:39:05 -0500 Subject: [PATCH 36/60] update Docker --- scripts/variantstore/wdl/GvsUtils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index 71c83c81ae7..d86f2b92511 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -131,7 +131,7 @@ task GetToolVersions { # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but # there are a handful of tasks that require the larger GNU libc-based `slim`. 
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:524.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-11-23-alpine-631465808626" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-12-04-alpine-11ede1e609a0" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-10-17-gatkbase-0a4709121758" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" From c3bf9d5666d4be71959579cddc3b8a432371fa35 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 4 Dec 2025 19:43:14 -0500 Subject: [PATCH 37/60] monitoring log --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index a30ad2449ec..049ac8e30d7 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -789,6 +789,7 @@ task GenerateVepAndLofteeAnnotations { File loftee_gerp_scores File loftee_phylo_csf_database File input_vcf + File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh" } command <<< @@ -796,6 +797,8 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace + bash ~{monitoring_script} > monitoring.log & + if { grep -E -v '^#' ~{input_vcf} 2>&1 > /dev/null; } then # Only copy these references if there are actually data lines in the VCF to be processed, @@ -854,6 +857,7 @@ task GenerateVepAndLofteeAnnotations { output { File output_file = "vep_loftee_raw_output.txt" + File monitoring_log = "monitoring.log" Boolean done = true } } From 
b53f12a943db3135078638aa36e2ee0d817e22f5 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Fri, 5 Dec 2025 05:38:49 -0500 Subject: [PATCH 38/60] oops --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 049ac8e30d7..a8674318784 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -945,7 +945,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { # 24 TTL for this table DATE=$((24 * 60 * 60)) - bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{cooked_data_table_schema} + bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} ~{cooked_data_table_schema} fi bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{cooked_data_table} --replace \ From 8e08539ed3c6f2a3ff4659bac03546a58b7664ac Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Fri, 5 Dec 2025 07:34:11 -0500 Subject: [PATCH 39/60] adjust size expectations --- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index ed8e8f5e106..fb6cc781cd2 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -38,8 +38,8 @@ workflow GvsQuickstartIntegration { } String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else "" - File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2025-07-21/" + expected_subdir - File truth_data_prefix = 
"gs://gvs-internal-quickstart/integration/test_data/2025-07-21/" + expected_subdir + File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2025-12-05/" + expected_subdir + File truth_data_prefix = "gs://gvs-internal-quickstart/integration/test_data/2025-12-05/" + expected_subdir # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { From a9e55f6641417f4f2fa7f04b5f67392959c88ed0 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Fri, 5 Dec 2025 11:33:23 -0500 Subject: [PATCH 40/60] adjusting memory --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index a8674318784..8a0be8fba20 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -850,8 +850,9 @@ task GenerateVepAndLofteeAnnotations { runtime { preemptible: 2 + maxRetries: 2 docker: vep_loftee_docker - memory: "15 GB" + memory: "8 GB" disks: "local-disk 1000 HDD" } From 4120841e23d484b22e8f320eb837957dc761e42f Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 6 Dec 2025 17:48:13 -0500 Subject: [PATCH 41/60] more retries plz --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 8a0be8fba20..38ff0a189da 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -850,7 +850,7 @@ task GenerateVepAndLofteeAnnotations { runtime { preemptible: 2 - maxRetries: 2 + maxRetries: 3 docker: vep_loftee_docker memory: "8 GB" disks: 
"local-disk 1000 HDD" From 36b3a999593816a98d8cc09899a60470fa248575 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 7 Dec 2025 05:25:23 -0500 Subject: [PATCH 42/60] manually adjust memory --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 38ff0a189da..d482ccea881 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -852,7 +852,7 @@ task GenerateVepAndLofteeAnnotations { preemptible: 2 maxRetries: 3 docker: vep_loftee_docker - memory: "8 GB" + memory: "16 GB" disks: "local-disk 1000 HDD" } From c36a277af3132ded6d6e5309ca6f41662c6f031b Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sun, 7 Dec 2025 06:22:10 -0500 Subject: [PATCH 43/60] comments --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index d482ccea881..5bb188ebc71 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -886,7 +886,7 @@ task BigQueryLoadRawVepAndLofteeAnnotations { if [ $BQ_SHOW_RC -ne 0 ]; then echo "Creating raw VEP + LOFTEE table ~{dataset_name}.~{raw_data_table}" - # 24 TTL for this table + # 24 hour TTL for this table DATE=$((24 * 60 * 60)) bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{raw_data_table_schema} fi @@ -944,7 +944,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { if [ $BQ_SHOW_RC -ne 0 ]; then echo 'Creating "cooked" VEP + LOFTEE table 
~{dataset_name}.~{cooked_data_table}' - # 24 TTL for this table + # 24 hour TTL for this table DATE=$((24 * 60 * 60)) bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} ~{cooked_data_table_schema} fi From c42d390094844ae01af270918e3d0c753fdb1f61 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 8 Dec 2025 18:52:43 -0500 Subject: [PATCH 44/60] adjust TTL, dedup --- .../GvsCreateVATfromVDS.wdl | 129 ++++++++++-------- 1 file changed, 71 insertions(+), 58 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 5bb188ebc71..22fed364363 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -886,8 +886,8 @@ task BigQueryLoadRawVepAndLofteeAnnotations { if [ $BQ_SHOW_RC -ne 0 ]; then echo "Creating raw VEP + LOFTEE table ~{dataset_name}.~{raw_data_table}" - # 24 hour TTL for this table - DATE=$((24 * 60 * 60)) + # 3 day TTL for this table + DATE=$((3 * 24 * 60 * 60)) bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{raw_data_table_schema} fi @@ -944,68 +944,81 @@ task BigQueryCookVepAndLofteeRawAnnotations { if [ $BQ_SHOW_RC -ne 0 ]; then echo 'Creating "cooked" VEP + LOFTEE table ~{dataset_name}.~{cooked_data_table}' - # 24 hour TTL for this table - DATE=$((24 * 60 * 60)) + # 3 day TTL for this table + DATE=$((3 * 24 * 60 * 60)) bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} ~{cooked_data_table_schema} fi bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{cooked_data_table} --replace \ --project_id=~{project_id} ' - SELECT - -- Make a VID-compatible string from the data in Uploaded_variation. 
- -- VEP appears to use a different convention for the encoding of indel positions than what is used in GVS: - -- VEP indel positions are based on the first *discrepant* base and not the first base mentioned, which in the - -- GVS convention agrees between reference and allele. Correct for that in the VID-building code below to - -- subtract 1 if the variant is an indel. - REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || - -- A Location specified with a "-" range is an indel. Single-base deletions are a special case with a single - -- position, but like all deletions they have a NULL Allele so look for that as well. - IF ((Location LIKE "%-%") OR (Allele is NULL), - -- If this is an indel decrement the position by one for VAT compatibility. - CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING), - -- Else SNPs use position without adjustment. - REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)")) || - "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || - REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, - Uploaded_variation, - Location, - Allele, - Gene, - Feature, - Feature_type, - Consequence, - cDNA_position, - CDS_position, - Protein_position, - Amino_acids, - Codons, - Existing_variation, - IMPACT, - DISTANCE, - STRAND, - -- FLAGS can be multi-valued so SPLIT to make this REPEATED. - SPLIT(FLAGS, ",") AS FLAGS, - SYMBOL as HGNC_SYMBOL, - SYMBOL_SOURCE, - -- HGNC IDs are formatted like HGNC:1234; we only want the number part. - CAST(SPLIT(HGNC_ID, ":")[OFFSET(1)] AS INTEGER) AS HGNC_ID, - SOURCE, - LoF, - -- These three appear to sometimes be multi-valued so SPLIT to make them REPEATEDs. - SPLIT(LoF_filter, ",") AS LoF_filter, - SPLIT(LoF_flags, ",") AS LoF_flags, - SPLIT(LoF_info, ",") AS LoF_info, - -- Split and cast the GERP string to REPEATED FLOAT64s. 
- ( - SELECT - ARRAY_AGG(SAFE_CAST(s AS FLOAT64)) - FROM - UNNEST(SPLIT(GERP, ",")) AS s - ) AS GERP - - FROM - ~{project_id}.~{dataset_name}.~{raw_data_table} + SELECT * EXCEPT(row_number) FROM ( + SELECT + -- Make a VID-compatible string from the data in Uploaded_variation. + -- VEP appears to use a different convention for the encoding of indel positions than what is used in GVS: + -- VEP indel positions are based on the first *discrepant* base and not the first base mentioned, which in the + -- GVS convention agrees between reference and allele. Correct for that in the VID-building code below to + -- subtract 1 if the variant is an indel. + REGEXP_EXTRACT(Uploaded_variation, "^chr([^_]+)") || "-" || + -- A Location specified with a "-" range is an indel. Single-base deletions are a special case with a single + -- position, but like all deletions they have a NULL Allele so look for that as well. + IF ((Location LIKE "%-%") OR (Allele is NULL), + -- If this is an indel decrement the position by one for VAT compatibility. + CAST((CAST(REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)") AS INT64) - 1) AS STRING), + -- Else SNPs use position without adjustment. + REGEXP_EXTRACT(Uploaded_variation, "_(\\d+)")) || + "-" || REGEXP_EXTRACT(Uploaded_variation, "_([ACGT]+)/") || "-" || + REGEXP_EXTRACT(Uploaded_variation, "([ACGT]+)$") AS vid, + Uploaded_variation, + Location, + Allele, + Gene, + Feature, + Feature_type, + Consequence, + cDNA_position, + CDS_position, + Protein_position, + Amino_acids, + Codons, + Existing_variation, + IMPACT, + DISTANCE, + STRAND, + -- FLAGS can be multi-valued so SPLIT to make this REPEATED. + SPLIT(FLAGS, ",") AS FLAGS, + SYMBOL as HGNC_SYMBOL, + SYMBOL_SOURCE, + -- HGNC IDs are formatted like HGNC:1234; we only want the number part. + CAST(SPLIT(HGNC_ID, ":")[OFFSET(1)] AS INTEGER) AS HGNC_ID, + SOURCE, + LoF, + -- These three appear to sometimes be multi-valued so SPLIT to make them REPEATEDs. 
+ SPLIT(LoF_filter, ",") AS LoF_filter, + SPLIT(LoF_flags, ",") AS LoF_flags, + SPLIT(LoF_info, ",") AS LoF_info, + -- Split and cast the GERP string to REPEATED FLOAT64s. + ( + SELECT + ARRAY_AGG(SAFE_CAST(s AS FLOAT64)) + FROM + UNNEST(SPLIT(GERP, ",")) AS s + ) AS GERP, + + -- Use the ROW_NUMBER() magic to squash duplicates. A small number of deletions span interval boundaries + -- and are assigned to two different VEP processing shards. This duplicate data would cause problems when + -- we try to join it back by vid and transcript. + ROW_NUMBER() + -- The expression below uses Uploaded_variation rather than vid because BigQuery claims to not be able to + -- find the vid identifier. Uploaded_variation contains equivalent information to vid in a different format. + OVER (PARTITION BY Uploaded_variation, Feature) + ROW_NUMBER + + FROM + ~{project_id}.~{dataset_name}.~{raw_data_table} + ) + + WHERE ROW_NUMBER = 1 ' From 67d0dda7563d3a4a07532604612b05994083c69e Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 8 Dec 2025 18:53:29 -0500 Subject: [PATCH 45/60] data loading rework --- .../GvsCreateVATfromVDS.wdl | 54 +++++++++++++------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 22fed364363..a3d3b8ec3a8 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -873,6 +873,12 @@ task BigQueryLoadRawVepAndLofteeAnnotations { File raw_data_table_schema } + parameter_meta { + vep_loftee_raw_output: { + localization_optional: true + } + } + command <<< # Prepend date, time and pwd to xtrace log entries.
PS4='\D{+%F %T} \w $ ' @@ -891,26 +897,35 @@ task BigQueryLoadRawVepAndLofteeAnnotations { bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{raw_data_table_schema} fi - for file in ~{sep=' ' vep_loftee_raw_output} - do - if [ -s $file ] - then - # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data - # - Remove lines beginning with '##'. - # - Remove the leading '#' from the one line that should be left with a single leading '#' so the line can - # serve as a TSV header. - sed -E '/^##/d' $file | sed -E 's/^#//' > vep_loftee_load_file.txt - - bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \ - --null_marker="-" ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt - else - echo "File $file is empty, skipping." - fi - done + num_rows=$(bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} --format json | jq -r .numRows) + if ((num_rows != 0)) + then + echo "Found preexisting table with data, not adding more raw data." + else + echo "Raw data table is empty, copying VEP output to be loaded." + for file in ~{sep=' ' vep_loftee_raw_output} + do + gcloud storage cp $file shard_input.txt # every shard has the same basename, so copy one at a time to a scratch file + if [ ! -s load_file.txt ] # the header comes from the first non-empty shard; empty shards must not claim it + then + # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data + # - Remove lines beginning with '##'. + # - Remove the leading '#' from the one line that should be left with a single leading '#' so + # the line can serve as a TSV header.
+ sed -E '/^##/d' shard_input.txt | sed -E 's/^#//' > load_file.txt + else + sed -E '/^#/d' shard_input.txt >> load_file.txt + fi + done + + bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' \ + --skip_leading_rows=1 --null_marker="-" ~{dataset_name}.~{raw_data_table} load_file.txt + fi >>> runtime { docker: variants_docker + preemptible: 2 memory: "7 GB" disks: "local-disk 1000 HDD" } @@ -949,6 +964,12 @@ task BigQueryCookVepAndLofteeRawAnnotations { bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} ~{cooked_data_table_schema} fi + num_rows=$(bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} --format json | jq -r .numRows) + if ((num_rows != 0)) + then + echo "Found preexisting table with data, not adding more cooked data." + else + bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{cooked_data_table} --replace \ --project_id=~{project_id} ' @@ -1021,6 +1042,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { WHERE ROW_NUMBER = 1 ' + fi >>> From 870487aee6078851060c162d74b31d17ba3d61ab Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 8 Dec 2025 21:00:24 -0500 Subject: [PATCH 46/60] fix arg order --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index a3d3b8ec3a8..5205dc04ea5 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -897,7 +897,7 @@ task BigQueryLoadRawVepAndLofteeAnnotations { bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{raw_data_table_schema} fi - num_rows=$(bq --apilog=false show 
--project_id=~{project_id}
~{dataset_name}.~{raw_data_table} --format json | jq -r .numRows) + num_rows=$(bq --apilog=false show --project_id=~{project_id} --format json ~{dataset_name}.~{raw_data_table} | jq -r .numRows) if ((num_rows != 0)) then echo "Found preexisting table with data, not adding more raw data." @@ -964,7 +964,7 @@ task BigQueryCookVepAndLofteeRawAnnotations { bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} ~{cooked_data_table_schema} fi - num_rows=$(bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} --format json | jq -r .numRows) + num_rows=$(bq --apilog=false show --project_id=~{project_id} --format json ~{dataset_name}.~{cooked_data_table} | jq -r .numRows) if ((num_rows != 0)) then echo "Found preexisting table with data, not adding more cooked data." From d8d060a73697864a3b8e6aa37e496bc075e7fddd Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 9 Dec 2025 05:34:04 -0500 Subject: [PATCH 47/60] adjust mem and disk back down --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 5205dc04ea5..790e6c117ca 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -852,8 +852,8 @@ task GenerateVepAndLofteeAnnotations { preemptible: 2 maxRetries: 3 docker: vep_loftee_docker - memory: "16 GB" - disks: "local-disk 1000 HDD" + memory: "8 GB" + disks: "local-disk 500 HDD" } output { From e3f010fa3563a996e585f56eb84f7eee76bdeac7 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 8 Dec 2025 13:53:48 -0500 Subject: [PATCH 48/60] Silence CodeQL JavaScript / TypeScript errors [VS-1777] (#9304) * CodeQL was enabled on all public Broad repositories. 
Since this repository contains no JavaScript / TypeScript, CodeQL always fails. Because of this we have to add a workaround here so it does not fail and our checks all come up nice and green. --- src/main/resources/placate_codeql.js | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/main/resources/placate_codeql.js diff --git a/src/main/resources/placate_codeql.js b/src/main/resources/placate_codeql.js new file mode 100644 index 00000000000..af400fa033e --- /dev/null +++ b/src/main/resources/placate_codeql.js @@ -0,0 +1,9 @@ +` + +This file exists solely to prevent CodeQL from failing its "Analyze (javascript-typescript)" action with: + +CodeQL detected code written in Java/Kotlin, Python and GitHub Actions, but not any written in JavaScript/TypeScript. Confirm that there is some source code for JavaScript/TypeScript in the project. + +Because apart from the contents of this file, there is currently no JavaScript or TypeScript in the GATK repo. + +`; From 767fd8b1d5c506dbf5c333db76ee14f37a1c4055 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 9 Dec 2025 05:46:47 -0500 Subject: [PATCH 49/60] use ulimit to try to get retry with more memory working --- .../GvsCreateVATfromVDS.wdl | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 790e6c117ca..35d99646ce3 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -790,6 +790,9 @@ task GenerateVepAndLofteeAnnotations { File loftee_phylo_csf_database File input_vcf File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh" + Float memory_mib = 8 * 1024 + # The memory headroom left for other processes including the Batch agent. 
+ Float overhead_memory_mib = 1.6 * 1024 } command <<< @@ -797,6 +800,24 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace + echo "MEM_SIZE is ${MEM_SIZE}" + echo "MEM_UNIT is ${MEM_UNIT}" + + if [[ -z "${MEM_UNIT:-}" ]] + then + vep_memory_kib=$(python3 -c "from math import floor; print(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024))") + elif [[ ${MEM_UNIT} == "GB" ]] + then + vep_memory_kib=$(python3 -c "from math import floor; print(floor((${MEM_SIZE} - ~{overhead_memory_mib}) * 1024))") + else + echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2 + exit 1 + fi + + echo "memory_mib is ~{memory_mib}" + echo "overhead_memory_mib is ~{overhead_memory_mib}" + echo "vep_memory_kib is ${vep_memory_kib}" + bash ~{monitoring_script} > monitoring.log & if { grep -E -v '^#' ~{input_vcf} 2>&1 > /dev/null; } @@ -839,8 +860,31 @@ task GenerateVepAndLofteeAnnotations { --output_file vep_loftee_raw_output.txt ) + # Limit the amount of memory the VEP Python process uses, expressed in KiB. + # If we don't do this it seems that the Batch agent is often (though not always) starved for memory and + # unable to check in with the Batch service. If this happens the job fails for reasons that appear to + # Cromwell to be unretryable, and thus the whole workflow fails. e.g. + # + # Task GvsCreateVATfromVDS.GenerateVepAndLofteeAnnotations:150:4 failed. The job was stopped before the command finished. GCP Batch task exited with VMReportingTimeout(50002). + # + ulimit -m $vep_memory_kib + set +o errexit vep "${args[@]}" - echo "VEP + LOFTEE run complete." + set -o errexit + + VEP_RC=$? + if (( VEP_RC == 137 )) + then + # Cromwell does not currently consider the value in the rc file when determining retryability, though + # there are PRs open that would enable this. 
+ # https://github.com/broadinstitute/cromwell/pull/7786/files + echo "VEP + LOFTEE appears to have OOMed with exit code 137, writing messages to stderr to hopefully trigger Cromwell to retry with more memory." + echo "Killed" >& 2 + echo "java.lang.OutOfMemoryError" >& 2 + exit 1 + else + echo "VEP + LOFTEE run complete." + fi else echo "No data found for processing in VCF, exit 0." touch "vep_loftee_raw_output.txt" From 4e47b7d92080db2e244c2001cc65671acc511e8a Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 9 Dec 2025 06:36:21 -0500 Subject: [PATCH 50/60] a dash of logging --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 35d99646ce3..77298397540 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -964,6 +964,8 @@ task BigQueryLoadRawVepAndLofteeAnnotations { bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' \ --skip_leading_rows=1 --null_marker="-" ~{dataset_name}.~{raw_data_table} load_file.txt + + echo "VEP + LOFTEE raw data loading complete." 
fi >>> From 846b3f49c421d68b9b7986f22f1fe8e5ed9ee6ac Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 9 Dec 2025 09:19:54 -0500 Subject: [PATCH 51/60] no Python 3 on that image, only Python 2 but hey it works --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 77298397540..4f149b09631 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -805,10 +805,10 @@ task GenerateVepAndLofteeAnnotations { if [[ -z "${MEM_UNIT:-}" ]] then - vep_memory_kib=$(python3 -c "from math import floor; print(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024))") + vep_memory_kib=$(python -c "from math import floor; print(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024))") elif [[ ${MEM_UNIT} == "GB" ]] then - vep_memory_kib=$(python3 -c "from math import floor; print(floor((${MEM_SIZE} - ~{overhead_memory_mib}) * 1024))") + vep_memory_kib=$(python -c "from math import floor; print(floor((${MEM_SIZE} - ~{overhead_memory_mib}) * 1024))") else echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2 exit 1 From 6625463d1cee18f9a720dd656b64cfa3079fb142 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 9 Dec 2025 11:50:32 -0500 Subject: [PATCH 52/60] fix --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 4f149b09631..ed80060b5fe 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -805,10 +805,10 @@ task 
GenerateVepAndLofteeAnnotations { if [[ -z "${MEM_UNIT:-}" ]] then - vep_memory_kib=$(python -c "from math import floor; print(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024))") + vep_memory_kib=$(python -c "from math import floor; print(int(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024)))") elif [[ ${MEM_UNIT} == "GB" ]] then - vep_memory_kib=$(python -c "from math import floor; print(floor((${MEM_SIZE} - ~{overhead_memory_mib}) * 1024))") + vep_memory_kib=$(python -c "from math import floor; print(int(floor(((${MEM_SIZE} * 1024) - ~{overhead_memory_mib}) * 1024)))") else echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2 exit 1 From e5c0bd8055d91924c0a3ec14777044041b509195 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 9 Dec 2025 16:44:03 -0500 Subject: [PATCH 53/60] change ulimit arg as limiting -m seems to force super slow paging --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index ed80060b5fe..44e2f38d5c1 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -867,7 +867,7 @@ task GenerateVepAndLofteeAnnotations { # # Task GvsCreateVATfromVDS.GenerateVepAndLofteeAnnotations:150:4 failed. The job was stopped before the command finished. GCP Batch task exited with VMReportingTimeout(50002). 
# - ulimit -m $vep_memory_kib + ulimit -v $vep_memory_kib set +o errexit vep "${args[@]}" set -o errexit From 0c33013cb0f8a8a227edd3dadd9027f12486961d Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 9 Dec 2025 19:15:46 -0500 Subject: [PATCH 54/60] revert ulimit stuff, vep just seems to hang immediately --- .../GvsCreateVATfromVDS.wdl | 48 +------------------ 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 44e2f38d5c1..790e6c117ca 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -790,9 +790,6 @@ task GenerateVepAndLofteeAnnotations { File loftee_phylo_csf_database File input_vcf File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh" - Float memory_mib = 8 * 1024 - # The memory headroom left for other processes including the Batch agent. 
- Float overhead_memory_mib = 1.6 * 1024 } command <<< @@ -800,24 +797,6 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - echo "MEM_SIZE is ${MEM_SIZE}" - echo "MEM_UNIT is ${MEM_UNIT}" - - if [[ -z "${MEM_UNIT:-}" ]] - then - vep_memory_kib=$(python -c "from math import floor; print(int(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024)))") - elif [[ ${MEM_UNIT} == "GB" ]] - then - vep_memory_kib=$(python -c "from math import floor; print(int(floor(((${MEM_SIZE} * 1024) - ~{overhead_memory_mib}) * 1024)))") - else - echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2 - exit 1 - fi - - echo "memory_mib is ~{memory_mib}" - echo "overhead_memory_mib is ~{overhead_memory_mib}" - echo "vep_memory_kib is ${vep_memory_kib}" - bash ~{monitoring_script} > monitoring.log & if { grep -E -v '^#' ~{input_vcf} 2>&1 > /dev/null; } @@ -860,31 +839,8 @@ task GenerateVepAndLofteeAnnotations { --output_file vep_loftee_raw_output.txt ) - # Limit the amount of memory the VEP Python process uses, expressed in KiB. - # If we don't do this it seems that the Batch agent is often (though not always) starved for memory and - # unable to check in with the Batch service. If this happens the job fails for reasons that appear to - # Cromwell to be unretryable, and thus the whole workflow fails. e.g. - # - # Task GvsCreateVATfromVDS.GenerateVepAndLofteeAnnotations:150:4 failed. The job was stopped before the command finished. GCP Batch task exited with VMReportingTimeout(50002). - # - ulimit -v $vep_memory_kib - set +o errexit vep "${args[@]}" - set -o errexit - - VEP_RC=$? - if (( VEP_RC == 137 )) - then - # Cromwell does not currently consider the value in the rc file when determining retryability, though - # there are PRs open that would enable this. 
- # https://github.com/broadinstitute/cromwell/pull/7786/files - echo "VEP + LOFTEE appears to have OOMed with exit code 137, writing messages to stderr to hopefully trigger Cromwell to retry with more memory." - echo "Killed" >& 2 - echo "java.lang.OutOfMemoryError" >& 2 - exit 1 - else - echo "VEP + LOFTEE run complete." - fi + echo "VEP + LOFTEE run complete." else echo "No data found for processing in VCF, exit 0." touch "vep_loftee_raw_output.txt" @@ -964,8 +920,6 @@ task BigQueryLoadRawVepAndLofteeAnnotations { bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' \ --skip_leading_rows=1 --null_marker="-" ~{dataset_name}.~{raw_data_table} load_file.txt - - echo "VEP + LOFTEE raw data loading complete." fi >>> From 91525ed7913b50a67017c6680fd2565420b2bee6 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 10 Dec 2025 08:16:00 -0500 Subject: [PATCH 55/60] force offline --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 790e6c117ca..80c03a547ed 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -829,6 +829,7 @@ task GenerateVepAndLofteeAnnotations { # Basic VEP cache setup --cache + --offline --dir_cache . # For GERP (Genomic Evolutionary Rate Profiling) score output. 
From 9c25376d76f69a41073ef2e14d8330f611aaf054 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 10 Dec 2025 08:16:32 -0500 Subject: [PATCH 56/60] noAddress --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 80c03a547ed..b6039b71960 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -852,6 +852,7 @@ task GenerateVepAndLofteeAnnotations { runtime { preemptible: 2 maxRetries: 3 + noAddress: true docker: vep_loftee_docker memory: "8 GB" disks: "local-disk 500 HDD" From e8366f81fc08570a814167c8f1e70961c292c631 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 10 Dec 2025 17:31:23 -0500 Subject: [PATCH 57/60] copilot suggestions --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index b6039b71960..77482d88ffc 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -814,6 +814,13 @@ task GenerateVepAndLofteeAnnotations { LOFTEE_PATH=/opt/vep/src/loftee-1.0.4_GRCh38 args=( + # Some logging please. + --verbose + --warning_file warnings.txt + + # Explicitly turn off forking as LOFTEE might not deal well with that. + --fork 1 + # Breaks out data into their own columns that otherwise would be nested (semicolon delimited) in the "Extra" column. 
--tab @@ -824,7 +831,7 @@ task GenerateVepAndLofteeAnnotations { --symbol # Basic LOFTEE plugin setup - --plugin LoF,loftee_path:$LOFTEE_PATH,gerp_bigwig:~{loftee_gerp_scores},human_ancestor_fa:~{loftee_human_ancestor_fa_gz},conservation_file:~{loftee_phylo_csf_database} + --plugin LoF,loftee_path:$LOFTEE_PATH,gerp_bigwig:~{loftee_gerp_scores},human_ancestor_fa:~{loftee_human_ancestor_fa_gz},conservation_file:~{loftee_phylo_csf_database},check_complete_cds:false --dir_plugins $LOFTEE_PATH # Basic VEP cache setup @@ -861,6 +868,7 @@ task GenerateVepAndLofteeAnnotations { output { File output_file = "vep_loftee_raw_output.txt" File monitoring_log = "monitoring.log" + File warnings = "warnings.txt" Boolean done = true } } From 33358a58ed5edf4751e4c930b78727a4f80aa39b Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 10 Dec 2025 18:53:57 -0500 Subject: [PATCH 58/60] VEP + LOFTEE Dockerfile and docs --- .../vep_loftee/Dockerfile | 261 ++++++++++++++++++ .../vep_loftee/README.md | 30 ++ scripts/variantstore/wdl/GvsUtils.wdl | 2 +- 3 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 scripts/variantstore/variant-annotations-table/vep_loftee/Dockerfile create mode 100644 scripts/variantstore/variant-annotations-table/vep_loftee/README.md diff --git a/scripts/variantstore/variant-annotations-table/vep_loftee/Dockerfile b/scripts/variantstore/variant-annotations-table/vep_loftee/Dockerfile new file mode 100644 index 00000000000..7e938c73147 --- /dev/null +++ b/scripts/variantstore/variant-annotations-table/vep_loftee/Dockerfile @@ -0,0 +1,261 @@ +ARG BRANCH=release/115 + +################################################### +# Stage 1 - docker container to build ensembl-vep # +################################################### +FROM ubuntu:22.04 AS builder + +# Update aptitude and install some required packages +# a lot of them are required for Bio::DB::BigFile +RUN apt-get update && apt-get -y install \ + build-essential \ + git \ + libpng-dev \ + 
zlib1g-dev \ + libbz2-dev \ + liblzma-dev \ + perl \ + perl-base \ + unzip \ + wget \ + curl \ + libncurses5-dev \ + libncursesw5-dev \ + libcurl4-openssl-dev && \ + rm -rf /var/lib/apt/lists/* + +# Setup VEP environment +ENV OPT=/opt/vep +ENV OPT_SRC=$OPT/src +ENV HTSLIB_DIR=$OPT_SRC/htslib +ENV SAMTOOLS_DIR=$OPT_SRC/samtools +ENV HTS_VERSION=1.9 +ARG BRANCH + +# samtools +WORKDIR /tmp +RUN wget -q https://github.com/samtools/samtools/releases/download/$HTS_VERSION/samtools-$HTS_VERSION.tar.bz2 -O samtools-$HTS_VERSION.tar.bz2 && \ + tar -xjf samtools-$HTS_VERSION.tar.bz2 +WORKDIR /tmp/samtools-$HTS_VERSION +RUN ./configure --prefix=$SAMTOOLS_DIR && make && make install && rm -r Makefile *.c + +# Working directory +WORKDIR $OPT_SRC + +# loftee +ENV LOFTEE_ZIP=v1.0.4_GRCh38.zip +RUN wget -q https://github.com/konradjk/loftee/archive/refs/tags/${LOFTEE_ZIP} && \ + unzip ${LOFTEE_ZIP} && \ + rm ${LOFTEE_ZIP} + +# Add ensembl-vep files from current context +ADD . ensembl-vep + +# For release branches, raise an error if VEP version does not match the branch name +RUN if expr "$BRANCH" : "^release/.*" > /dev/null ; \ + then \ + branch_version=$(echo $BRANCH | sed -E 's|release/([0-9]+).*|\1|g'); \ + vep_version=$(grep VEP_VERSION */modules/Bio/EnsEMBL/VEP/Constants.pm | grep -Eo '[0-9]+'); \ + if [ $branch_version -ne $vep_version ]; then \ + echo "ERROR: VEP version $vep_version does not match version in branch name '$BRANCH'"; exit 1; \ + fi; \ + fi + +# Clone/download repositories/libraries +RUN if [ "$BRANCH" = "main" ]; \ + then export BRANCH_OPT=""; \ + else export BRANCH_OPT="-b $BRANCH"; \ + fi && \ + # Get ensembl cpanfile in order to get the list of the required Perl libraries + wget -q "https://raw.githubusercontent.com/Ensembl/ensembl/$BRANCH/cpanfile" -O "ensembl_cpanfile" && \ + # Clone ensembl-variation git repository and compile C code + git clone $BRANCH_OPT --depth 1 https://github.com/Ensembl/ensembl-variation.git && \ + mkdir var_c_code && \ + cp 
ensembl-variation/C_code/*.c ensembl-variation/C_code/Makefile var_c_code/ && \ + rm -rf ensembl-variation && \ + chmod u+x var_c_code/* && \ + # Clone bioperl-ext git repository - used by Haplosaurus + git clone --depth 1 https://github.com/bioperl/bioperl-ext.git && \ + # Download ensembl-xs - it contains compiled versions of certain key subroutines used in VEP + wget https://github.com/Ensembl/ensembl-xs/archive/2.3.2.zip -O ensembl-xs.zip && \ + unzip -q ensembl-xs.zip && mv ensembl-xs-2.3.2 ensembl-xs && rm -rf ensembl-xs.zip && \ + # Clone/Download other repositories: bioperl-live is needed so the cpanm dependencies installation from the ensembl-vep/cpanfile file takes less disk space + ensembl-vep/travisci/get_dependencies.sh && \ + # Only keep the bioperl-live "Bio" library + mv bioperl-live bioperl-live_bak && mkdir bioperl-live && mv bioperl-live_bak/Bio bioperl-live/ && rm -rf bioperl-live_bak && \ + ## A lot of cleanup on the imported libraries, in order to reduce the docker image ## + rm -rf Bio-HTS/.??* Bio-HTS/Changes Bio-HTS/DISCLAIMER Bio-HTS/MANIFEST* Bio-HTS/README Bio-HTS/scripts Bio-HTS/t Bio-HTS/travisci \ + bioperl-ext/.??* bioperl-ext/Bio/SeqIO bioperl-ext/Bio/Tools bioperl-ext/Makefile.PL bioperl-ext/README* bioperl-ext/t bioperl-ext/examples \ + ensembl-xs/.??* ensembl-xs/TODO ensembl-xs/Changes ensembl-xs/INSTALL ensembl-xs/MANIFEST ensembl-xs/README ensembl-xs/t ensembl-xs/travisci \ + htslib/.??* htslib/INSTALL htslib/NEWS htslib/README* htslib/test && \ + # Only keep needed kent-335_base libraries for VEP - used by Bio::DB::BigFile (bigWig parsing) + mv kent-335_base kent-335_base_bak && mkdir -p kent-335_base/src && \ + cp -R kent-335_base_bak/src/lib kent-335_base_bak/src/inc kent-335_base_bak/src/jkOwnLib kent-335_base/src/ && \ + cp kent-335_base_bak/src/*.sh kent-335_base/src/ && \ + rm -rf kent-335_base_bak + +# Setup bioperl-ext +WORKDIR bioperl-ext/Bio/Ext/Align/ +RUN perl -pi -e"s|(cd libs.+)CFLAGS=\\\'|\$1CFLAGS=\\\'-fPIC |" 
Makefile.PL + +# Install htslib binaries (for 'bgzip' and 'tabix') +# htslib requires the packages 'zlib1g-dev', 'libbz2-dev' and 'liblzma-dev' +WORKDIR $HTSLIB_DIR +RUN make install && rm -f Makefile *.c + +# Compile Variation LD C scripts +WORKDIR $OPT_SRC/var_c_code +RUN make && rm -f Makefile *.c + + +################################################### +# Stage 2 - docker container to build ensembl-vep # +################################################### +FROM ubuntu:22.04 + +# Update aptitude and install some required packages +# a lot of them are required for Bio::DB::BigFile +RUN apt-get update && apt-get -y install \ + build-essential \ + cpanminus \ + curl \ + libmysqlclient-dev \ + libdbd-mysql-perl \ + libpng-dev \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + liblzma-dev \ + locales \ + openssl \ + perl \ + perl-base \ + unzip \ + vim && \ + apt-get -y purge manpages-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Setup VEP environment +ENV OPT=/opt/vep +ENV OPT_SRC=$OPT/src +ENV LOFTEE_PATH=$OPT_SRC/loftee-1.0.4_GRCh38 +ENV PERL5LIB_TMP=${LOFTEE_PATH}:$OPT_SRC/ensembl-vep:$OPT_SRC/ensembl-vep/modules:/plugins +ENV PERL5LIB=$PERL5LIB_TMP:$OPT_SRC/bioperl-live +ENV KENT_SRC=$OPT/src/kent-335_base/src +ENV HTSLIB_DIR=$OPT_SRC/htslib +ENV DEPS=$OPT_SRC +ENV PATH=$OPT_SRC/samtools/bin:$OPT_SRC/ensembl-vep:$OPT_SRC/var_c_code:$PATH +ENV LANG_VAR=en_US.UTF-8 +ARG BRANCH + +# Create vep user +RUN useradd -r -m -U -d "$OPT" -s /bin/bash -c "VEP User" -p '' vep && \ + chmod a+rx $OPT && \ + usermod -a -G sudo vep && \ + mkdir -p $OPT_SRC +USER vep + +# Copy downloaded libraries (stage 1) to this image (stage 2) +COPY --chown=vep:vep --from=builder $OPT_SRC $OPT_SRC +############################################################# + +# Change user to root for the following compilations/installations +USER root + +# Install bioperl-ext, faster alignments for haplo (XS-based BioPerl extensions to C libraries) +WORKDIR 
$OPT_SRC/bioperl-ext/Bio/Ext/Align/ +RUN perl Makefile.PL && make && make install && rm -f Makefile* + +# Install ensembl-xs, faster run using re-implementation in C of some of the Perl subroutines +WORKDIR $OPT_SRC/ensembl-xs +RUN perl Makefile.PL && make && make install && rm -f Makefile* cpanfile + +WORKDIR $OPT_SRC +# Install/compile more libraries +RUN export MACHTYPE=$(uname -m) &&\ + ensembl-vep/travisci/build_c.sh && \ + # Remove unused Bio-DB-HTS files + rm -rf Bio-HTS/cpanfile Bio-HTS/Build.PL Bio-HTS/Build Bio-HTS/_build Bio-HTS/INSTALL.pl && \ + # Install ensembl perl dependencies (cpanm) + cpanm --installdeps --with-recommends --notest --cpanfile ensembl_cpanfile . && \ + cpanm --installdeps --with-recommends --notest --cpanfile ensembl-vep/cpanfile . && \ + # Delete bioperl and cpanfiles after the cpanm installs as bioperl will be reinstalled by the INSTALL.pl script + rm -rf bioperl-live ensembl_cpanfile ensembl-vep/cpanfile && \ + # Configure "locale", see https://github.com/rocker-org/rocker/issues/19 + echo "$LANG_VAR UTF-8" >> /etc/locale.gen && locale-gen en_US.utf8 && \ + /usr/sbin/update-locale LANG=$LANG_VAR && \ + # Copy htslib executables. 
It also requires the packages 'zlib1g-dev', 'libbz2-dev' and 'liblzma-dev' + cp $HTSLIB_DIR/bgzip $HTSLIB_DIR/tabix $HTSLIB_DIR/htsfile /usr/local/bin/ && \ + # additional perl module for loftee + cpanm DBD::SQLite && \ + # Remove CPAN cache + rm -rf /root/.cpanm + +ENV LC_ALL=$LANG_VAR +ENV LANG=$LANG_VAR + +# Switch back to vep user +USER vep +ENV PERL5LIB=$PERL5LIB_TMP + +# Setup Docker environment for when users run VEP and INSTALL.pl in Docker image: +# - skip VEP updates in INSTALL.pl +ENV VEP_NO_UPDATE=1 +# - avoid Faidx/HTSLIB installation in INSTALL.pl +ENV VEP_NO_HTSLIB=1 +# - skip plugin installation in INSTALL.pl +ENV VEP_NO_PLUGINS=1 +# - set plugins directory for VEP and INSTALL.pl +ENV VEP_DIR_PLUGINS=/plugins +ENV VEP_PLUGINSDIR=$VEP_DIR_PLUGINS +WORKDIR $VEP_DIR_PLUGINS + +# Update bash profile +WORKDIR $OPT_SRC/ensembl-vep +RUN echo >> $OPT/.profile && \ + echo PATH=$PATH:\$PATH >> $OPT/.profile && \ + echo export PATH >> $OPT/.profile && \ + # Install Ensembl API and plugins + ./INSTALL.pl --auto ap --plugins all --skip_plugins LoF --pluginsdir $VEP_DIR_PLUGINS --no_update --no_htslib && \ + # Remove ensembl-vep's travisci folder + rm -rf travisci + +# Install dependencies for VEP plugins: +USER root +ENV PLUGIN_DEPS="https://raw.githubusercontent.com/Ensembl/VEP_plugins/$BRANCH/config" +# - Ubuntu packages +RUN curl -O "$PLUGIN_DEPS/ubuntu-packages.txt" && \ + apt-get update && apt-get install -y --no-install-recommends \ + $(sed -e s/\#.*//g ubuntu-packages.txt) && \ + rm -rf /var/lib/apt/lists/* ubuntu-packages.txt +# - Symlink python to python2 +RUN ln -s /usr/bin/python2 /usr/bin/python +# - Perl modules +RUN curl -O "$PLUGIN_DEPS/cpanfile" && \ + cpanm --installdeps --with-recommends . 
&& \ + rm -rf /root/.cpanm cpanfile +# - Python packages +RUN curl -O https://raw.githubusercontent.com/paulfitz/mysql-connector-c/master/include/my_config.h && \ + mv my_config.h /usr/include/mysql/my_config.h +RUN curl -O "$PLUGIN_DEPS/requirements.txt" && \ + python2 -m pip install --no-cache-dir -r requirements.txt && \ + rm requirements.txt + +# Install GeneSplicer binary +USER vep +WORKDIR $VEP_DIR_PLUGINS +RUN curl -O ftp://ftp.ccb.jhu.edu/pub/software/genesplicer/GeneSplicer.tar.gz && \ + tar -xzf GeneSplicer.tar.gz && \ + rm GeneSplicer.tar.gz && \ + cd GeneSplicer/sources && \ + make && \ + mv genesplicer .. && \ + rm -rf GeneSplicer/*/ +ENV PATH=$VEP_DIR_PLUGINS/GeneSplicer:$PATH + +# Set working directory as symlink to $OPT/.vep (containing VEP cache and data) +USER root +RUN ln -s $OPT/.vep /data +USER vep +WORKDIR /data diff --git a/scripts/variantstore/variant-annotations-table/vep_loftee/README.md b/scripts/variantstore/variant-annotations-table/vep_loftee/README.md new file mode 100644 index 00000000000..223dcf918d2 --- /dev/null +++ b/scripts/variantstore/variant-annotations-table/vep_loftee/README.md @@ -0,0 +1,30 @@ +The Dockerfile in this directory is used to build a Docker image for Ensembl VEP 115 with GRCh38 LOFTEE support. +This file is intended to replace the Dockerfile at `docker/Dockerfile` in the Ensembl VEP repo. On an x86 VM +the image can be built from the root of the Ensembl VEP repo with the command: + +``` +docker build -f docker/Dockerfile . 
+``` + +Get the image id from `docker images` and assign: + +``` +IMAGE_ID= +``` + +Then: + +``` +TAG="$(date -Idate)-${IMAGE_ID}" +BASE_REPO="broad-dsde-methods/gvs" +REPO_WITH_TAG="${BASE_REPO}/loftee:${TAG}" +docker tag "${IMAGE_ID}" "${REPO_WITH_TAG}" + +# Tag and push +GAR_TAG="us-central1-docker.pkg.dev/${REPO_WITH_TAG}" +docker tag "${REPO_WITH_TAG}" "${GAR_TAG}" + +docker push "${GAR_TAG}" + +echo "Docker image pushed to \"${GAR_TAG}\"" +``` diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index d86f2b92511..873d18cbaac 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -137,7 +137,7 @@ task GetToolVersions { String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623" String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e" - String vep_loftee_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/loftee:2025-11-18-1ea988fc4bbf" + String vep_loftee_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/loftee:2025-12-10-3e71c688e658" String workspace_bucket = read_string(workspace_bucket_output) String workspace_id = read_string(workspace_id_output) From 8803613261559af3b238a2f5983dcf428b08fb7e Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 11 Dec 2025 04:44:11 -0500 Subject: [PATCH 59/60] warnings files not always generated --- .../variant-annotations-table/GvsCreateVATfromVDS.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 77482d88ffc..a491a9d4656 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ 
b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -868,7 +868,7 @@ task GenerateVepAndLofteeAnnotations { output { File output_file = "vep_loftee_raw_output.txt" File monitoring_log = "monitoring.log" - File warnings = "warnings.txt" + File? warnings = "warnings.txt" Boolean done = true } } From 09d31ba10e6d97a3b6294efb93a2ba6992fc11f2 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 11 Dec 2025 13:42:38 -0500 Subject: [PATCH 60/60] Revert "revert ulimit stuff, vep just seems to hang immediately" This reverts commit 0c33013cb0f8a8a227edd3dadd9027f12486961d. --- .../GvsCreateVATfromVDS.wdl | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index a491a9d4656..77762bffe55 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -790,6 +790,9 @@ task GenerateVepAndLofteeAnnotations { File loftee_phylo_csf_database File input_vcf File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh" + Float memory_mib = 8 * 1024 + # The memory headroom left for other processes including the Batch agent. 
+ Float overhead_memory_mib = 1.6 * 1024 } command <<< @@ -797,6 +800,24 @@ task GenerateVepAndLofteeAnnotations { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace + echo "MEM_SIZE is ${MEM_SIZE:-}" + echo "MEM_UNIT is ${MEM_UNIT:-}" + + if [[ -z "${MEM_UNIT:-}" ]] + then + vep_memory_kib=$(python -c "from math import floor; print(int(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024)))") + elif [[ ${MEM_UNIT} == "GB" ]] + then + vep_memory_kib=$(python -c "from math import floor; print(int(floor(((${MEM_SIZE} * 1024) - ~{overhead_memory_mib}) * 1024)))") + else + echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2 + exit 1 + fi + + echo "memory_mib is ~{memory_mib}" + echo "overhead_memory_mib is ~{overhead_memory_mib}" + echo "vep_memory_kib is ${vep_memory_kib}" + bash ~{monitoring_script} > monitoring.log & if { grep -E -v '^#' ~{input_vcf} 2>&1 > /dev/null; } @@ -847,8 +868,31 @@ task GenerateVepAndLofteeAnnotations { --output_file vep_loftee_raw_output.txt ) + # Limit the amount of memory the VEP process uses, expressed in KiB. + # If we don't do this it seems that the Batch agent is often (though not always) starved for memory and + # unable to check in with the Batch service. If this happens the job fails for reasons that appear to + # Cromwell to be unretryable, and thus the whole workflow fails. e.g. + # + # Task GvsCreateVATfromVDS.GenerateVepAndLofteeAnnotations:150:4 failed. The job was stopped before the command finished. GCP Batch task exited with VMReportingTimeout(50002). + # + ulimit -v $vep_memory_kib + set +o errexit vep "${args[@]}" - echo "VEP + LOFTEE run complete." + VEP_RC=$? + set -o errexit + (( VEP_RC == 0 || VEP_RC == 137 )) || exit $VEP_RC + if (( VEP_RC == 137 )) + then + # Cromwell does not currently consider the value in the rc file when determining retryability, though + # there are PRs open that would enable this. 
+ # https://github.com/broadinstitute/cromwell/pull/7786/files + echo "VEP + LOFTEE appears to have OOMed with exit code 137, writing messages to stderr to hopefully trigger Cromwell to retry with more memory." + echo "Killed" >& 2 + echo "java.lang.OutOfMemoryError" >& 2 + exit 1 + else + echo "VEP + LOFTEE run complete." + fi else echo "No data found for processing in VCF, exit 0." touch "vep_loftee_raw_output.txt" @@ -930,6 +974,8 @@ task BigQueryLoadRawVepAndLofteeAnnotations { bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' \ --skip_leading_rows=1 --null_marker="-" ~{dataset_name}.~{raw_data_table} load_file.txt + + echo "VEP + LOFTEE raw data loading complete." fi >>>