From ae632b067a85089c4acf7599d70d0ab62efcc9c2 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Tue, 25 Mar 2014 20:20:54 -0400 Subject: [PATCH] Resolve ensemble calling issues: ensure input VCFs have consistent sample headers and correctly prioritize reads with depth genotype annotations. Thanks to Shalabh Suman --- HISTORY.md | 8 ++++++++ project.clj | 2 +- src/bcbio/variation/ensemble.clj | 18 ++++++++++++++++++ src/bcbio/variation/phasing.clj | 6 ++++-- src/bcbio/variation/recall.clj | 12 +++++------- src/bcbio/variation/variantcontext.clj | 5 +++++ 6 files changed, 41 insertions(+), 10 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c69d5b9..09cdc7f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,11 @@ +## 0.1.6 (in progress) + +- Ensure Ensemble input files have consistent VCF headers for multi population + variant calling. Thanks to Shalabh Suman. +- Prioritize depth attribute when choosing representative callset from multiple + choices during Ensemble calling. Thanks to Shalabh Suman. +- Avoid identifying mitochondrial-only test datasets as haploid reference genomes. + ## 0.1.5 (15 March 2014) - Move to MIT licensed GATK 3.0 framework. diff --git a/project.clj b/project.clj index d07d1ba..988ade1 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject bcbio.variation "0.1.5" +(defproject bcbio.variation "0.1.6-SNAPSHOT" :description "Toolkit to analyze genomic variation data, built on the GATK with Clojure" :license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"} :dependencies [[org.clojure/clojure "1.5.1"] diff --git a/src/bcbio/variation/ensemble.clj b/src/bcbio/variation/ensemble.clj index 3057158..1fb6aa2 100644 --- a/src/bcbio/variation/ensemble.clj +++ b/src/bcbio/variation/ensemble.clj @@ -80,6 +80,23 @@ (spit out-file)) out-file)) +(defn first-mismatch + "Return the first mismatched pair in the two sequences" + [c1 c2] + (first (drop-while (fn [[x1 x2]] (= x1 x2)) (map vector c1 c2)))) + +(defn- check-vcf-headers + "Ensure consistent headers for multi-sample inputs to Ensemble calling." + [vcf-files] + (reduce (fn [samples cmp-vcf] + (let [cmp-samples (gvc/get-samples cmp-vcf)] + (if (= samples cmp-samples) + samples + (throw (Exception. (str "VCF files do not have consistent headers: " + (vec (map fs/base-name vcf-files)) + "\nFirst mismatch:" (first-mismatch samples cmp-samples))))))) + (gvc/get-samples (first vcf-files)) (rest vcf-files))) + (defn consensus-calls "Provide a finalized set of consensus calls from multiple inputs. Handles cleaning up and normalizing input files, generating consensus @@ -90,6 +107,7 @@ ref-file (fsp/abspath ref-file) dirs (setup-work-dir out-file) config-file (create-ready-config vrn-files ref-file in-config dirs)] + (check-vcf-headers vrn-files) (compare/variant-comparison-from-config config-file) (let [prep-file (first (fs/glob (str (io/file (:prep dirs) "*cfilter.vcf"))))] (assert prep-file (str "Did not find prepped and filtered consensus file. " diff --git a/src/bcbio/variation/phasing.clj b/src/bcbio/variation/phasing.clj index d8cb24a..0943b2a 100644 --- a/src/bcbio/variation/phasing.clj +++ b/src/bcbio/variation/phasing.clj @@ -452,12 +452,14 @@ (defn is-haploid? "Is the provided VCF file a haploid genome (one genotype or all homozygous). - Samples the first set of variants, checking for haploid calls." + Samples the first set of variants, checking for haploid calls. + Avoids calling haploid based on chrM-only calls, used during testing." [vcf-file ref-file] (let [sample-size 10] (letfn [(is-vc-haploid? [vc] (when-not (= 0 (:num-samples vc)) - (= 1 (apply max (map #(count (:alleles %)) (:genotypes vc))))))] + (when-not (contains? #{"chrM" "MT" "M" "chrMT"} (:chr vc)) + (= 1 (apply max (map #(count (:alleles %)) (:genotypes vc)))))))] (with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)] (let [vcf-iter (parse-vcf vcf-iter)] (if-not (empty? vcf-iter) diff --git a/src/bcbio/variation/recall.clj b/src/bcbio/variation/recall.clj index 0a709f6..f2b27a0 100644 --- a/src/bcbio/variation/recall.clj +++ b/src/bcbio/variation/recall.clj @@ -49,8 +49,8 @@ (map vec) (into {})) g (->> (:genotypes vc) - (filter #(= sample (:sample-name %))) - first)] + (filter #(= sample (:sample-name %))) + first)] (when g {:sample-name sample :qual (:qual vc) @@ -64,7 +64,7 @@ (if (seq (get-in g [:attributes "PVAL"])) 1 0) (if (seq (get-in g [:attributes "AD"])) 1 0) (if (get-in g [:attributes "GQ"]) 1 0) - (if (get-in g [:attributes "DP"]) 1 0)) + (if (pos? (get-in g [:attributes "DP"] -1)) 1 0)) :pl (attr/get-pl g) }))) @@ -121,10 +121,8 @@ (let [match-fn (juxt :start :ref-allele) other-vcs (filter #(= (match-fn %) (match-fn vc)) (gvc/variants-in-region input-vc-getter vc)) - most-likely-gs (->> samples - (map (partial best-supported-sample-gs other-vcs)) - (remove nil?))] - (when (seq most-likely-gs) + most-likely-gs (map (partial best-supported-sample-gs other-vcs) samples)] + (when (not-every? nil? most-likely-gs) (-> (VariantContextBuilder. (:vc vc)) (.alleles (conj (set (remove #(.isNoCall %) (mapcat :alleles most-likely-gs))) (:ref-allele vc))) diff --git a/src/bcbio/variation/variantcontext.clj b/src/bcbio/variation/variantcontext.clj index 0e6fa2b..47919b0 100644 --- a/src/bcbio/variation/variantcontext.clj +++ b/src/bcbio/variation/variantcontext.clj @@ -143,6 +143,11 @@ (with-open [vcf-reader (AbstractFeatureReader/getFeatureReader vcf-file (VCFCodec.) false)] (.getHeader vcf-reader))) +(defn get-samples + "Retrieve samples from VCF header" + [vcf-file] + (.getGenotypeSamples (get-vcf-header vcf-file))) + ;; ## Writing VCF files (defn merge-headers