From 5094a3356d5ed7b220678e6988d3ea05667638ae Mon Sep 17 00:00:00 2001 From: chapmanb Date: Tue, 17 Dec 2013 15:53:18 -0500 Subject: [PATCH 01/15] Provide better error message for ensemble preparation when final filtered file is not prepared --- HISTORY.md | 7 +++++++ src/bcbio/variation/combine.clj | 12 ++++++------ src/bcbio/variation/ensemble.clj | 2 ++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index cbc60bc..77e6d9e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,10 @@ +## 0.1.3 (in progress) + +- Move to external bcbio.run tool to help abstract out some core functionality + useful in other contexts. +- Additional flexibility for Illumina to valid VCF preparation. Allow ignoring + SVs or other file types. + ## 0.1.2 (4 December 2013) - Handle metrics evaluation with GATK where input calls have haploid chromosomes, diff --git a/src/bcbio/variation/combine.clj b/src/bcbio/variation/combine.clj index 415b44a..4e80d23 100644 --- a/src/bcbio/variation/combine.clj +++ b/src/bcbio/variation/combine.clj @@ -48,12 +48,12 @@ base-name (if (nil? base-ext) full-base-name (format "%s-%s.vcf" (first (string/split full-base-name #"-")) base-ext)) - file-info {:out-vcf (str (fs/file base-dir - (fsp/add-file-part base-name - (case merge-type - :minimal "mincombine" - :full "fullcombine" - "combine"))))} + file-info {:out-vcf (fsp/add-file-part base-name + (case merge-type + :minimal "mincombine" + :full "fullcombine" + "combine") + base-dir)} args (concat ["-R" ref "-o" :out-vcf "--rod_priority_list" (string/join "," (map-indexed unique-name vcfs))] diff --git a/src/bcbio/variation/ensemble.clj b/src/bcbio/variation/ensemble.clj index d3311a4..a84a7f0 100644 --- a/src/bcbio/variation/ensemble.clj +++ b/src/bcbio/variation/ensemble.clj @@ -86,6 +86,8 @@ config-file (create-ready-config vrn-files ref-file in-config dirs)] (compare/variant-comparison-from-config config-file) (let [prep-file (first (fs/glob (str (io/file (:prep dirs) "*cfilter.vcf"))))] + (assert prep-file (str "Did not find prepped and filtered consensus file. " + "Do you have classifiers specified in the input YAML file?")) (fs/copy prep-file out-file) (fs/copy (str prep-file ".idx") (str out-file ".idx")))) out-file) From 3ab4066b1e65655a08e49adb0314674c4de9cbd5 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Thu, 19 Dec 2013 06:29:24 -0500 Subject: [PATCH 02/15] Document requirement for java 1.7. Fixes #14 --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed77ef8..07a6973 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,10 @@ Run from the command line: $ java -jar bcbio.variation-VERSION-standalone.jar [arguments] The jar contains a full GATK commandline with additional walkers, as well as -custom command line programs. See the usage section below for more details. +custom command line programs. The usage section below contains examples of using +the library for variant comparison, normalization and ensemble calling. Note +that bcbio.variation requires Java 1.7 since the underlying GATK libraries are +not compatible with earlier versions. [dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.2/bcbio.variation-0.1.2-standalone.jar From dc40569a391a20aa83c05fcc338bfbbeaeef0014 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Tue, 31 Dec 2013 07:18:27 -0500 Subject: [PATCH 03/15] Print message instead of raising error when variants cannot be remapped. Helps handle hg19 *hap chromosomes which do not have equivalent contigs in GRCh37 reference. Thanks to Severine Catreux --- src/bcbio/variation/normalize.clj | 4 ++-- test/data/cg-normalize.vcf | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/bcbio/variation/normalize.clj b/src/bcbio/variation/normalize.clj index 750b024..999c5b3 100644 --- a/src/bcbio/variation/normalize.clj +++ b/src/bcbio/variation/normalize.clj @@ -276,8 +276,8 @@ (let [line-info (fix-vcf-line line chr-map config)] (if-let [wtr (get ref-wrtrs (:chrom line-info))] (.write wtr (str (:line line-info) "\n")) - (throw (Exception. (format "Could not find remapping of chromosome %s in reference: %s" - (:chrom line-info) (keys ref-wrtrs)))))))] + (println (format "Could not find remapping of chromosome %s in reference. Removing variant." + (:chrom line-info))))))] (let [ref-chrs (ref-chr-files ref-file) ref-wrtrs (zipmap (keys ref-chrs) (map writer (vals ref-chrs))) chr-map (chr-name-remap (:prep-org config) ref-file orig-ref-file)] diff --git a/test/data/cg-normalize.vcf b/test/data/cg-normalize.vcf index d76eb48..9952802 100644 --- a/test/data/cg-normalize.vcf +++ b/test/data/cg-normalize.vcf @@ -10,6 +10,7 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19239 chrMT 64 27603250 C T . PASS AC=0;AF=0.0;AN=1;DB;DP=0;NS=69;RSID=dbsnp.108:rs3883917 GT 1 chrM 65 27603251 C T . PASS AC=0;AF=0.0;AN=1;DB;DP=0;NS=69 GT 1 +chr4_ctg9_hap1 1 . C T . PASS AC=0;AF=0.0;AN=1;DB;DP=0;NS=69 GT 1/1 chr22 2064 5923 G A . s50 ;AC=0;AF=NaN;AN=0;DP=0;NS=0;RSID=. GT 0/1 chr22 2227 5924 G T . PASS AC=0;AF=0.00;AN=2;DB;DP=0;NS=65;RSID=dbsnp.132:rs115144100 GT 0/1 chr22 1603 26232579 T C . PASS AC=1;AF=0.50;AN=2;DB;DP=0;NS=44;RSID=dbsnp.100:rs2844883 GT 0/1 From 4a1ca27152bda2a9a3d9eed18951de07ba91a9cc Mon Sep 17 00:00:00 2001 From: chapmanb Date: Tue, 31 Dec 2013 15:28:52 -0500 Subject: [PATCH 04/15] Remove END tags from indels, which can cause errors during LeftAlignVariants after adjusting regions and getting out of sync. Fixes #13 --- src/bcbio/variation/normalize.clj | 14 ++++++++++++++ test/bcbio/variation/test/grade.clj | 16 ++++++++++++++-- test/data/digrade/NA12878-cmp-r1.vcf | 2 ++ test/data/digrade/NA12878-cmp-r2.vcf | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/bcbio/variation/normalize.clj b/src/bcbio/variation/normalize.clj index 999c5b3..f651f39 100644 --- a/src/bcbio/variation/normalize.clj +++ b/src/bcbio/variation/normalize.clj @@ -17,6 +17,7 @@ [lonocloud.synthread :as ->] [bcbio.run.fsp :as fsp] [bcbio.run.itx :as itx] + [bcbio.variation.structural :as structural] [bcbio.variation.variantcontext :as gvc])) ;; ## Chromosome name remapping @@ -157,6 +158,18 @@ (.genotypes (map normalize-allele-calls (update-genotype-sample (:vc orig) sample))) .make))))) +(defn- remove-extra-end + "Remove END alleles present in non-structural variants. + END tags in indels cause issues downstream in LeftAlignVariants if moved." + [vc] + (if (and (contains? (:attributes vc) "END") + (nil? (structural/get-sv-type vc {:max-indel-size Integer/MAX_VALUE}))) + (assoc vc :vc + (-> (VariantContextBuilder. (:vc vc)) + (.rmAttribute "END") + .make)) + vc)) + (defn- no-call-genotype? "Check if a variant has a non-informative no-call genotype." [vc config] @@ -240,6 +253,7 @@ (remove #(no-call-genotype? % config)) (map (partial normalize-sv-genotype config sample)) (map (partial fix-vc sample config)) + (map remove-extra-end) (map :vc))) (defn- fix-vcf-line diff --git a/test/bcbio/variation/test/grade.clj b/test/bcbio/variation/test/grade.clj index d635916..ec50504 100644 --- a/test/bcbio/variation/test/grade.clj +++ b/test/bcbio/variation/test/grade.clj @@ -9,6 +9,7 @@ [bcbio.run.fsp :as fsp] [bcbio.run.itx :as itx] [bcbio.variation.compare :as compare] + [bcbio.variation.combine :as combine] [bcbio.variation.grade :as grade] [bcbio.variation.report :as report] [bcbio.variation.utils.quickcompare :as qcmp])) @@ -47,7 +48,8 @@ (-> (compare-two-vcf-phased calls exp config) :c-files keys) => [:concordant :discordant :discordant-missing :phasing-error])) -(facts "Compare diploid callset against a diploid reference" +(facts "Compare diploid callset against a diploid reference. + Includes comparison of calls with END tags which can be out of sync following comparison." (let [base-dir (fs/file data-dir "digrade") c1 {:file (str (fs/file base-dir "NA12878-cmp-r1.vcf")) :name "ref" :type "grading-ref"} @@ -65,11 +67,21 @@ (-> cmp :grade-breakdown :discordant :snp :shared :hethom) => 1 (-> cmp :c-files :eval-discordant) => out-file))) +(facts "Normalize input VCFs containing END tags" + (let [base-dir (fs/file data-dir "digrade") + c1 {:file (str (fs/file base-dir "NA12878-cmp-r1.vcf")) + :name "ref" :type "grading-ref" :prep true} + exp {:ref ref-file :sample "NA12878" :approach "grade"} + out-dir (str (fs/file base-dir "work"))] + (fsp/remove-path out-dir) + (fs/mkdirs out-dir) + (combine/gatk-normalize c1 exp nil out-dir (fn [& xs] (println xs))))) + (facts "Perform quick comparisons between smaller variant files that can fit in memory." (let [base-dir (fs/file data-dir "digrade") c1 (str (fs/file base-dir "NA12878-cmp-r1.vcf")) c2 (str (fs/file base-dir "NA12878-cmp-r2.vcf")) dir-out-file (str (fs/file base-dir "NA12878-cmp-r1-cmp.csv"))] (fsp/remove-path dir-out-file) - (qcmp/two-vcfs c1 c2 ref-file) => {:concordant 14 :discordant 2 :sample "NA12878"} + (qcmp/two-vcfs c1 c2 ref-file) => {:concordant 14 :discordant 3 :sample "NA12878"} (qcmp/vcfdir-to-base c1 base-dir ref-file 2) => dir-out-file)) diff --git a/test/data/digrade/NA12878-cmp-r1.vcf b/test/data/digrade/NA12878-cmp-r1.vcf index f21d397..b2ca705 100644 --- a/test/data/digrade/NA12878-cmp-r1.vcf +++ b/test/data/digrade/NA12878-cmp-r1.vcf @@ -165,3 +165,5 @@ 22 13850 . C T . . ABCI5=0.558;ABCI50=0.729;ABCI95=0.863;AC=1;AF=0.500;AN=2;BaseQRankSum=-0.698;CFILTERS=None;DP=20;Entropy=3.16;FS=0.000;GC=38.61;HRun=0;HaplotypeScore=0.0000;MFE=-1.92;MPGHetRef=10.39;MPGHomRef=-1.039e+01;MPGHomVar=-4.218e+01;MPGLikHetRef=6.02;MPGLikHomRef=16.41;MPGLikHomVar=48.20;MQ=6.63;MQ0=0;MQRankSum=1.135;NBQ=33.75;ReadMeanLen=95.6;ReadMeanPos=0.232;ReadPosEndDist=17.0;ReadPosRankSum=-0.786;set=varscan GT:AD:DP:PVAL 0/1:15,5:20:2.3562E-2 22 14258 . C T . . ABCI5=0.707;ABCI50=0.756;ABCI95=0.800;AC=1;AF=0.500;AN=2;BaseQRankSum=-5.968;CFILTERS=None;DP=227;Entropy=3.57;FS=0.000;GC=50.50;HRun=2;HaplotypeScore=0.0000;MFE=-0.44;MPGHetRef=113.16;MPGHomRef=-1.132e+02;MPGHomVar=-5.395e+02;MPGLikHetRef=68.33;MPGLikHomRef=181.49;MPGLikHomVar=607.82;MQ=1.19;MQ0=0;MQRankSum=0.348;NBQ=34.82;ReadMeanLen=99.2;ReadMeanPos=0.571;ReadPosEndDist=22.8;ReadPosRankSum=-0.604;set=varscan GT:AD:DP:PVAL 0/1:173,54:223:3.3114E-18 22 14364 . G T . . ABCI5=0.635;ABCI50=0.720;ABCI95=0.796;AC=1;AF=0.500;AN=2;BaseQRankSum=-5.134;CFILTERS=None;DP=81;Entropy=3.29;FS=0.000;GC=54.46;HRun=1;HaplotypeScore=0.0000;MFE=-0.99;MPGHetRef=48.13;MPGHomRef=-4.813e+01;MPGHomVar=-1.872e+02;MPGLikHetRef=24.38;MPGLikHomRef=72.51;MPGLikHomVar=211.61;MQ=1.71;MQ0=0;MQRankSum=-0.929;NBQ=34.07;ReadMeanLen=99.0;ReadMeanPos=0.516;ReadPosEndDist=23.9;ReadPosRankSum=-0.536;set=varscan GT:AD:DP:PVAL 0/1:59,22:81:4.5762E-8 +22 14929 . GGCCCACA G 100.813537528125 PASS SAMPLE=NA12878;DP=48;END=14936;VP=6;AF=0.125;BIAS=2:0;REFBIAS=25:17;VARBIAS=6:0;PMEAN=29.3;PSTD=12.97;QUAL=39;QSTD=2.3;SBF=0.0766397001445299;ODDRATIO=0 GT:DP 0/1:48 +22 14937 . GCCCACAG G 89.6981987750241 PASS SAMPLE=NA12878;DP=49;END=14944;VP=6;AF=0.12245;BIAS=2:0;REFBIAS=26:16;VARBIAS=6:0;PMEAN=40.7;PSTD=5.75;QUAL=34.7;QSTD=3.5;SBF=0.159450278009751;ODDRATIO=0 GT:DP 0/1:49 \ No newline at end of file diff --git a/test/data/digrade/NA12878-cmp-r2.vcf b/test/data/digrade/NA12878-cmp-r2.vcf index ef3f664..bada672 100644 --- a/test/data/digrade/NA12878-cmp-r2.vcf +++ b/test/data/digrade/NA12878-cmp-r2.vcf @@ -164,3 +164,4 @@ 22 1758 . A G 552.77 . ABCI5=-7.241e-03;ABCI50=0.024;ABCI95=0.130;AC=2;AF=1.00;AN=2;CFILTERS=None;DP=19;Entropy=3.47;FS=0.000;GC=53.47;HRun=1;HaplotypeScore=0.0000;MFE=-1.72;MPGHetRef=-5.717e+00;MPGHomRef=-6.810e+01;MPGHomVar=5.72;MPGLikHetRef=5.72;MPGLikHomRef=68.10;MPGLikHomVar=2.228e-03;MQ=26.00;MQ0=0;NBQ=34.70;QD=29.09;ReadMeanLen=100.0;ReadMeanPos=0.666;ReadPosEndDist=20.9;set=filterIngatk-samtools-varscan GT:AD:DP:PL 1/1:0,19:19:128,57,0 22 14123 . G A . . ABCI5=0.750;ABCI50=0.779;ABCI95=0.807;AC=1;AF=0.500;AN=2;BaseQRankSum=2.330;CFILTERS=None;DP=560;Entropy=3.57;FS=0.000;GC=51.49;HRun=1;HaplotypeScore=0.0000;MFE=-1.13;MPGHetRef=262.91;MPGHomRef=-2.629e+02;MPGHomVar=-1.301e+03;MPGLikHetRef=168.58;MPGLikHomRef=431.49;MPGLikHomVar=1469.79;MQ=1.97;MQ0=0;MQRankSum=-0.629;NBQ=36.02;OND=1.786e-03;ReadMeanLen=99.6;ReadMeanPos=0.559;ReadPosEndDist=23.9;ReadPosRankSum=0.653;set=varscan GT:AD:DP:PVAL 0/1:439,120:546:7.4652E-37 22 14364 . G T . . ABCI5=0.692;ABCI50=0.778;ABCI95=0.850;AC=1;AF=0.500;AN=2;BaseQRankSum=-0.852;CFILTERS=None;DP=71;Entropy=3.29;FS=0.000;GC=54.46;HRun=1;HaplotypeScore=0.0000;MFE=-0.99;MPGHetRef=31.94;MPGHomRef=-3.194e+01;MPGHomVar=-1.751e+02;MPGLikHetRef=21.37;MPGLikHomRef=53.31;MPGLikHomVar=196.50;MQ=1.43;MQ0=0;MQRankSum=1.049;NBQ=34.53;ReadMeanLen=99.0;ReadMeanPos=0.561;ReadPosEndDist=22.3;ReadPosRankSum=0.556;set=varscan GT:AD:DP:PVAL 0/1:56,15:66:2.8272E-5 +22 14929 . GGCCCACAGCCCACAG G 100.813537528125 PASS SAMPLE=NA12878;DP=49;END=14944;VP=6;AF=0.12245;BIAS=2:0;REFBIAS=26:16;VARBIAS=6:0;PMEAN=40.7;PSTD=5.75;QUAL=34.7;QSTD=3.5;SBF=0.159450278009751;ODDRATIO=0 GT:DP 0/1:49 From cb7cc75280dafec8dcc35997f557b10e983989cc Mon Sep 17 00:00:00 2001 From: chapmanb Date: Tue, 31 Dec 2013 15:32:18 -0500 Subject: [PATCH 05/15] Update history with fixes for 0.1.3-SNAPSHOT-20131231 --- HISTORY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 77e6d9e..6ac863f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,10 @@ ## 0.1.3 (in progress) +- Avoid errors on converting hg19 to GRCh37 where hg19 variants contain hg19 hap + contigs with no equivalent in GRCh37. It now drops these variants instead + of generating an error. Thanks to Severine Catreux. +- Avoid issues with running LeftAlignVariants on indels with END tags. Thanks to + Justin Johnson. - Move to external bcbio.run tool to help abstract out some core functionality useful in other contexts. - Additional flexibility for Illumina to valid VCF preparation. Allow ignoring From 6dabe0ba08d9e44dd06c29ddabebcc4fd5596c48 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Sat, 4 Jan 2014 15:00:49 -0500 Subject: [PATCH 06/15] Ensure BED files passed to evaluations are converted to reference material coordinates when inputs differ (hg19->GRCh37). Thanks to Severine Catreux. --- .gitignore | 2 + HISTORY.md | 2 + src/bcbio/align/interval.clj | 18 ++ src/bcbio/variation/compare.clj | 12 +- src/bcbio/variation/filter/intervals.clj | 15 +- src/bcbio/variation/utils/svmerge.clj | 5 +- test/bcbio/variation/test/grade.clj | 18 ++ test/bcbio/variation/test/validate.clj | 8 +- .../data/digrade/NA12878-cmp-r1-intervals.bed | 1 + .../digrade/hg19/NA12878-cmp-r3-intervals.bed | 1 + test/data/digrade/hg19/NA12878-cmp-r3.vcf | 166 ++++++++++++++++++ 11 files changed, 228 insertions(+), 20 deletions(-) create mode 100644 test/data/digrade/NA12878-cmp-r1-intervals.bed create mode 100644 test/data/digrade/hg19/NA12878-cmp-r3-intervals.bed create mode 100644 test/data/digrade/hg19/NA12878-cmp-r3.vcf diff --git a/.gitignore b/.gitignore index 71c7b8a..8b4ded8 100644 --- a/.gitignore +++ b/.gitignore @@ -54,6 +54,8 @@ test/data/phased/*-cmp*.vcf test/data/phased/work/ test/data/digrade/*.idx test/data/digrade/*.csv +test/data/digrade/*-sorted.bed +test/data/digrade/hg19/*-sorted.bed test/data/digrade/work/ test/data/svs/*.idx test/data/svs/*-sorted.bed diff --git a/HISTORY.md b/HISTORY.md index 6ac863f..c60227f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,8 @@ - Avoid errors on converting hg19 to GRCh37 where hg19 variants contain hg19 hap contigs with no equivalent in GRCh37. It now drops these variants instead of generating an error. Thanks to Severine Catreux. +- Ensure BED files passed to evaluations are converted to reference material + coordinates when inputs differ (hg19->GRCh37). Thanks to Severine Catreux. - Avoid issues with running LeftAlignVariants on indels with END tags. Thanks to Justin Johnson. - Move to external bcbio.run tool to help abstract out some core functionality diff --git a/src/bcbio/align/interval.clj b/src/bcbio/align/interval.clj index 1b3201d..c8cc5b1 100644 --- a/src/bcbio/align/interval.clj +++ b/src/bcbio/align/interval.clj @@ -30,3 +30,21 @@ (map (partial update-contig-name name-map)) (remove nil?))))))) out-file)) + +(defn- maybe-remap-bed + [interval int-ref base-ref out-dir] + (when interval + (if (and (not (nil? int-ref)) (not= base-ref int-ref)) + (rename-bed interval base-ref :out-dir out-dir) + interval))) + +(defn prep-multi + "Prepare multiple input intervals, remapping chromosome names as needed." + ([to-prep ref-file orig out-dir] + (->> to-prep + (map (juxt :intervals :ref)) + (map (fn [[bed ref]] (maybe-remap-bed bed ref ref-file out-dir))) + (concat (flatten [orig])) + (remove nil?))) + ([to-prep ref-file out-dir] + (prep-multi to-prep ref-file [] out-dir))) diff --git a/src/bcbio/variation/compare.clj b/src/bcbio/variation/compare.clj index 88198aa..a577d0a 100644 --- a/src/bcbio/variation/compare.clj +++ b/src/bcbio/variation/compare.clj @@ -19,7 +19,6 @@ [bcbio.variation.evaluate :only [calc-variant-eval-metrics]] [bcbio.variation.filter :only [variant-filter variant-format-filter pipeline-recalibration]] - [bcbio.variation.filter.intervals :only [combine-multiple-intervals]] [bcbio.variation.multiple :only [prep-cmp-name-lookup pipeline-compare-multiple]] [bcbio.variation.multisample :only [compare-two-vcf-flexible multiple-samples?]] @@ -32,9 +31,11 @@ [clj-yaml.core :as yaml] [me.raynes.fs :as fs] [lonocloud.synthread :as ->] + [bcbio.align.interval :as ainterval] [bcbio.run.fsp :as fsp] [bcbio.run.itx :as itx] [bcbio.run.broad :as broad] + [bcbio.variation.filter.intervals :as fintervals] [bcbio.variation.grade :as grade] [bcbio.variation.phasing :as phasing] [bcbio.variation.report :as report] @@ -46,8 +47,7 @@ "Variant comparison producing 3 files: concordant and both directions discordant" [sample call1 call2 ref & {:keys [out-dir intervals]}] (let [base-dir (if (nil? out-dir) (fs/parent (:file call1)) out-dir) - ready-intervals (remove nil? (flatten [intervals (:intervals call1) - (:intervals call2)])) + ready-intervals (ainterval/prep-multi [call1 call2] ref intervals out-dir) sample (or sample (-> call1 :file gvc/get-vcf-header .getGenotypeSamples first))] (if-not (fs/exists? base-dir) (fs/mkdirs base-dir)) @@ -116,7 +116,7 @@ (let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out])) transition (partial do-transition config) align-bams (prepare-input-bams exp out-dir) - all-intervals (remove nil? (map :intervals (cons exp (:calls exp)))) + all-intervals (ainterval/prep-multi (cons exp (:calls exp)) (:ref exp) out-dir) start-vcfs (vec (map #(gatk-normalize % exp all-intervals out-dir transition) (:calls exp))) _ (transition :combine "Creating merged VCF files for all comparisons") @@ -148,8 +148,8 @@ (let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out])) align-bams (remove nil? (map :align [c1 c2]))] (when (and (:intervals exp) (seq align-bams)) - (combine-multiple-intervals (:intervals exp) align-bams (:ref exp) - :out-dir out-dir :name (:sample exp))))) + (fintervals/combine-multiple (:intervals exp) align-bams (:ref exp) + :out-dir out-dir :name (:sample exp))))) (discordant-name [x] (format "%s-discordant" (:name x))) (zipmap-ordered [xs1 xs2] diff --git a/src/bcbio/variation/filter/intervals.clj b/src/bcbio/variation/filter/intervals.clj index 151495f..0664ef8 100644 --- a/src/bcbio/variation/filter/intervals.clj +++ b/src/bcbio/variation/filter/intervals.clj @@ -80,6 +80,8 @@ (catch UserException$BadInput e [])) (rest intervals))))) +;; ## Merge BED files + (defn- prep-intervals-by-contig "Intersect and exclude intervals on a contig." [start-intervals exclude-intervals loc-parser combine-rule] @@ -109,7 +111,7 @@ combine-rule) contigs)))) -(defn combine-multiple-intervals +(defn combine-multiple "Combine intervals from an initial BED and coverage BAM files." [initial-bed align-bams ref & {:keys [out-dir name exclude-intervals combine-rule more-beds]}] @@ -137,9 +139,8 @@ (let [base-intervals (:intervals exp) all-aligns (set (remove nil? (map :align (cons exp (:calls exp)))))] (when (and base-intervals (seq all-aligns)) - (combine-multiple-intervals base-intervals all-aligns - (:ref exp) - :exclude-intervals (:exclude-intervals exp) - :name (:sample exp) - :out-dir (get-in config [:dir :prep] (get-in config [:dir :out])))))) - + (combine-multiple base-intervals all-aligns + (:ref exp) + :exclude-intervals (:exclude-intervals exp) + :name (:sample exp) + :out-dir (get-in config [:dir :prep] (get-in config [:dir :out])))))) diff --git a/src/bcbio/variation/utils/svmerge.clj b/src/bcbio/variation/utils/svmerge.clj index 1dbe58d..af7ec85 100644 --- a/src/bcbio/variation/utils/svmerge.clj +++ b/src/bcbio/variation/utils/svmerge.clj @@ -45,9 +45,8 @@ (-> (combine/combine-variants [call-safesv svready-file] ref-file :merge-type :full) (fs/rename (:calls out-files))) - (-> (intervals/combine-multiple-intervals region-file [] ref-file - :combine-rule :union - :more-beds [sv-bed]) + (-> (intervals/combine-multiple region-file [] ref-file + :combine-rule :union :more-beds [sv-bed]) (fs/rename (:regions out-files))) )) out-files)) diff --git a/test/bcbio/variation/test/grade.clj b/test/bcbio/variation/test/grade.clj index ec50504..1683022 100644 --- a/test/bcbio/variation/test/grade.clj +++ b/test/bcbio/variation/test/grade.clj @@ -67,6 +67,24 @@ (-> cmp :grade-breakdown :discordant :snp :shared :hethom) => 1 (-> cmp :c-files :eval-discordant) => out-file))) +(facts "Comparisons where inputs have alternative reference sequence: hg19->GRCh37" + (let [base-dir (fs/file data-dir "digrade") + c1o {:file (str (fs/file base-dir "NA12878-cmp-r1.vcf")) + :prep true + :name "ref" :type "grading-ref"} + c2o {:file (str (fs/file base-dir "hg19" "NA12878-cmp-r3.vcf")) + :name "eval" + :prep true + :intervals (str (fs/file base-dir "hg19" "NA12878-cmp-r3-intervals.bed")) + :ref (str (fs/file data-dir "hg19.fa"))} + exp {:ref ref-file :sample "NA12878" :approach "grade" + :intervals (str (fs/file base-dir "NA12878-cmp-r1-intervals.bed"))} + config {:dir {:out (str (fs/file base-dir "work"))}}] + (fsp/remove-path (get-in config [:dir :out])) + (fs/mkdirs (get-in config [:dir :out])) + (let [[c1 c2] (#'compare/prepare-vcf-calls (assoc exp :calls [c1o c2o]) config)] + (compare-two-vcf c1 c2 exp config)))) + (facts "Normalize input VCFs containing END tags" (let [base-dir (fs/file data-dir "digrade") c1 {:file (str (fs/file base-dir "NA12878-cmp-r1.vcf")) diff --git a/test/bcbio/variation/test/validate.clj b/test/bcbio/variation/test/validate.clj index 3ebedfd..87509ba 100644 --- a/test/bcbio/variation/test/validate.clj +++ b/test/bcbio/variation/test/validate.clj @@ -4,14 +4,14 @@ [bcbio.variation.haploid :exclude [-main]] [bcbio.variation.filter.attr] [bcbio.variation.filter.classify] - [bcbio.variation.filter.intervals] [bcbio.variation.filter] [bcbio.variation.validate] [bcbio.variation.variantcontext :exclude [-main]]) (:require [me.raynes.fs :as fs] [bcbio.run.fsp :as fsp] [bcbio.run.itx :as itx] - [bcbio.variation.filter.custom :as cf])) + [bcbio.variation.filter.custom :as cf] + [bcbio.variation.filter.intervals :as fintervals])) (background (around :facts @@ -95,5 +95,5 @@ (cf/freebayes-filter fb-vcf ref) => fb-filter-out) (facts "Prepare combined interval lists based on filtering criteria" - (combine-multiple-intervals region-bed [align-bam] ref - :exclude-intervals exclude-bed) => region-multi-out) + (fintervals/combine-multiple region-bed [align-bam] ref + :exclude-intervals exclude-bed) => region-multi-out) diff --git a/test/data/digrade/NA12878-cmp-r1-intervals.bed b/test/data/digrade/NA12878-cmp-r1-intervals.bed new file mode 100644 index 0000000..ad93b37 --- /dev/null +++ b/test/data/digrade/NA12878-cmp-r1-intervals.bed @@ -0,0 +1 @@ +22 1 15000 diff --git a/test/data/digrade/hg19/NA12878-cmp-r3-intervals.bed b/test/data/digrade/hg19/NA12878-cmp-r3-intervals.bed new file mode 100644 index 0000000..589390f --- /dev/null +++ b/test/data/digrade/hg19/NA12878-cmp-r3-intervals.bed @@ -0,0 +1 @@ +chr22 1 15000 diff --git a/test/data/digrade/hg19/NA12878-cmp-r3.vcf b/test/data/digrade/hg19/NA12878-cmp-r3.vcf new file mode 100644 index 0000000..7e8e1a2 --- /dev/null +++ b/test/data/digrade/hg19/NA12878-cmp-r3.vcf @@ -0,0 +1,166 @@ +##fileformat=VCFv4.1 +##FILTER= +##FILTER= +##FILTER= +##FILTER= 60.0"> +##FILTER= 13.0"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT== 15"> +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO== 15"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr22 55 . G T . CScoreFilter ABCI5=0.052;ABCI50=0.180;ABCI95=0.383;AC=1;AF=0.500;AN=2;BaseQRankSum=0.067;CFILTERS=balance;DP=13;Entropy=2.77;FS=0.000;GC=75.25;HRun=1;HaplotypeScore=0.0000;MFE=-3.60;MPGHetRef=1.56;MPGHomRef=-2.999e+01;MPGHomVar=-1.564e+00;MPGLikHetRef=3.91;MPGLikHomRef=33.91;MPGLikHomVar=5.48;MQ=8.51;MQ0=0;MQRankSum=0.067;NBQ=32.80;OND=0.308;ReadMeanLen=95.3;ReadMeanPos=0.598;ReadPosEndDist=22.2;ReadPosRankSum=0.067;set=varscan GT:AD:DP:PVAL 0/1:2,7:11:3.4965E-3 +chr22 61 . G T . . ABCI5=0.388;ABCI50=0.620;ABCI95=0.818;AC=1;AF=0.500;AN=2;BaseQRankSum=-0.597;CFILTERS=None;DP=11;Entropy=2.89;FS=0.000;GC=74.26;HRun=0;HaplotypeScore=0.0000;MFE=-3.75;MPGHetRef=8.79;MPGHomRef=-8.790e+00;MPGHomVar=-2.050e+01;MPGLikHetRef=3.31;MPGLikHomRef=12.10;MPGLikHomVar=23.81;MQ=9.20;MQ0=0;MQRankSum=-1.034;NBQ=36.30;ReadMeanLen=94.9;ReadMeanPos=0.596;ReadPosEndDist=25.7;ReadPosRankSum=-0.278;set=varscan GT:AD:DP:PVAL 0/1:7,4:11:4.5113E-2 +chr22 86 . C T . . ABCI5=0.238;ABCI50=0.430;ABCI95=0.637;AC=1;AF=0.500;AN=2;BaseQRankSum=-1.468;CFILTERS=None;DP=14;Entropy=2.82;FS=0.000;GC=76.24;HRun=0;HaplotypeScore=0.0000;MFE=-1.73;MPGHetRef=14.69;MPGHomRef=-1.799e+01;MPGHomVar=-1.469e+01;MPGLikHetRef=4.21;MPGLikHomRef=22.20;MPGLikHomVar=18.91;MQ=13.41;MQ0=0;MQRankSum=-2.797;NBQ=34.35;ReadMeanLen=88.9;ReadMeanPos=0.457;ReadPosEndDist=19.4;ReadPosRankSum=0.715;set=varscan GT:AD:DP:PVAL 0/1:6,8:14:9.6618E-4 +chr22 203 . G C 200.80 . ABCI5=0.435;ABCI50=0.577;ABCI95=0.710;AC=2;AF=1.00;AN=2;BaseQRankSum=3.300;CFILTERS=None;DP=34;Entropy=3.27;FS=0.000;GC=75.25;HRun=3;HaplotypeScore=0.0000;MFE=-2.51;MPGHetRef=34.81;MPGHomRef=-3.481e+01;MPGHomVar=-4.225e+01;MPGLikHetRef=10.24;MPGLikHomRef=45.04;MPGLikHomVar=52.48;MQ=16.97;MQ0=0;MQRankSum=4.451;NBQ=33.58;OND=0.618;QD=5.91;ReadMeanLen=88.6;ReadMeanPos=0.343;ReadPosEndDist=19.5;ReadPosRankSum=1.343;set=filterIngatk-samtools-varscan GT:AD:DP:PL 1/1:20,13:31:56,10,0 +chr22 235 . G A . . ABCI5=0.616;ABCI50=0.715;ABCI95=0.801;AC=1;AF=0.500;AN=2;BaseQRankSum=5.507;CFILTERS=None;DP=62;Entropy=3.14;FS=0.000;GC=75.25;HRun=0;HaplotypeScore=0.0000;MFE=-4.22;MPGHetRef=39.73;MPGHomRef=-3.973e+01;MPGHomVar=-1.008e+02;MPGLikHetRef=18.66;MPGLikHomRef=58.40;MPGLikHomVar=119.50;MQ=11.18;MQ0=0;MQRankSum=-2.982;NBQ=34.61;ReadMeanLen=95.7;ReadMeanPos=0.337;ReadPosEndDist=23.9;ReadPosRankSum=0.158;set=varscan GT:AD:DP:PVAL 0/1:45,17:59:4.7113E-6 +chr22 248 . CCTT C 201.05 CScoreFilter AC=1;AF=0.500;AN=2;CFILTERS=balance,calling;DP=90;Entropy=2.82;GC=77.23;HRun=0;MFE=-1.13;MQ=11.90;NBQ=34.08;QD=2.23;ReadMeanLen=95.6;ReadMeanPos=0.334;ReadPosEndDist=20.5;set=freebayes-varscan GT:DP:PL 0/1:89:215,0,217 +chr22 301 . T C 147.77 . ABCI5=0.740;ABCI50=0.790;ABCI95=0.834;AC=1;AF=0.500;AN=2;BaseQRankSum=-1.302;CFILTERS=None;DP=201;Entropy=3.64;FS=0.000;GC=65.35;HRun=0;HaplotypeScore=0.0000;MFE=-1.35;MPGHetRef=73.64;MPGHomRef=-7.364e+01;MPGHomVar=-4.750e+02;MPGLikHetRef=60.51;MPGLikHomRef=134.15;MPGLikHomVar=535.52;MQ=12.44;MQ0=0;MQRankSum=1.855;NBQ=32.64;QD=0.74;ReadMeanLen=98.1;ReadMeanPos=0.400;ReadPosEndDist=24.4;ReadPosRankSum=0.095;set=filterIngatk-varscan GT:AD:DP:PVAL 0/1:160,41:199:4.5655E-14 +chr22 427 . C G 540.77 . ABCI5=0.553;ABCI50=0.619;ABCI95=0.682;AC=1;AF=0.500;AN=2;BaseQRankSum=7.334;CFILTERS=None;DP=155;Entropy=3.44;FS=0.765;GC=74.26;HRun=3;HaplotypeScore=0.0000;MFE=-2.04;MPGHetRef=157.55;MPGHomRef=-1.576e+02;MPGHomVar=-2.324e+02;MPGLikHetRef=46.66;MPGLikHomRef=204.21;MPGLikHomVar=279.11;MQ=20.20;MQ0=0;MQRankSum=0.572;NBQ=33.43;QD=3.49;ReadMeanLen=98.7;ReadMeanPos=0.607;ReadPosEndDist=24.0;ReadPosRankSum=-0.832;set=filterIngatk-samtools-varscan GT:AD:DP:PL 0/1:98,57:149:78,0,166 +chr22 513 . T G . CScoreFilter ABCI5=0.646;ABCI50=0.756;ABCI95=0.847;AC=1;AF=0.500;AN=2;BaseQRankSum=-1.445;CFILTERS=balance;DP=48;Entropy=2.95;FS=0.000;GC=72.28;HRun=0;HaplotypeScore=0.7340;MFE=-3.11;MPGHetRef=21.35;MPGHomRef=-2.135e+01;MPGHomVar=-1.075e+02;MPGLikHetRef=14.45;MPGLikHomRef=35.80;MPGLikHomVar=121.91;MQ=27.99;MQ0=0;MQRankSum=-2.776;NBQ=34.19;ReadMeanLen=97.3;ReadMeanPos=0.549;ReadPosEndDist=24.8;ReadPosRankSum=-0.691;set=varscan GT:AD:DP:PVAL 0/1:37,11:46:2.4819E-4 +chr22 551 . A G . . ABCI5=0.534;ABCI50=0.665;ABCI95=0.779;AC=1;AF=0.500;AN=2;BaseQRankSum=0.957;CFILTERS=None;DP=37;Entropy=3.49;FS=0.000;GC=71.29;HRun=1;HaplotypeScore=0.0000;MFE=-4.16;MPGHetRef=31.67;MPGHomRef=-3.167e+01;MPGHomVar=-7.786e+01;MPGLikHetRef=11.14;MPGLikHomRef=42.80;MPGLikHomVar=89.00;MQ=23.33;MQ0=0;MQRankSum=-2.417;NBQ=33.80;ReadMeanLen=94.7;ReadMeanPos=0.628;ReadPosEndDist=22.9;ReadPosRankSum=-0.568;set=varscan GT:AD:DP:PVAL 0/1:25,12:37:8.4418E-5 +chr22 558 . G A . . ABCI5=0.564;ABCI50=0.703;ABCI95=0.819;AC=1;AF=0.500;AN=2;BaseQRankSum=-3.521;CFILTERS=None;DP=32;Entropy=3.66;FS=0.000;GC=71.29;HRun=0;HaplotypeScore=0.0000;MFE=-2.04;MPGHetRef=20.04;MPGHomRef=-2.004e+01;MPGHomVar=-7.037e+01;MPGLikHetRef=9.63;MPGLikHomRef=29.68;MPGLikHomVar=80.00;MQ=22.77;MQ0=0;MQRankSum=-2.138;NBQ=32.31;ReadMeanLen=95.3;ReadMeanPos=0.652;ReadPosEndDist=23.2;ReadPosRankSum=-0.084;set=varscan GT:AD:DP:PVAL 0/1:23,9:31:9.9376E-4 +chr22 567 . T C . . ABCI5=0.501;ABCI50=0.654;ABCI95=0.787;AC=1;AF=0.500;AN=2;BaseQRankSum=1.929;CFILTERS=None;DP=27;Entropy=3.55;FS=0.000;GC=72.28;HRun=2;HaplotypeScore=0.0000;MFE=-5.31;MPGHetRef=25.28;MPGHomRef=-2.528e+01;MPGHomVar=-5.437e+01;MPGLikHetRef=8.13;MPGLikHomRef=33.41;MPGLikHomVar=62.50;MQ=15.47;MQ0=0;MQRankSum=-1.826;NBQ=34.67;ReadMeanLen=95.5;ReadMeanPos=0.695;ReadPosEndDist=22.0;ReadPosRankSum=0.797;set=varscan GT:AD:DP:PVAL 0/1:18,9:26:2.076E-3 +chr22 575 . A G . . ABCI5=0.412;ABCI50=0.589;ABCI95=0.750;AC=1;AF=0.500;AN=2;BaseQRankSum=0.424;CFILTERS=None;DP=20;Entropy=3.52;FS=0.000;GC=71.29;HRun=1;HaplotypeScore=0.0000;MFE=-3.02;MPGHetRef=22.78;MPGHomRef=-2.278e+01;MPGHomVar=-3.798e+01;MPGLikHetRef=6.02;MPGLikHomRef=28.80;MPGLikHomVar=44.00;MQ=9.86;MQ0=0;MQRankSum=-0.424;NBQ=34.36;ReadMeanLen=94.1;ReadMeanPos=0.695;ReadPosEndDist=24.4;ReadPosRankSum=0.656;set=varscan GT:AD:DP:PVAL 0/1:12,8:19:3.9926E-3 +chr22 616 . C T . . ABCI5=0.187;ABCI50=0.435;ABCI95=0.706;AC=1;AF=0.500;AN=2;BaseQRankSum=1.754;CFILTERS=None;DP=7;Entropy=3.20;FS=0.000;GC=65.35;HRun=0;HaplotypeScore=0.0000;MFE=-3.31;MPGHetRef=6.69;MPGHomRef=-1.189e+01;MPGHomVar=-6.693e+00;MPGLikHetRef=2.11;MPGLikHomRef=14.00;MPGLikHomVar=8.80;MQ=3.00;MQ0=0;MQRankSum=0.937;NBQ=35.80;ReadMeanLen=99.7;ReadMeanPos=0.639;ReadPosEndDist=16.7;ReadPosRankSum=-0.550;set=varscan GT:AD:DP:PVAL 0/1:3,4:7:3.4965E-2 +chr22 1363 . A G . . ABCI5=0.566;ABCI50=0.687;ABCI95=0.792;AC=1;AF=0.500;AN=2;BaseQRankSum=-1.547;CFILTERS=None;DP=43;Entropy=2.55;FS=0.000;GC=56.44;HRun=0;HaplotypeScore=0.0000;MFE=0.00;MPGHetRef=26.90;MPGHomRef=-2.690e+01;MPGHomVar=-8.387e+01;MPGLikHetRef=12.94;MPGLikHomRef=39.85;MPGLikHomVar=96.81;MQ=12.75;MQ0=0;MQRankSum=0.119;NBQ=33.49;ReadMeanLen=96.9;ReadMeanPos=0.392;ReadPosEndDist=25.4;ReadPosRankSum=0.278;set=varscan GT:AD:DP:PVAL 0/1:30,13:42:4.0777E-5 +chr22 1375 . ACCC A . CScoreFilter AC=1;AF=0.500;AN=2;CFILTERS=balance,calling;DP=54;Entropy=2.27;GC=54.46;HRun=0;MFE=0.00;MQ=18.11;NBQ=34.53;ReadMeanLen=97.6;ReadMeanPos=0.350;ReadPosEndDist=23.4;set=varscan GT:AD:DP:PVAL 0/1:11:41:2.4398E-4 +chr22 1646 . T C . . ABCI5=0.700;ABCI50=0.749;ABCI95=0.795;AC=1;AF=0.500;AN=2;BaseQRankSum=-6.734;CFILTERS=None;DP=224;Entropy=3.16;FS=0.000;GC=55.45;HRun=1;HaplotypeScore=3.7122;MFE=-5.58;MPGHetRef=97.82;MPGHomRef=-9.782e+01;MPGHomVar=-5.263e+02;MPGLikHetRef=67.43;MPGLikHomRef=165.25;MPGLikHomVar=593.70;MQ=49.83;MQ0=0;MQRankSum=-11.126;NBQ=35.55;OND=4.464e-03;ReadMeanLen=99.5;ReadMeanPos=0.562;ReadPosEndDist=24.0;ReadPosRankSum=0.492;set=varscan GT:AD:DP:PVAL 0/1:168,55:216:8.2073E-17 +chr22 1660 rs72613662 G A 4379.77 . ABCI5=0.216;ABCI50=0.266;ABCI95=0.320;AC=2;AF=1.00;AN=2;BaseQRankSum=-3.484;CFILTERS=None;DP=196;Entropy=3.01;FS=0.000;GC=57.43;HRun=1;HaplotypeScore=1.1567;MFE=-5.00;MPGHetRef=136.64;MPGHomRef=-4.525e+02;MPGHomVar=-1.366e+02;MPGLikHetRef=59.00;MPGLikHomRef=511.54;MPGLikHomVar=195.64;MQ=48.23;MQ0=0;MQRankSum=10.660;NBQ=35.16;OND=0.270;QD=22.35;ReadMeanLen=99.5;ReadMeanPos=0.569;ReadPosEndDist=24.6;ReadPosRankSum=0.720;set=combo-gatk-freebayes-gatk_haplotype-varscan GT:AD:DP:PL 1/1:53,143:196:4313,43,0 +chr22 1758 . A G 552.77 . ABCI5=-7.241e-03;ABCI50=0.024;ABCI95=0.130;AC=2;AF=1.00;AN=2;CFILTERS=None;DP=19;Entropy=3.47;FS=0.000;GC=53.47;HRun=1;HaplotypeScore=0.0000;MFE=-1.72;MPGHetRef=-5.717e+00;MPGHomRef=-6.810e+01;MPGHomVar=5.72;MPGLikHetRef=5.72;MPGLikHomRef=68.10;MPGLikHomVar=2.228e-03;MQ=26.00;MQ0=0;NBQ=34.70;QD=29.09;ReadMeanLen=100.0;ReadMeanPos=0.666;ReadPosEndDist=20.9;set=filterIngatk-samtools-varscan GT:AD:DP:PL 1/1:0,19:19:128,57,0 +chr22 14123 . G A . . ABCI5=0.750;ABCI50=0.779;ABCI95=0.807;AC=1;AF=0.500;AN=2;BaseQRankSum=2.330;CFILTERS=None;DP=560;Entropy=3.57;FS=0.000;GC=51.49;HRun=1;HaplotypeScore=0.0000;MFE=-1.13;MPGHetRef=262.91;MPGHomRef=-2.629e+02;MPGHomVar=-1.301e+03;MPGLikHetRef=168.58;MPGLikHomRef=431.49;MPGLikHomVar=1469.79;MQ=1.97;MQ0=0;MQRankSum=-0.629;NBQ=36.02;OND=1.786e-03;ReadMeanLen=99.6;ReadMeanPos=0.559;ReadPosEndDist=23.9;ReadPosRankSum=0.653;set=varscan GT:AD:DP:PVAL 0/1:439,120:546:7.4652E-37 +chr22 14364 . G T . . ABCI5=0.692;ABCI50=0.778;ABCI95=0.850;AC=1;AF=0.500;AN=2;BaseQRankSum=-0.852;CFILTERS=None;DP=71;Entropy=3.29;FS=0.000;GC=54.46;HRun=1;HaplotypeScore=0.0000;MFE=-0.99;MPGHetRef=31.94;MPGHomRef=-3.194e+01;MPGHomVar=-1.751e+02;MPGLikHetRef=21.37;MPGLikHomRef=53.31;MPGLikHomVar=196.50;MQ=1.43;MQ0=0;MQRankSum=1.049;NBQ=34.53;ReadMeanLen=99.0;ReadMeanPos=0.561;ReadPosEndDist=22.3;ReadPosRankSum=0.556;set=varscan GT:AD:DP:PVAL 0/1:56,15:66:2.8272E-5 From 62ea8d734537edd0494ae5cec2725155ed6f5c6f Mon Sep 17 00:00:00 2001 From: chapmanb Date: Sat, 4 Jan 2014 20:38:20 -0500 Subject: [PATCH 07/15] Ensure single sample ensemble preparation comparisons have a consistent sample name. Avoids confusing results when passed VCFs with different samples. Thanks to Chao Chen --- src/bcbio/variation/ensemble.clj | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/bcbio/variation/ensemble.clj b/src/bcbio/variation/ensemble.clj index a84a7f0..3057158 100644 --- a/src/bcbio/variation/ensemble.clj +++ b/src/bcbio/variation/ensemble.clj @@ -11,7 +11,8 @@ [me.raynes.fs :as fs] [bcbio.run.fsp :as fsp] [bcbio.run.itx :as itx] - [bcbio.variation.compare :as compare])) + [bcbio.variation.compare :as compare] + [bcbio.variation.variantcontext :as gvc])) (defn- setup-work-dir "Create working directory for ensemble consensus calling." @@ -69,7 +70,12 @@ :xspecific true :trusted {:total (get-in config [:ensemble :trusted-pct] 0.65)}}}]} (->/when-let [int-file (:intervals config)] - (assoc :intervals int-file)))]} + (assoc :intervals int-file)) + (->/when-let [sample-name (let [samples (-> vrn-files first gvc/get-vcf-header + .getGenotypeSamples)] + (when (= 1 (count samples)) + (first samples)))] + (assoc :sample sample-name)))]} yaml/generate-string (spit out-file)) out-file)) From f1bbaa485ee548110097c0da67837379aa91f068 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Wed, 15 Jan 2014 06:09:25 -0500 Subject: [PATCH 08/15] Update dependencies to the GATK 2.8.1 MIT licensed framework and associated Picard, Tribble and variant libraries. Silence phone home events from library. --- HISTORY.md | 2 ++ project.clj | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c60227f..6bc6834 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ ## 0.1.3 (in progress) +- Update dependencies to the GATK 2.8.1 MIT licensed framework and associated + Picard, Tribble and variant libraries. Silence phone home events from library. - Avoid errors on converting hg19 to GRCh37 where hg19 variants contain hg19 hap contigs with no equivalent in GRCh37. It now drops these variants instead of generating an error. Thanks to Severine Catreux. diff --git a/project.clj b/project.clj index 4252520..de7daab 100644 --- a/project.clj +++ b/project.clj @@ -8,11 +8,11 @@ [clj-stacktrace "0.2.5"] [bcbio.run "0.0.1-SNAPSHOT"] ;; GATK requirements - [org.clojars.chapmanb/gatk-lite "2.7.2"] - [org.clojars.chapmanb/picard "1.96"] - [org.clojars.chapmanb/sam "1.96"] - [org.clojars.chapmanb/tribble "1.96"] - [org.clojars.chapmanb/variant "1.96"] + [org.clojars.chapmanb/gatk-lite "2.8.1"] + [org.clojars.chapmanb/picard "1.104"] + [org.clojars.chapmanb/sam "1.104"] + [org.clojars.chapmanb/tribble "1.104"] + [org.clojars.chapmanb/variant "1.104"] [colt/colt "1.2.0"] [commons-lang "2.5"] [log4j "1.2.17"] From c09576db48006e6f41a5983b0b280fab088eaea5 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Wed, 15 Jan 2014 09:32:50 -0500 Subject: [PATCH 09/15] v0.1.3 --- project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project.clj b/project.clj index de7daab..05b9cb8 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject bcbio.variation "0.1.3-SNAPSHOT" +(defproject bcbio.variation "0.1.3" :description "Toolkit to analyze genomic variation data, built on the GATK with Clojure" :license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"} :dependencies [[org.clojure/clojure "1.5.1"] From c4913b97bf652fbfd8ed21f39a482105dda7b9cb Mon Sep 17 00:00:00 2001 From: chapmanb Date: Wed, 15 Jan 2014 09:38:31 -0500 Subject: [PATCH 10/15] Update docs for v0.1.3 --- HISTORY.md | 2 +- README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6bc6834..212befc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,4 @@ -## 0.1.3 (in progress) +## 0.1.3 (15 January 2014) - Update dependencies to the GATK 2.8.1 MIT licensed framework and associated Picard, Tribble and variant libraries. Silence phone home events from library. diff --git a/README.md b/README.md index 07a6973..1af6319 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ associated with different variant representations. ### Download -The latest release is 0.1.2 (4 December 2013): [bcbio.variation-0.1.2-standalone.jar][dl]. +The latest release is 0.1.3 (15 January 2014): [bcbio.variation-0.1.3-standalone.jar][dl]. Run from the command line: $ java -jar bcbio.variation-VERSION-standalone.jar [arguments] @@ -44,7 +44,7 @@ the library for variant comparison, normalization and ensemble calling. Note that bcbio.variation requires Java 1.7 since the underlying GATK libraries are not compatible with earlier versions. -[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.2/bcbio.variation-0.1.2-standalone.jar +[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.3/bcbio.variation-0.1.3-standalone.jar ### As a library From 3cff8fec48d0e2cdf1cb825807f9b93458b7fb56 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Tue, 28 Jan 2014 11:19:03 -0500 Subject: [PATCH 11/15] Include genotype quality in output Ensemble called VCFs. Thanks to Shalabh Suman --- project.clj | 2 +- src/bcbio/variation/recall.clj | 4 ++-- src/bcbio/variation/variantcontext.clj | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/project.clj b/project.clj index 05b9cb8..6e051c4 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject bcbio.variation "0.1.3" +(defproject bcbio.variation "0.1.4-SNAPSHOT" :description "Toolkit to analyze genomic variation data, built on the GATK with Clojure" :license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"} :dependencies [[org.clojure/clojure "1.5.1"] diff --git a/src/bcbio/variation/recall.clj b/src/bcbio/variation/recall.clj index 6cdb2ed..9201ba5 100644 --- a/src/bcbio/variation/recall.clj +++ b/src/bcbio/variation/recall.clj @@ -58,7 +58,7 @@ :call-type (:type g) :ref-allele (:ref-allele vc) :alleles (sort-by allele-order (:alleles g)) - :attributes (select-keys (:attributes g) ["PL" "DP" "AD" "PVAL"]) + :attributes (select-keys (:attributes g) ["PL" "DP" "AD" "PVAL" "GQ"]) :has-likelihood (if (seq (get-in g [:attributes "PL"])) 1 0) :attr-count (+ (if (seq (get-in g [:attributes "PL"])) 1 0) (if (seq (get-in g [:attributes "PVAL"])) 1 0) @@ -128,7 +128,7 @@ (.attributes (-> (map :attributes (reverse other-vcs)) (#(apply merge %)) (assoc "set" (update-set-info (get-in vc [:attributes "set"]) calls)))) - (.genotypes (gvc/create-genotypes most-likely-gs :attrs #{"PL" "PVAL" "DP" "AD" "AO"})) + (.genotypes (gvc/create-genotypes most-likely-gs :attrs #{"PL" "PVAL" "DP" "AD" "AO" "GQ"})) .make)))) (defn- recall-w-consensus diff --git a/src/bcbio/variation/variantcontext.clj b/src/bcbio/variation/variantcontext.clj index 44c70d3..5e7d37c 100644 --- a/src/bcbio/variation/variantcontext.clj +++ b/src/bcbio/variation/variantcontext.clj @@ -230,6 +230,7 @@ [gs & {:keys [attrs]}] (let [all-attrs [["PL" seq (fn [x _ v] (.PL x (int-array v)))] ["PVAL" identity (fn [x k v] (.attribute x k v))] + ["GQ" identity (fn [x _ v] (.GQ x v))] ["DP" identity (fn [x _ v] (.DP x v))] ["AD" seq (fn [x _ v] (.AD x (int-array v)))] ["AO" identity (fn [x a v] (.attribute x a v))]]] From 7c8ed9b1e8a61572e7f931aa39d2f0b8d7ab5315 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Fri, 28 Feb 2014 09:21:39 -0500 Subject: [PATCH 12/15] Pass through representative genotype call with the most attributes of interest specified. Helps with downstream filtering of ensemble calls. Thanks to Shalabh Suman --- src/bcbio/variation/recall.clj | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/bcbio/variation/recall.clj b/src/bcbio/variation/recall.clj index 9201ba5..0a709f6 100644 --- a/src/bcbio/variation/recall.clj +++ b/src/bcbio/variation/recall.clj @@ -63,6 +63,7 @@ :attr-count (+ (if (seq (get-in g [:attributes "PL"])) 1 0) (if (seq (get-in g [:attributes "PVAL"])) 1 0) (if (seq (get-in g [:attributes "AD"])) 1 0) + (if (get-in g [:attributes "GQ"]) 1 0) (if (get-in g [:attributes "DP"]) 1 0)) :pl (attr/get-pl g) }))) @@ -79,11 +80,13 @@ (apply + (remove nil? (map k xs)))) (sum-plus-call-type [i xs] (let [pls (safe-sum xs :pl) - represent-x (last (sort-by #(vector (:has-likelihood %) - (or (:pl %) (- Integer/MIN_VALUE)) - (:attr-count %)) + best-x (last (sort-by #(vector (:has-likelihood %) + (or (:pl %) (- Integer/MIN_VALUE)) + (:attr-count %)) + xs)) + represent-x (last (sort-by #(vector (:has-likelihood %) (:attr-count %)) xs)) - call-code (if (= "HET" (:call-type represent-x)) 0 1)] + call-code (if (= "HET" (:call-type best-x)) 0 1)] [(count xs) call-code (- pls) i represent-x]))] (->> alleles (group-by :alleles) From 6b95e167e8539f5059b32a1dacb08a7e2f2a5d60 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Mon, 3 Mar 2014 06:16:47 -0500 Subject: [PATCH 13/15] Handle bgzipped inputs to validation, using non-indexed retrieval to get headers from VCF file. Remove need for ref-file when retrieving VCF iterators. --- HISTORY.md | 7 +++++++ src/bcbio/variation/variantcontext.clj | 28 +++++++++++++++----------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 212befc..86b9e91 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,10 @@ +## 0.1.4 (in progress) + +- Allow bgzipped/tabix inputs to validation. +- Improve representation of ensemble variants ensuring standard expected keys + are present when available in an individual genotype call. Thanks to Shalabh + Suman. + ## 0.1.3 (15 January 2014) - Update dependencies to the GATK 2.8.1 MIT licensed framework and associated diff --git a/src/bcbio/variation/variantcontext.clj b/src/bcbio/variation/variantcontext.clj index 5e7d37c..0e6fa2b 100644 --- a/src/bcbio/variation/variantcontext.clj +++ b/src/bcbio/variation/variantcontext.clj @@ -85,16 +85,20 @@ "Create a Tribble FeatureSource for VCF file. Handles indexing and parsing of VCF into VariantContexts. We treat gzipped files as tabix indexed VCFs." - [in-file ref-file] - (if (.endsWith in-file ".gz") - (AbstractFeatureReader/getFeatureReader in-file (VCFCodec.) false) - (AbstractFeatureReader/getFeatureReader (.getAbsolutePath (file in-file)) (VCFCodec.) - (create-vcf-index in-file)))) + ([in-file] + (if (.endsWith in-file ".gz") + (AbstractFeatureReader/getFeatureReader in-file (VCFCodec.) false) + (AbstractFeatureReader/getFeatureReader (.getAbsolutePath (file in-file)) (VCFCodec.) + (create-vcf-index in-file)))) + ([in-file ref-file] + (get-vcf-source in-file))) (defn get-vcf-iterator "Create an iterator over VCF VariantContexts." - [in-file ref-file] - (.iterator (get-vcf-source in-file ref-file))) + ([in-file] + (.iterator (get-vcf-source in-file))) + ([in-file ref-file] + (get-vcf-iterator in-file))) (defn variants-in-region "Retrieve variants located in potentially multiple variant files" @@ -136,8 +140,8 @@ (defn get-vcf-header "Retrieve header from input VCF file." [vcf-file] - (with-open [vcf-reader (.makeSourceFromStream (VCFCodec.) (input-stream vcf-file))] - (.readActualHeader (VCFCodec.) vcf-reader))) + (with-open [vcf-reader (AbstractFeatureReader/getFeatureReader vcf-file (VCFCodec.) false)] + (.getHeader vcf-reader))) ;; ## Writing VCF files @@ -202,7 +206,7 @@ [vcf ref out-part fname fdesc passes?] (let [out-file (fsp/add-file-part vcf out-part)] (when (itx/needs-run? out-file) - (with-open [vcf-iter (get-vcf-iterator vcf ref)] + (with-open [vcf-iter (get-vcf-iterator vcf)] (write-vcf-w-template vcf {:out out-file} (map (partial maybe-add-filter fname passes?) (parse-vcf vcf-iter)) ref @@ -214,7 +218,7 @@ [in-file passes? file-out-part ref-file & {:keys [out-dir]}] (let [out-file (fsp/add-file-part in-file file-out-part out-dir)] (when (itx/needs-run? out-file) - (with-open [in-iter (get-vcf-iterator in-file ref-file)] + (with-open [in-iter (get-vcf-iterator in-file)] (write-vcf-w-template in-file {:out out-file} (map :vc (filter passes? (parse-vcf in-iter))) ref-file))) @@ -261,7 +265,7 @@ .make))) (defn -main [vcf ref approach] - (with-open [vcf-iter (get-vcf-iterator vcf ref)] + (with-open [vcf-iter (get-vcf-iterator vcf)] (letfn [(item-iter [] (case approach "gatk" (iterator-seq (.iterator vcf-iter)) From f437b0cd642815d3b3cee67524b96a059b5ae1b5 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Wed, 5 Mar 2014 05:34:41 -0500 Subject: [PATCH 14/15] v0.1.4 --- HISTORY.md | 2 +- README.md | 4 ++-- project.clj | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 86b9e91..feda515 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,4 @@ -## 0.1.4 (in progress) +## 0.1.4 (5 March 2014) - Allow bgzipped/tabix inputs to validation. - Improve representation of ensemble variants ensuring standard expected keys diff --git a/README.md b/README.md index 1af6319..0b7ebdf 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ associated with different variant representations. ### Download -The latest release is 0.1.3 (15 January 2014): [bcbio.variation-0.1.3-standalone.jar][dl]. +The latest release is 0.1.4 (5 March 2014): [bcbio.variation-0.1.4-standalone.jar][dl]. Run from the command line: $ java -jar bcbio.variation-VERSION-standalone.jar [arguments] @@ -44,7 +44,7 @@ the library for variant comparison, normalization and ensemble calling. Note that bcbio.variation requires Java 1.7 since the underlying GATK libraries are not compatible with earlier versions. -[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.3/bcbio.variation-0.1.3-standalone.jar +[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.4/bcbio.variation-0.1.4-standalone.jar ### As a library diff --git a/project.clj b/project.clj index 6e051c4..fde6225 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject bcbio.variation "0.1.4-SNAPSHOT" +(defproject bcbio.variation "0.1.4" :description "Toolkit to analyze genomic variation data, built on the GATK with Clojure" :license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"} :dependencies [[org.clojure/clojure "1.5.1"] From d6240431147e9df9f2f455ffb2f29ed2793abb87 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Sat, 8 Mar 2014 13:53:46 -0500 Subject: [PATCH 15/15] GATK 3.0 support. Handle longer running gemini loading steps in new 0.6.5 version --- HISTORY.md | 6 ++++++ project.clj | 12 ++++++------ src/bcbio/variation/index/gemini.clj | 12 +++++++----- .../gatk/walkers/annotator/DepthOfCoverage.java | 2 +- .../bcbio/gatk/walkers/annotator/FisherStrand.java | 9 ++------- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index feda515..b80a3b1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,9 @@ +## 0.1.5 (In progress) + +- Move to MIT licensed GATK 3.0 framework. +- Support lightweight loading options for gemini integration to avoid large load + times with new gene tables in gemini 0.6.5. + ## 0.1.4 (5 March 2014) - Allow bgzipped/tabix inputs to validation. diff --git a/project.clj b/project.clj index fde6225..438d996 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject bcbio.variation "0.1.4" +(defproject bcbio.variation "0.1.5-SNAPSHOT" :description "Toolkit to analyze genomic variation data, built on the GATK with Clojure" :license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"} :dependencies [[org.clojure/clojure "1.5.1"] @@ -8,11 +8,11 @@ [clj-stacktrace "0.2.5"] [bcbio.run "0.0.1-SNAPSHOT"] ;; GATK requirements - [org.clojars.chapmanb/gatk-lite "2.8.1"] - [org.clojars.chapmanb/picard "1.104"] - [org.clojars.chapmanb/sam "1.104"] - [org.clojars.chapmanb/tribble "1.104"] - [org.clojars.chapmanb/variant "1.104"] + [org.clojars.chapmanb/gatk-framework "3.0"] + [org.clojars.chapmanb/picard "1.107"] + [org.clojars.chapmanb/sam "1.107"] + [org.clojars.chapmanb/tribble "1.107"] + [org.clojars.chapmanb/variant "1.107"] [colt/colt "1.2.0"] [commons-lang "2.5"] [log4j "1.2.17"] diff --git a/src/bcbio/variation/index/gemini.clj b/src/bcbio/variation/index/gemini.clj index 7084071..f0695aa 100644 --- a/src/bcbio/variation/index/gemini.clj +++ b/src/bcbio/variation/index/gemini.clj @@ -38,7 +38,9 @@ (when-let [gemini-cmd (get-gemini-cmd)] (itx/with-tx-file [tx-index index-file] (let [info (apply shell/sh - (concat [gemini-cmd "load" "-v" in-file] + (concat [gemini-cmd "load" "-v" in-file "--skip-gene-tables"] + (when (.contains in-file "/test/data/") + ["--test-mode"]) (when (has-snpeff-anns? in-file) ["-t" "snpEff"]) [tx-index]))] @@ -227,11 +229,11 @@ (defn vc-attr-retriever "Retrieve metrics by name from a gemini index for provided VariantContexts." [in-file ref-file] - (let [pool (future (when-let [index-db (index-variant-file in-file ref-file)] - (get-sqlite-db-pool index-db)))] + (let [pool (when-let [index-db (index-variant-file in-file ref-file)] + (get-sqlite-db-pool index-db))] (fn [vc attr] - (when @pool - (sql/with-connection @pool + (when pool + (sql/with-connection pool (sql/with-query-results rows [(str "SELECT " (string/join "," (attr->colnames attr)) " FROM variants WHERE chrom = ? AND start = ? and ref = ?") diff --git a/src/java/bcbio/gatk/walkers/annotator/DepthOfCoverage.java b/src/java/bcbio/gatk/walkers/annotator/DepthOfCoverage.java index b8702dc..27b91d8 100644 --- a/src/java/bcbio/gatk/walkers/annotator/DepthOfCoverage.java +++ b/src/java/bcbio/gatk/walkers/annotator/DepthOfCoverage.java @@ -56,7 +56,7 @@ else if (perReadAlleleLikelihoodMap != null) { for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); - depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + depth += 1; } } } diff --git a/src/java/bcbio/gatk/walkers/annotator/FisherStrand.java b/src/java/bcbio/gatk/walkers/annotator/FisherStrand.java index bd955a0..a2d9f90 100644 --- a/src/java/bcbio/gatk/walkers/annotator/FisherStrand.java +++ b/src/java/bcbio/gatk/walkers/annotator/FisherStrand.java @@ -258,7 +258,7 @@ private static int[][] getContingencyTable( final Map sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - // ignore reduced reads because they are always on the forward strand! - // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test - if ( p.getRead().isReducedRead() ) - continue; - if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions continue; @@ -300,7 +295,7 @@ private static int[][] getSNPContingencyTable(final Map