Skip to content

Commit

Permalink
Merge branch 'master' of github.com:chapmanb/bcbio.variation
Browse files Browse the repository at this point in the history
  • Loading branch information
chapmanb committed Mar 13, 2014
2 parents 5131b26 + d624043 commit 2b6d257
Show file tree
Hide file tree
Showing 24 changed files with 353 additions and 72 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ test/data/phased/*-cmp*.vcf
test/data/phased/work/
test/data/digrade/*.idx
test/data/digrade/*.csv
test/data/digrade/*-sorted.bed
test/data/digrade/hg19/*-sorted.bed
test/data/digrade/work/
test/data/svs/*.idx
test/data/svs/*-sorted.bed
Expand Down
29 changes: 29 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
## 0.1.5 (In progress)

- Move to MIT licensed GATK 3.0 framework.
- Support lightweight loading options for gemini integration to avoid large load
times with new gene tables in gemini 0.6.5.

## 0.1.4 (5 March 2014)

- Allow bgzipped/tabix inputs to validation.
- Improve representation of ensemble variants ensuring standard expected keys
are present when available in an individual genotype call. Thanks to Shalabh
Suman.

## 0.1.3 (15 January 2014)

- Update dependencies to the GATK 2.8.1 MIT licensed framework and associated
Picard, Tribble and variant libraries. Silence phone home events from library.
- Avoid errors on converting hg19 to GRCh37 where hg19 variants contain hg19 hap
contigs with no equivalent in GRCh37. It now drops these variants instead
of generating an error. Thanks to Severine Catreux.
- Ensure BED files passed to evaluations are converted to reference material
coordinates when inputs differ (hg19->GRCh37). Thanks to Severine Catreux.
- Avoid issues with running LeftAlignVariants on indels with END tags. Thanks to
Justin Johnson.
- Move to external bcbio.run tool to help abstract out some core functionality
useful in other contexts.
- Additional flexibility for Illumina to valid VCF preparation. Allow ignoring
SVs or other file types.

## 0.1.2 (4 December 2013)

- Handle metrics evaluation with GATK where input calls have haploid chromosomes,
Expand Down
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,18 @@ associated with different variant representations.

### Download

The latest release is 0.1.2 (4 December 2013): [bcbio.variation-0.1.2-standalone.jar][dl].
The latest release is 0.1.4 (5 March 2014): [bcbio.variation-0.1.4-standalone.jar][dl].
Run from the command line:

$ java -jar bcbio.variation-VERSION-standalone.jar [arguments]

The jar contains a full GATK commandline with additional walkers, as well as
custom command line programs. See the usage section below for more details.
custom command line programs. The usage section below contains examples of using
the library for variant comparison, normalization and ensemble calling. Note
that bcbio.variation requires Java 1.7 since the underlying GATK libraries are
not compatible with earlier versions.

[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.2/bcbio.variation-0.1.2-standalone.jar
[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.4/bcbio.variation-0.1.4-standalone.jar

### As a library

Expand Down
12 changes: 6 additions & 6 deletions project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject bcbio.variation "0.1.3-SNAPSHOT"
(defproject bcbio.variation "0.1.5-SNAPSHOT"
:description "Toolkit to analyze genomic variation data, built on the GATK with Clojure"
:license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"}
:dependencies [[org.clojure/clojure "1.5.1"]
Expand All @@ -8,11 +8,11 @@
[clj-stacktrace "0.2.5"]
[bcbio.run "0.0.1-SNAPSHOT"]
;; GATK requirements
[org.clojars.chapmanb/gatk-lite "2.7.2"]
[org.clojars.chapmanb/picard "1.96"]
[org.clojars.chapmanb/sam "1.96"]
[org.clojars.chapmanb/tribble "1.96"]
[org.clojars.chapmanb/variant "1.96"]
[org.clojars.chapmanb/gatk-framework "3.0"]
[org.clojars.chapmanb/picard "1.107"]
[org.clojars.chapmanb/sam "1.107"]
[org.clojars.chapmanb/tribble "1.107"]
[org.clojars.chapmanb/variant "1.107"]
[colt/colt "1.2.0"]
[commons-lang "2.5"]
[log4j "1.2.17"]
Expand Down
18 changes: 18 additions & 0 deletions src/bcbio/align/interval.clj
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,21 @@
(map (partial update-contig-name name-map))
(remove nil?)))))))
out-file))

(defn- maybe-remap-bed
[interval int-ref base-ref out-dir]
(when interval
(if (and (not (nil? int-ref)) (not= base-ref int-ref))
(rename-bed interval base-ref :out-dir out-dir)
interval)))

(defn prep-multi
"Prepare multiple input intervals, remapping chromosome names as needed."
([to-prep ref-file orig out-dir]
(->> to-prep
(map (juxt :intervals :ref))
(map (fn [[bed ref]] (maybe-remap-bed bed ref ref-file out-dir)))
(concat (flatten [orig]))
(remove nil?)))
([to-prep ref-file out-dir]
(prep-multi to-prep ref-file [] out-dir)))
12 changes: 6 additions & 6 deletions src/bcbio/variation/combine.clj
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@
base-name (if (nil? base-ext) full-base-name
(format "%s-%s.vcf" (first (string/split full-base-name #"-"))
base-ext))
file-info {:out-vcf (str (fs/file base-dir
(fsp/add-file-part base-name
(case merge-type
:minimal "mincombine"
:full "fullcombine"
"combine"))))}
file-info {:out-vcf (fsp/add-file-part base-name
(case merge-type
:minimal "mincombine"
:full "fullcombine"
"combine")
base-dir)}
args (concat ["-R" ref
"-o" :out-vcf
"--rod_priority_list" (string/join "," (map-indexed unique-name vcfs))]
Expand Down
12 changes: 6 additions & 6 deletions src/bcbio/variation/compare.clj
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
[bcbio.variation.evaluate :only [calc-variant-eval-metrics]]
[bcbio.variation.filter :only [variant-filter variant-format-filter
pipeline-recalibration]]
[bcbio.variation.filter.intervals :only [combine-multiple-intervals]]
[bcbio.variation.multiple :only [prep-cmp-name-lookup pipeline-compare-multiple]]
[bcbio.variation.multisample :only [compare-two-vcf-flexible
multiple-samples?]]
Expand All @@ -32,9 +31,11 @@
[clj-yaml.core :as yaml]
[me.raynes.fs :as fs]
[lonocloud.synthread :as ->]
[bcbio.align.interval :as ainterval]
[bcbio.run.fsp :as fsp]
[bcbio.run.itx :as itx]
[bcbio.run.broad :as broad]
[bcbio.variation.filter.intervals :as fintervals]
[bcbio.variation.grade :as grade]
[bcbio.variation.phasing :as phasing]
[bcbio.variation.report :as report]
Expand All @@ -46,8 +47,7 @@
"Variant comparison producing 3 files: concordant and both directions discordant"
[sample call1 call2 ref & {:keys [out-dir intervals]}]
(let [base-dir (if (nil? out-dir) (fs/parent (:file call1)) out-dir)
ready-intervals (remove nil? (flatten [intervals (:intervals call1)
(:intervals call2)]))
ready-intervals (ainterval/prep-multi [call1 call2] ref intervals out-dir)
sample (or sample (-> call1 :file gvc/get-vcf-header .getGenotypeSamples first))]
(if-not (fs/exists? base-dir)
(fs/mkdirs base-dir))
Expand Down Expand Up @@ -116,7 +116,7 @@
(let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))
transition (partial do-transition config)
align-bams (prepare-input-bams exp out-dir)
all-intervals (remove nil? (map :intervals (cons exp (:calls exp))))
all-intervals (ainterval/prep-multi (cons exp (:calls exp)) (:ref exp) out-dir)
start-vcfs (vec (map #(gatk-normalize % exp all-intervals out-dir transition)
(:calls exp)))
_ (transition :combine "Creating merged VCF files for all comparisons")
Expand Down Expand Up @@ -148,8 +148,8 @@
(let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))
align-bams (remove nil? (map :align [c1 c2]))]
(when (and (:intervals exp) (seq align-bams))
(combine-multiple-intervals (:intervals exp) align-bams (:ref exp)
:out-dir out-dir :name (:sample exp)))))
(fintervals/combine-multiple (:intervals exp) align-bams (:ref exp)
:out-dir out-dir :name (:sample exp)))))
(discordant-name [x]
(format "%s-discordant" (:name x)))
(zipmap-ordered [xs1 xs2]
Expand Down
12 changes: 10 additions & 2 deletions src/bcbio/variation/ensemble.clj
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
[me.raynes.fs :as fs]
[bcbio.run.fsp :as fsp]
[bcbio.run.itx :as itx]
[bcbio.variation.compare :as compare]))
[bcbio.variation.compare :as compare]
[bcbio.variation.variantcontext :as gvc]))

(defn- setup-work-dir
"Create working directory for ensemble consensus calling."
Expand Down Expand Up @@ -69,7 +70,12 @@
:xspecific true
:trusted {:total (get-in config [:ensemble :trusted-pct] 0.65)}}}]}
(->/when-let [int-file (:intervals config)]
(assoc :intervals int-file)))]}
(assoc :intervals int-file))
(->/when-let [sample-name (let [samples (-> vrn-files first gvc/get-vcf-header
.getGenotypeSamples)]
(when (= 1 (count samples))
(first samples)))]
(assoc :sample sample-name)))]}
yaml/generate-string
(spit out-file))
out-file))
Expand All @@ -86,6 +92,8 @@
config-file (create-ready-config vrn-files ref-file in-config dirs)]
(compare/variant-comparison-from-config config-file)
(let [prep-file (first (fs/glob (str (io/file (:prep dirs) "*cfilter.vcf"))))]
(assert prep-file (str "Did not find prepped and filtered consensus file. "
"Do you have classifiers specified in the input YAML file?"))
(fs/copy prep-file out-file)
(fs/copy (str prep-file ".idx") (str out-file ".idx"))))
out-file)
Expand Down
15 changes: 8 additions & 7 deletions src/bcbio/variation/filter/intervals.clj
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@
(catch UserException$BadInput e []))
(rest intervals)))))

;; ## Merge BED files

(defn- prep-intervals-by-contig
"Intersect and exclude intervals on a contig."
[start-intervals exclude-intervals loc-parser combine-rule]
Expand Down Expand Up @@ -109,7 +111,7 @@
combine-rule)
contigs))))

(defn combine-multiple-intervals
(defn combine-multiple
"Combine intervals from an initial BED and coverage BAM files."
[initial-bed align-bams ref & {:keys [out-dir name exclude-intervals combine-rule
more-beds]}]
Expand Down Expand Up @@ -137,9 +139,8 @@
(let [base-intervals (:intervals exp)
all-aligns (set (remove nil? (map :align (cons exp (:calls exp)))))]
(when (and base-intervals (seq all-aligns))
(combine-multiple-intervals base-intervals all-aligns
(:ref exp)
:exclude-intervals (:exclude-intervals exp)
:name (:sample exp)
:out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))))))

(combine-multiple base-intervals all-aligns
(:ref exp)
:exclude-intervals (:exclude-intervals exp)
:name (:sample exp)
:out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))))))
12 changes: 7 additions & 5 deletions src/bcbio/variation/index/gemini.clj
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@
(when-let [gemini-cmd (get-gemini-cmd)]
(itx/with-tx-file [tx-index index-file]
(let [info (apply shell/sh
(concat [gemini-cmd "load" "-v" in-file]
(concat [gemini-cmd "load" "-v" in-file "--skip-gene-tables"]
(when (.contains in-file "/test/data/")
["--test-mode"])
(when (has-snpeff-anns? in-file)
["-t" "snpEff"])
[tx-index]))]
Expand Down Expand Up @@ -227,11 +229,11 @@
(defn vc-attr-retriever
"Retrieve metrics by name from a gemini index for provided VariantContexts."
[in-file ref-file]
(let [pool (future (when-let [index-db (index-variant-file in-file ref-file)]
(get-sqlite-db-pool index-db)))]
(let [pool (when-let [index-db (index-variant-file in-file ref-file)]
(get-sqlite-db-pool index-db))]
(fn [vc attr]
(when @pool
(sql/with-connection @pool
(when pool
(sql/with-connection pool
(sql/with-query-results rows
[(str "SELECT " (string/join "," (attr->colnames attr))
" FROM variants WHERE chrom = ? AND start = ? and ref = ?")
Expand Down
18 changes: 16 additions & 2 deletions src/bcbio/variation/normalize.clj
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
[lonocloud.synthread :as ->]
[bcbio.run.fsp :as fsp]
[bcbio.run.itx :as itx]
[bcbio.variation.structural :as structural]
[bcbio.variation.variantcontext :as gvc]))

;; ## Chromosome name remapping
Expand Down Expand Up @@ -157,6 +158,18 @@
(.genotypes (map normalize-allele-calls (update-genotype-sample (:vc orig) sample)))
.make)))))

(defn- remove-extra-end
"Remove END alleles present in non-structural variants.
END tags in indels cause issues downstream in LeftAlignVariants if moved."
[vc]
(if (and (contains? (:attributes vc) "END")
(nil? (structural/get-sv-type vc {:max-indel-size Integer/MAX_VALUE})))
(assoc vc :vc
(-> (VariantContextBuilder. (:vc vc))
(.rmAttribute "END")
.make))
vc))

(defn- no-call-genotype?
"Check if a variant has a non-informative no-call genotype."
[vc config]
Expand Down Expand Up @@ -240,6 +253,7 @@
(remove #(no-call-genotype? % config))
(map (partial normalize-sv-genotype config sample))
(map (partial fix-vc sample config))
(map remove-extra-end)
(map :vc)))

(defn- fix-vcf-line
Expand Down Expand Up @@ -276,8 +290,8 @@
(let [line-info (fix-vcf-line line chr-map config)]
(if-let [wtr (get ref-wrtrs (:chrom line-info))]
(.write wtr (str (:line line-info) "\n"))
(throw (Exception. (format "Could not find remapping of chromosome %s in reference: %s"
(:chrom line-info) (keys ref-wrtrs)))))))]
(println (format "Could not find remapping of chromosome %s in reference. Removing variant."
(:chrom line-info))))))]
(let [ref-chrs (ref-chr-files ref-file)
ref-wrtrs (zipmap (keys ref-chrs) (map writer (vals ref-chrs)))
chr-map (chr-name-remap (:prep-org config) ref-file orig-ref-file)]
Expand Down
15 changes: 9 additions & 6 deletions src/bcbio/variation/recall.clj
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,12 @@
:call-type (:type g)
:ref-allele (:ref-allele vc)
:alleles (sort-by allele-order (:alleles g))
:attributes (select-keys (:attributes g) ["PL" "DP" "AD" "PVAL"])
:attributes (select-keys (:attributes g) ["PL" "DP" "AD" "PVAL" "GQ"])
:has-likelihood (if (seq (get-in g [:attributes "PL"])) 1 0)
:attr-count (+ (if (seq (get-in g [:attributes "PL"])) 1 0)
(if (seq (get-in g [:attributes "PVAL"])) 1 0)
(if (seq (get-in g [:attributes "AD"])) 1 0)
(if (get-in g [:attributes "GQ"]) 1 0)
(if (get-in g [:attributes "DP"]) 1 0))
:pl (attr/get-pl g)
})))
Expand All @@ -79,11 +80,13 @@
(apply + (remove nil? (map k xs))))
(sum-plus-call-type [i xs]
(let [pls (safe-sum xs :pl)
represent-x (last (sort-by #(vector (:has-likelihood %)
(or (:pl %) (- Integer/MIN_VALUE))
(:attr-count %))
best-x (last (sort-by #(vector (:has-likelihood %)
(or (:pl %) (- Integer/MIN_VALUE))
(:attr-count %))
xs))
represent-x (last (sort-by #(vector (:has-likelihood %) (:attr-count %))
xs))
call-code (if (= "HET" (:call-type represent-x)) 0 1)]
call-code (if (= "HET" (:call-type best-x)) 0 1)]
[(count xs) call-code (- pls) i represent-x]))]
(->> alleles
(group-by :alleles)
Expand Down Expand Up @@ -128,7 +131,7 @@
(.attributes (-> (map :attributes (reverse other-vcs))
(#(apply merge %))
(assoc "set" (update-set-info (get-in vc [:attributes "set"]) calls))))
(.genotypes (gvc/create-genotypes most-likely-gs :attrs #{"PL" "PVAL" "DP" "AD" "AO"}))
(.genotypes (gvc/create-genotypes most-likely-gs :attrs #{"PL" "PVAL" "DP" "AD" "AO" "GQ"}))
.make))))

(defn- recall-w-consensus
Expand Down
5 changes: 2 additions & 3 deletions src/bcbio/variation/utils/svmerge.clj
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@
(-> (combine/combine-variants [call-safesv svready-file] ref-file
:merge-type :full)
(fs/rename (:calls out-files)))
(-> (intervals/combine-multiple-intervals region-file [] ref-file
:combine-rule :union
:more-beds [sv-bed])
(-> (intervals/combine-multiple region-file [] ref-file
:combine-rule :union :more-beds [sv-bed])
(fs/rename (:regions out-files)))
))
out-files))
Loading

0 comments on commit 2b6d257

Please sign in to comment.