Commit d1afa73

Merge pull request #1378 from deeptools/bamComp_fix
Bam comp fix
2 parents c8f38b5 + d669a3a commit d1afa73

4 files changed: +115 additions, -147 deletions

pydeeptools/deeptools/multiBamSummary2.py

Lines changed: 3 additions & 71 deletions
@@ -206,7 +206,9 @@ def process_args(args=None):
             args.labels = smartLabels(args.bamfiles)
         else:
             args.labels = [os.path.basename(x) for x in args.bamfiles]
-
+    if not args.outFileName:
+        print("Please provide an output file name.")
+        exit(0)
     if not args.BED:
         args.BED = []
     if not args.region:
@@ -268,73 +270,3 @@ def main(args=None):
         args.exonID,
         args.transcript_id_designator
     )
-
-    # if 'BED' in args:
-    #     bed_regions = args.BED
-    # else:
-    #     bed_regions = None
-
-    # if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors):
-    #     sys.stderr.write("You've input a single BAM file and not specified "
-    #                      "--outRawCounts or --scalingFactors. The resulting output will NOT be "
-    #                      "useful with any deepTools program!\n")
-
-    # stepsize = args.binSize + args.distanceBetweenBins
-    # c = countR.CountReadsPerBin(
-    #     args.bamfiles,
-    #     args.binSize,
-    #     numberOfSamples=None,
-    #     genomeChunkSize=args.genomeChunkSize,
-    #     numberOfProcessors=args.numberOfProcessors,
-    #     verbose=args.verbose,
-    #     region=args.region,
-    #     bedFile=bed_regions,
-    #     blackListFileName=args.blackListFileName,
-    #     extendReads=args.extendReads,
-    #     minMappingQuality=args.minMappingQuality,
-    #     ignoreDuplicates=args.ignoreDuplicates,
-    #     center_read=args.centerReads,
-    #     samFlag_include=args.samFlagInclude,
-    #     samFlag_exclude=args.samFlagExclude,
-    #     minFragmentLength=args.minFragmentLength,
-    #     maxFragmentLength=args.maxFragmentLength,
-    #     stepSize=stepsize,
-    #     zerosToNans=False,
-    #     out_file_for_raw_data=args.outRawCounts)
-
-    # num_reads_per_bin = c.run(allArgs=args)
-
-    # sys.stderr.write("Number of bins "
-    #                  "found: {}\n".format(num_reads_per_bin.shape[0]))
-
-    # if num_reads_per_bin.shape[0] < 2:
-    #     exit("ERROR: too few non zero bins found.\n"
-    #          "If using --region please check that this "
-    #          "region is covered by reads.\n")
-
-    # # numpy will append .npz to the file name if we don't do this...
-    # if args.outFileName:
-    #     f = open(args.outFileName, "wb")
-    #     np.savez_compressed(f,
-    #                         matrix=num_reads_per_bin,
-    #                         labels=args.labels)
-    #     f.close()
-
-    # if args.scalingFactors:
-    #     f = open(args.scalingFactors, 'w')
-    #     f.write("sample\tscalingFactor\n")
-    #     scalingFactors = countR.estimateSizeFactors(num_reads_per_bin)
-    #     for sample, scalingFactor in zip(args.labels, scalingFactors):
-    #         f.write("{}\t{:6.4f}\n".format(sample, scalingFactor))
-    #     f.close()
-
-    # if args.outRawCounts:
-    #     # append to the generated file the
-    #     # labels
-    #     header = "#'chr'\t'start'\t'end'\t"
-    #     header += "'" + "'\t'".join(args.labels) + "'\n"
-    #     f = open(args.outRawCounts, 'r+')
-    #     content = f.read()
-    #     f.seek(0, 0)
-    #     f.write(header + content)
-    #     f.close()

src/bamcompare.rs

Lines changed: 39 additions & 40 deletions
@@ -8,7 +8,7 @@ use std::fs::File;
 use itertools::Itertools;
 use bigtools::{Value};
 use crate::filehandler::{bam_ispaired, write_covfile};
-use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider};
+use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, TempZip, region_divider};
 use crate::normalization::scale_factor_bamcompare;
 use crate::calc::{median, calc_ratio};
 use tempfile::{TempPath};
@@ -68,7 +68,7 @@ pub fn r_bamcompare(
     // Set up the bam files in a Vec.
     let bamfiles = vec![(bamifile1, ispe1), (bamifile2, ispe2)];

-    let covcalcs: Vec<ParsedBamFile> = pool.install(|| {
+    let mut covcalcs: Vec<ParsedBamFile> = pool.install(|| {
         bamfiles.par_iter()
             .map(|(bamfile, ispe)| {
                 let (bg, mapped, unmapped, readlen, fraglen) = regionblocks.par_iter()
@@ -102,45 +102,44 @@ pub fn r_bamcompare(
     println!("scale factor1 = {}, scale factor2 = {}", sf.0, sf.1);
     // Create output stream
     let mut chrom = "".to_string();
-    let lines = covcalcs[0].bg.iter().zip(covcalcs[1].bg.iter()).flat_map(
-        |(t1, t2)| {
-            let reader1 = BufReader::new(File::open(t1).unwrap()).lines();
-            let reader2 = BufReader::new(File::open(t2).unwrap()).lines();

-            reader1.zip(reader2).map(
-                |(l1, l2)| {
-                    let l1 = l1.unwrap();
-                    let l2 = l2.unwrap();
-                    let fields1: Vec<&str> = l1.split('\t').collect();
-                    let fields2: Vec<&str> = l2.split('\t').collect();
-
-                    let chrom1: String = fields1[0].to_string();
-                    let chrom2: String = fields2[0].to_string();
-                    let start1: u32 = fields1[1].parse().unwrap();
-                    let start2: u32 = fields2[1].parse().unwrap();
-                    let end1: u32 = fields1[2].parse().unwrap();
-                    let end2: u32 = fields2[2].parse().unwrap();
-
-                    // Assert the regions are equal.
-                    assert_eq!(chrom1, chrom2);
-                    assert_eq!(start1, start2);
-                    assert_eq!(end1, end2);
-
-                    // Calculate the coverage.
-                    let cov1: f32 = fields1[3].parse().unwrap();
-                    let cov2: f32 = fields2[3].parse().unwrap();
-                    let cov = calc_ratio(cov1, cov2, &sf.0, &sf.1, &pseudocount, operation);
-
-                    (chrom1, Value { start: start1, end: end1, value: cov })
-                }).coalesce(|p, c| {
-                    if p.1.value == c.1.value {
-                        Ok((p.0, Value {start: p.1.start, end: c.1.end, value: p.1.value}))
-                    } else {
-                        Err((p, c))
-                    }
-                })
-        }
-    );
+    // Extract both vecs of TempPaths into a single vector
+    let its = vec![
+        covcalcs[0].bg.drain(..).collect::<Vec<_>>(),
+        covcalcs[1].bg.drain(..).collect::<Vec<_>>()
+    ];
+    let its: Vec<_> = its.iter().map(|x| x.into_iter()).collect();
+    let zips = TempZip { iterators: its };
+    let zips_vec: Vec<_> = zips.collect();
+
+    let lines = zips_vec
+        .into_iter()
+        .flat_map(|c| {
+            let readers: Vec<_> = c.into_iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
+            let temp_zip = TempZip { iterators: readers };
+            temp_zip.into_iter().map(|mut _l| {
+                let lines: Vec<_> = _l
+                    .iter_mut()
+                    .map(|x| x.as_mut().unwrap())
+                    .map(|x| x.split('\t').collect())
+                    .map(|x: Vec<&str>| (x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap()))
+                    .collect();
+                assert_eq!(lines.len(), 2);
+                assert_eq!(lines[0].0, lines[1].0);
+                assert_eq!(lines[0].1, lines[1].1);
+                assert_eq!(lines[0].2, lines[1].2);
+                // Calculate the coverage.
+                let cov = calc_ratio(lines[0].3, lines[1].3, &sf.0, &sf.1, &pseudocount, operation);
+                (lines[0].0.clone(), Value { start: lines[0].1, end: lines[0].2, value: cov })
+            }).coalesce(|p, c| {
+                if p.1.value == c.1.value && p.0 == c.0 {
+                    Ok((p.0, Value {start: p.1.start, end: c.1.end, value: p.1.value}))
+                } else {
+                    Err((p, c))
+                }
+            })
+        });
+
     write_covfile(lines, ofile, ofiletype, chromsizes);
     Ok(())
 }
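
The core of the bamCompare fix above: the per-region bedGraph temp files of the two BAMs are now fed through the TempZip helper (defined in src/covcalc.rs below) instead of a hard-coded two-way .zip(), matching intervals are asserted equal, a ratio is computed per interval, and adjacent output intervals are coalesced only when both the value and the chromosome agree. What follows is a minimal, self-contained sketch of that zip / ratio / coalesce pattern, not the crate's implementation: the in-memory bedGraph strings, the pseudocount of 1.0, and the log2_ratio helper are invented stand-ins for the temp files, the CLI options, and calc_ratio.

// Sketch only: bg1/bg2 stand in for the per-region bedGraph temp files, and
// log2_ratio stands in for calc_ratio with scale factors already applied.
fn log2_ratio(a: f32, b: f32, pseudocount: f32) -> f32 {
    ((a + pseudocount) / (b + pseudocount)).log2()
}

fn main() {
    let bg1 = "chr1\t0\t100\t4.0\nchr1\t100\t200\t4.0\nchr1\t200\t300\t8.0";
    let bg2 = "chr1\t0\t100\t2.0\nchr1\t100\t200\t2.0\nchr1\t200\t300\t2.0";

    let mut merged: Vec<(String, u32, u32, f32)> = Vec::new();
    // Zip the two inputs line by line, the way the zipped BufReaders are consumed above.
    for (l1, l2) in bg1.lines().zip(bg2.lines()) {
        let f1: Vec<&str> = l1.split('\t').collect();
        let f2: Vec<&str> = l2.split('\t').collect();
        // Both files were written over the same bins, so chrom/start/end must agree.
        assert_eq!(f1[..3], f2[..3]);
        let chrom = f1[0].to_string();
        let start: u32 = f1[1].parse().unwrap();
        let end: u32 = f1[2].parse().unwrap();
        let val = log2_ratio(f1[3].parse().unwrap(), f2[3].parse().unwrap(), 1.0);

        // Coalesce: extend the previous interval only if chromosome and value match,
        // mirroring the `p.1.value == c.1.value && p.0 == c.0` guard in the diff.
        let extend = matches!(merged.last(), Some(prev) if prev.0 == chrom && prev.3 == val && prev.2 == start);
        if extend {
            merged.last_mut().unwrap().2 = end;
        } else {
            merged.push((chrom, start, end, val));
        }
    }

    // chr1 0-200 collapses into one interval; 200-300 keeps its own value.
    for (chrom, start, end, val) in &merged {
        println!("{}\t{}\t{}\t{}", chrom, start, end, val);
    }
}

The added same-chromosome check in the coalesce guard is what keeps equal-valued intervals from different contigs from being merged into one record when their temp files are streamed back to back.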

src/covcalc.rs

Lines changed: 69 additions & 17 deletions
@@ -168,30 +168,69 @@ pub fn bam_pileup<'a>(

     if binsize > &1 {
         let mut counts: Vec<f32>;
+        let mut startstr: String = region.1.to_string();
+        let mut endstr: String = region.2.to_string();

         if gene_mode {
-            counts = vec![0.0; 1];
-            for record in bam.records() {
-                let record = record.expect("Error parsing record.");
-                if !ignorechr.contains(&region.0) {
-                    if record.is_unmapped() {
-                        unmapped_reads += 1;
-                    } else {
-                        mapped_reads += 1;
-                        if *ispe {
-                            if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
-                                fraglens.push(record.insert_size().abs() as u32);
+            // It could be that we are in metagene mode, i.e. we only want counts over exons
+            // In this we need another iter - fetch per regstruct
+            match (regstruct.start.clone(), regstruct.end.clone()) {
+                (Revalue::U(start), Revalue::U(end)) => {
+                    counts = vec![0.0; 1];
+                    for record in bam.records() {
+                        let record = record.expect("Error parsing record.");
+                        if !ignorechr.contains(&region.0) {
+                            if record.is_unmapped() {
+                                unmapped_reads += 1;
+                            } else {
+                                mapped_reads += 1;
+                                if *ispe {
+                                    if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
+                                        fraglens.push(record.insert_size().abs() as u32);
+                                    }
+                                }
+                                readlens.push(record.seq_len() as u32);
                             }
                         }
-                        readlens.push(record.seq_len() as u32);
+                        counts[0] += 1.0;
                     }
-                }
-                counts[0] += 1.0;
+                },
+                (Revalue::V(starts), Revalue::V(ends)) => {
+                    // Make a string with the start values comma separated
+                    startstr = starts.iter().map(|x| x.to_string()).collect::<Vec<String>>().join(",");
+                    endstr = ends.iter().map(|x| x.to_string()).collect::<Vec<String>>().join(",");
+
+                    counts = vec![0.0; 1];
+                    let exons: Vec<(u32, u32)> = starts.iter().zip(ends.iter())
+                        .map(|(&s, &e)| (s, e))
+                        .collect();
+                    for exon in exons {
+                        bam.fetch((regstruct.chrom.as_str(), exon.0, exon.1))
+                            .expect(&format!("Error fetching region: {}:{},{}", regstruct.chrom, exon.0, exon.1));
+                        for record in bam.records() {
+                            let record = record.expect("Error parsing record.");
+                            if !ignorechr.contains(&region.0) {
+                                if record.is_unmapped() {
+                                    unmapped_reads += 1;
+                                } else {
+                                    mapped_reads += 1;
+                                    if *ispe {
+                                        if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
+                                            fraglens.push(record.insert_size().abs() as u32);
+                                        }
+                                    }
+                                    readlens.push(record.seq_len() as u32);
+                                }
+                            }
+                            counts[0] += 1.0;
+                        }
+                    }
+                },
+                _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}.",regstruct.name),
             }
         } else {
             // populate the bg vector with 0 counts over all bins
             counts = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize];
-            println!("LENGTH OF THE VECO BRO {:?}", counts.len());
             // let mut binstart = region.1;
             let mut binix: u32 = 0;

@@ -217,7 +256,6 @@ pub fn bam_pileup<'a>(
                     .flat_map(|x| x[0] as u32..x[1] as u32)
                     .map(|x| (x / binsize) as usize)
                     .collect();
-                println!("INDICES = {:?}", indices);
                 indices.into_iter()
                     .for_each(|ix| counts[ix] += 1.0);
             }
@@ -227,7 +265,7 @@ pub fn bam_pileup<'a>(
         // bamCoverage mode -> we can collapse bins with same coverage (collapse = true)
         // bamCompare & others -> We cannot collapse the bins, yet. (collapse = false)
         if counts.len() == 1 {
-            writeln!(writer, "{}\t{}\t{}\t{}", region.0, region.1, region.2, counts[0]).unwrap();
+            writeln!(writer, "{}\t{}\t{}\t{}", region.0, startstr, endstr, counts[0]).unwrap();
         } else {
             if collapse {
                 let mut lcov = counts[0];
@@ -1438,3 +1476,17 @@ impl Bin {
         }
     }
 }
+
+pub struct TempZip<I>
+where I: Iterator {
+    pub iterators: Vec<I>
+}
+
+impl<I, T> Iterator for TempZip<I>
+where I: Iterator<Item=T> {
+    type Item = Vec<T>;
+    fn next(&mut self) -> Option<Self::Item> {
+        let o: Option<Vec<T>> = self.iterators.iter_mut().map(|x| x.next()).collect();
+        o
+    }
+}
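
TempZip, moved here from src/multibamsummary.rs and made public, is a small N-way zip: each next() advances every wrapped iterator once and yields a Vec with one item per iterator, and it stops at the first exhausted iterator because collecting Options into Option<Vec<T>> returns None as soon as any inner next() does. A small usage sketch follows; the struct and impl are copied from the diff, while main() and its input vectors are made up for illustration.

// TempZip exactly as introduced in covcalc.rs: an N-way zip over a Vec of iterators.
pub struct TempZip<I>
where I: Iterator {
    pub iterators: Vec<I>
}

impl<I, T> Iterator for TempZip<I>
where I: Iterator<Item=T> {
    type Item = Vec<T>;
    fn next(&mut self) -> Option<Self::Item> {
        // Collecting Options short-circuits to None once any inner iterator is exhausted.
        let o: Option<Vec<T>> = self.iterators.iter_mut().map(|x| x.next()).collect();
        o
    }
}

fn main() {
    // Made-up inputs; in bamCompare the inner iterators walk per-region temp file paths
    // and, one level down, the lines of those files.
    let a = vec![1, 2, 3];
    let b = vec![10, 20, 30];
    let c = vec![100, 200]; // shortest input: the zip stops after two rows
    let zipped = TempZip { iterators: vec![a.into_iter(), b.into_iter(), c.into_iter()] };
    for row in zipped {
        println!("{:?}", row); // prints [1, 10, 100] then [2, 20, 200]
    }
}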

src/multibamsummary.rs

Lines changed: 4 additions & 19 deletions
@@ -13,7 +13,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::path::Path;
 use std::sync::{Arc, Mutex};
-use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider};
+use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, TempZip, region_divider};
 use crate::filehandler::{bam_ispaired, read_bedfile, read_gtffile, chrombounds_from_bam, is_bed_or_gtf};
 use crate::calc::{median, calc_ratio, deseq_scalefactors};
 use crate::bamcompare::ParsedBamFile;
@@ -184,17 +184,17 @@ pub fn r_mbams(
         .flat_map(|c| {
            let readers: Vec<_> = c.par_iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
            let mut _matvec: Vec<Vec<f32>> = Vec::new();
-           let mut _regions: Vec<(String, u32, u32)> = Vec::new();
+           let mut _regions: Vec<(String, String, String)> = Vec::new();
            for mut _l in (TempZip { iterators: readers }) {
                // unwrap all lines in _l
                let lines: Vec<_> = _l
                    .par_iter_mut()
                    .map(|x| x.as_mut().unwrap())
                    .map(|x| x.split('\t').collect())
-                   .map(|x: Vec<&str> | ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap() ) )
+                   .map(|x: Vec<&str> | ( x[0].to_string(), x[1].to_string(), x[2].to_string(), x[3].parse::<f32>().unwrap() ) )
                    .collect();
                let counts = lines.par_iter().map(|x| x.3).collect::<Vec<_>>();
-               let regions: (String, u32, u32) = (lines[0].0.clone(), lines[0].1, lines[0].2);
+               let regions: (String, String, String) = (lines[0].0.clone(), lines[0].1.clone(), lines[0].2.clone());
                _matvec.push(counts);
                _regions.push(regions);
            }
@@ -267,20 +267,5 @@ pub fn r_mbams(
     if verbose {
         println!("Matrix written.");
     }
-
     Ok(())
-}
-
-struct TempZip<I>
-where I: Iterator {
-    iterators: Vec<I>
-}
-
-impl<I, T> Iterator for TempZip<I>
-where I: Iterator<Item=T> {
-    type Item = Vec<T>;
-    fn next(&mut self) -> Option<Self::Item> {
-        let o: Option<Vec<T>> = self.iterators.iter_mut().map(|x| x.next()).collect();
-        o
-    }
 }
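
The switch of _regions from (String, u32, u32) to (String, String, String) follows from the covcalc.rs change above: in metagene mode the intermediate count lines now carry comma-separated exon starts and ends (startstr/endstr) in the start and end columns, which no longer parse as a single u32, so the columns are kept as strings. A small sketch of that parsing, with invented sample lines, just to show why string columns cover both the plain-bin and the metagene case; it is not the r_mbams code itself.

// Sketch: parse intermediate count lines where start/end may be a single coordinate
// or a comma-separated exon list (metagene mode). Keeping them as Strings, as the
// r_mbams change does, handles both shapes uniformly.
fn parse_line(line: &str) -> (String, String, String, f32) {
    let f: Vec<&str> = line.split('\t').collect();
    (f[0].to_string(), f[1].to_string(), f[2].to_string(), f[3].parse::<f32>().unwrap())
}

fn main() {
    // Plain bin: start/end are single coordinates.
    let bin = "chr1\t0\t10000\t42.0";
    // Metagene region: start/end are the exon start/end lists written by bam_pileup.
    let gene = "chr2\t100,500,900\t200,650,1200\t17.0";

    for line in [bin, gene] {
        let (chrom, start, end, count) = parse_line(line);
        // With the old (String, u32, u32) layout, start.parse::<u32>() would panic
        // on "100,500,900"; as Strings both lines pass through unchanged.
        println!("{} [{} - {}] -> {}", chrom, start, end, count);
    }
}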
