
Merge pull request #1378 from deeptools/bamComp_fix
Bam comp fix
WardDeb authored Jan 31, 2025
2 parents c8f38b5 + d669a3a commit d1afa73
Showing 4 changed files with 115 additions and 147 deletions.
74 changes: 3 additions & 71 deletions pydeeptools/deeptools/multiBamSummary2.py
@@ -206,7 +206,9 @@ def process_args(args=None):
args.labels = smartLabels(args.bamfiles)
else:
args.labels = [os.path.basename(x) for x in args.bamfiles]

if not args.outFileName:
print("Please provide an output file name.")
exit(0)
if not args.BED:
args.BED = []
if not args.region:
@@ -268,73 +270,3 @@ def main(args=None):
args.exonID,
args.transcript_id_designator
)

# if 'BED' in args:
# bed_regions = args.BED
# else:
# bed_regions = None

# if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors):
# sys.stderr.write("You've input a single BAM file and not specified "
# "--outRawCounts or --scalingFactors. The resulting output will NOT be "
# "useful with any deepTools program!\n")

# stepsize = args.binSize + args.distanceBetweenBins
# c = countR.CountReadsPerBin(
# args.bamfiles,
# args.binSize,
# numberOfSamples=None,
# genomeChunkSize=args.genomeChunkSize,
# numberOfProcessors=args.numberOfProcessors,
# verbose=args.verbose,
# region=args.region,
# bedFile=bed_regions,
# blackListFileName=args.blackListFileName,
# extendReads=args.extendReads,
# minMappingQuality=args.minMappingQuality,
# ignoreDuplicates=args.ignoreDuplicates,
# center_read=args.centerReads,
# samFlag_include=args.samFlagInclude,
# samFlag_exclude=args.samFlagExclude,
# minFragmentLength=args.minFragmentLength,
# maxFragmentLength=args.maxFragmentLength,
# stepSize=stepsize,
# zerosToNans=False,
# out_file_for_raw_data=args.outRawCounts)

# num_reads_per_bin = c.run(allArgs=args)

# sys.stderr.write("Number of bins "
# "found: {}\n".format(num_reads_per_bin.shape[0]))

# if num_reads_per_bin.shape[0] < 2:
# exit("ERROR: too few non zero bins found.\n"
# "If using --region please check that this "
# "region is covered by reads.\n")

# # numpy will append .npz to the file name if we don't do this...
# if args.outFileName:
# f = open(args.outFileName, "wb")
# np.savez_compressed(f,
# matrix=num_reads_per_bin,
# labels=args.labels)
# f.close()

# if args.scalingFactors:
# f = open(args.scalingFactors, 'w')
# f.write("sample\tscalingFactor\n")
# scalingFactors = countR.estimateSizeFactors(num_reads_per_bin)
# for sample, scalingFactor in zip(args.labels, scalingFactors):
# f.write("{}\t{:6.4f}\n".format(sample, scalingFactor))
# f.close()

# if args.outRawCounts:
# # append to the generated file the
# # labels
# header = "#'chr'\t'start'\t'end'\t"
# header += "'" + "'\t'".join(args.labels) + "'\n"
# f = open(args.outRawCounts, 'r+')
# content = f.read()
# f.seek(0, 0)
# f.write(header + content)
# f.close()
79 changes: 39 additions & 40 deletions src/bamcompare.rs
@@ -8,7 +8,7 @@ use std::fs::File;
use itertools::Itertools;
use bigtools::{Value};
use crate::filehandler::{bam_ispaired, write_covfile};
use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider};
use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, TempZip, region_divider};
use crate::normalization::scale_factor_bamcompare;
use crate::calc::{median, calc_ratio};
use tempfile::{TempPath};
@@ -68,7 +68,7 @@ pub fn r_bamcompare(
// Set up the bam files in a Vec.
let bamfiles = vec![(bamifile1, ispe1), (bamifile2, ispe2)];

let covcalcs: Vec<ParsedBamFile> = pool.install(|| {
let mut covcalcs: Vec<ParsedBamFile> = pool.install(|| {
bamfiles.par_iter()
.map(|(bamfile, ispe)| {
let (bg, mapped, unmapped, readlen, fraglen) = regionblocks.par_iter()
@@ -102,45 +102,44 @@
println!("scale factor1 = {}, scale factor2 = {}", sf.0, sf.1);
// Create output stream
let mut chrom = "".to_string();
let lines = covcalcs[0].bg.iter().zip(covcalcs[1].bg.iter()).flat_map(
|(t1, t2)| {
let reader1 = BufReader::new(File::open(t1).unwrap()).lines();
let reader2 = BufReader::new(File::open(t2).unwrap()).lines();

reader1.zip(reader2).map(
|(l1, l2)| {
let l1 = l1.unwrap();
let l2 = l2.unwrap();
let fields1: Vec<&str> = l1.split('\t').collect();
let fields2: Vec<&str> = l2.split('\t').collect();

let chrom1: String = fields1[0].to_string();
let chrom2: String = fields2[0].to_string();
let start1: u32 = fields1[1].parse().unwrap();
let start2: u32 = fields2[1].parse().unwrap();
let end1: u32 = fields1[2].parse().unwrap();
let end2: u32 = fields2[2].parse().unwrap();

// Assert the regions are equal.
assert_eq!(chrom1, chrom2);
assert_eq!(start1, start2);
assert_eq!(end1, end2);

// Calculate the coverage.
let cov1: f32 = fields1[3].parse().unwrap();
let cov2: f32 = fields2[3].parse().unwrap();
let cov = calc_ratio(cov1, cov2, &sf.0, &sf.1, &pseudocount, operation);

(chrom1, Value { start: start1, end: end1, value: cov })
}).coalesce(|p, c| {
if p.1.value == c.1.value {
Ok((p.0, Value {start: p.1.start, end: c.1.end, value: p.1.value}))
} else {
Err((p, c))
}
})
}
);
// Extract both vecs of TempPaths into a single vector
let its = vec![
covcalcs[0].bg.drain(..).collect::<Vec<_>>(),
covcalcs[1].bg.drain(..).collect::<Vec<_>>()
];
let its: Vec<_> = its.iter().map(|x| x.into_iter()).collect();
let zips = TempZip { iterators: its };
let zips_vec: Vec<_> = zips.collect();

let lines = zips_vec
.into_iter()
.flat_map(|c| {
let readers: Vec<_> = c.into_iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
let temp_zip = TempZip { iterators: readers };
temp_zip.into_iter().map(|mut _l| {
let lines: Vec<_> = _l
.iter_mut()
.map(|x| x.as_mut().unwrap())
.map(|x| x.split('\t').collect())
.map(|x: Vec<&str>| (x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap()))
.collect();
assert_eq!(lines.len(), 2);
assert_eq!(lines[0].0, lines[1].0);
assert_eq!(lines[0].1, lines[1].1);
assert_eq!(lines[0].2, lines[1].2);
// Calculate the coverage.
let cov = calc_ratio(lines[0].3, lines[1].3, &sf.0, &sf.1, &pseudocount, operation);
(lines[0].0.clone(), Value { start: lines[0].1, end: lines[0].2, value: cov })
}).coalesce(|p, c| {
if p.1.value == c.1.value && p.0 == c.0 {
Ok((p.0, Value {start: p.1.start, end: c.1.end, value: p.1.value}))
} else {
Err((p, c))
}
})
});

write_covfile(lines, ofile, ofiletype, chromsizes);
Ok(())
}
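
The rewritten loop above no longer zips exactly two readers by hand: it drains the per-region temp files of both BAMs, pairs them region-by-region with TempZip (now exported from src/covcalc.rs, see below), turns each pair of matching bedGraph lines into one ratio interval, and merges runs of identical values with itertools' coalesce. The following is a minimal, self-contained sketch of that ratio-and-merge step under illustrative assumptions: log2_ratio is a hypothetical stand-in for calc_ratio with the log2 operation, and the scale factors, pseudocount, and coverage values are made up, not taken from the shipped code.

// Minimal sketch (not the shipped implementation) of the ratio + coalesce step.
use itertools::Itertools;

#[derive(Debug, Clone, Copy, PartialEq)]
struct Interval { start: u32, end: u32, value: f32 }

// Hypothetical stand-in for calc_ratio with the "log2" operation.
fn log2_ratio(cov1: f32, cov2: f32, sf1: f32, sf2: f32, pseudocount: f32) -> f32 {
    ((cov1 * sf1 + pseudocount) / (cov2 * sf2 + pseudocount)).log2()
}

fn main() {
    // Two already-zipped bedGraph rows per sample: (chrom, start, end, coverage).
    let sample1 = vec![("chr1", 0u32, 100u32, 3.0f32), ("chr1", 100, 200, 7.0)];
    let sample2 = vec![("chr1", 0u32, 100u32, 1.0f32), ("chr1", 100, 200, 3.0)];

    let merged: Vec<(String, Interval)> = sample1.iter().zip(sample2.iter())
        .map(|(a, b)| {
            // The real code asserts that chrom/start/end agree between the two files.
            assert_eq!((a.0, a.1, a.2), (b.0, b.1, b.2));
            let v = log2_ratio(a.3, b.3, 1.0, 1.0, 1.0);
            (a.0.to_string(), Interval { start: a.1, end: a.2, value: v })
        })
        // Same merge rule as the diff: equal value and same chromosome.
        .coalesce(|p, c| if p.1.value == c.1.value && p.0 == c.0 {
            Ok((p.0, Interval { start: p.1.start, end: c.1.end, value: p.1.value }))
        } else {
            Err((p, c))
        })
        .collect();

    // Both bins evaluate to log2(2.0) = 1.0, so they collapse into one chr1:0-200 interval.
    println!("{:?}", merged);
}

Keying the merge on both value and chromosome (the `p.0 == c.0` check added in this commit) keeps the last interval of one chromosome from being fused with the first interval of the next.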
86 changes: 69 additions & 17 deletions src/covcalc.rs
@@ -168,30 +168,69 @@ pub fn bam_pileup<'a>(

if binsize > &1 {
let mut counts: Vec<f32>;
let mut startstr: String = region.1.to_string();
let mut endstr: String = region.2.to_string();

if gene_mode {
counts = vec![0.0; 1];
for record in bam.records() {
let record = record.expect("Error parsing record.");
if !ignorechr.contains(&region.0) {
if record.is_unmapped() {
unmapped_reads += 1;
} else {
mapped_reads += 1;
if *ispe {
if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
fraglens.push(record.insert_size().abs() as u32);
// It could be that we are in metagene mode, i.e. we only want counts over exons
// In this we need another iter - fetch per regstruct
match (regstruct.start.clone(), regstruct.end.clone()) {
(Revalue::U(start), Revalue::U(end)) => {
counts = vec![0.0; 1];
for record in bam.records() {
let record = record.expect("Error parsing record.");
if !ignorechr.contains(&region.0) {
if record.is_unmapped() {
unmapped_reads += 1;
} else {
mapped_reads += 1;
if *ispe {
if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
fraglens.push(record.insert_size().abs() as u32);
}
}
readlens.push(record.seq_len() as u32);
}
}
readlens.push(record.seq_len() as u32);
counts[0] += 1.0;
}
}
counts[0] += 1.0;
},
(Revalue::V(starts), Revalue::V(ends)) => {
// Make a string with the start values comma separated
startstr = starts.iter().map(|x| x.to_string()).collect::<Vec<String>>().join(",");
endstr = ends.iter().map(|x| x.to_string()).collect::<Vec<String>>().join(",");

counts = vec![0.0; 1];
let exons: Vec<(u32, u32)> = starts.iter().zip(ends.iter())
.map(|(&s, &e)| (s, e))
.collect();
for exon in exons {
bam.fetch((regstruct.chrom.as_str(), exon.0, exon.1))
.expect(&format!("Error fetching region: {}:{},{}", regstruct.chrom, exon.0, exon.1));
for record in bam.records() {
let record = record.expect("Error parsing record.");
if !ignorechr.contains(&region.0) {
if record.is_unmapped() {
unmapped_reads += 1;
} else {
mapped_reads += 1;
if *ispe {
if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
fraglens.push(record.insert_size().abs() as u32);
}
}
readlens.push(record.seq_len() as u32);
}
}
counts[0] += 1.0;
}
}
},
_ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}.",regstruct.name),
}
} else {
// populate the bg vector with 0 counts over all bins
counts = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize];
println!("LENGTH OF THE VECO BRO {:?}", counts.len());
// let mut binstart = region.1;
let mut binix: u32 = 0;

@@ -217,7 +256,6 @@ pub fn bam_pileup<'a>(
.flat_map(|x| x[0] as u32..x[1] as u32)
.map(|x| (x / binsize) as usize)
.collect();
println!("INDICES = {:?}", indices);
indices.into_iter()
.for_each(|ix| counts[ix] += 1.0);
}
@@ -227,7 +265,7 @@
// bamCoverage mode -> we can collapse bins with same coverage (collapse = true)
// bamCompare & others -> We cannot collapse the bins, yet. (collapse = false)
if counts.len() == 1 {
writeln!(writer, "{}\t{}\t{}\t{}", region.0, region.1, region.2, counts[0]).unwrap();
writeln!(writer, "{}\t{}\t{}\t{}", region.0, startstr, endstr, counts[0]).unwrap();
} else {
if collapse {
let mut lcov = counts[0];
@@ -1438,3 +1476,17 @@ impl Bin {
}
}
}

pub struct TempZip<I>
where I: Iterator {
pub iterators: Vec<I>
}

impl<I, T> Iterator for TempZip<I>
where I: Iterator<Item=T> {
type Item = Vec<T>;
fn next(&mut self) -> Option<Self::Item> {
let o: Option<Vec<T>> = self.iterators.iter_mut().map(|x| x.next()).collect();
o
}
}
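
TempZip, which previously lived as a private helper in src/multibamsummary.rs, is now defined here so bamcompare.rs can reuse it. Each call to next() pulls one item from every inner iterator and returns them as a Vec; because the Options are collected into an Option<Vec<_>>, iteration stops as soon as the shortest input is exhausted. A minimal usage sketch with made-up data (the adapter is restated so the snippet compiles on its own):

// Minimal usage sketch of the TempZip adapter added above (illustrative data).
struct TempZip<I> where I: Iterator {
    iterators: Vec<I>,
}

impl<I, T> Iterator for TempZip<I> where I: Iterator<Item = T> {
    type Item = Vec<T>;
    fn next(&mut self) -> Option<Self::Item> {
        // Collecting Option<T> items into Option<Vec<T>> yields None as soon
        // as any inner iterator runs dry, so the zip stops at the shortest input.
        self.iterators.iter_mut().map(|x| x.next()).collect()
    }
}

fn main() {
    // Three "files" of per-bin counts; the third one is shorter on purpose.
    let files = vec![vec![1, 2, 3], vec![10, 20, 30], vec![100, 200]];
    let zipped = TempZip { iterators: files.into_iter().map(|v| v.into_iter()).collect() };
    for row in zipped {
        // Prints [1, 10, 100] then [2, 20, 200]; stops before a third row.
        println!("{:?}", row);
    }
}

For the coverage temp files this means a row is only emitted while every per-sample file still has a matching line.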
23 changes: 4 additions & 19 deletions src/multibamsummary.rs
@@ -13,7 +13,7 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::path::Path;
use std::sync::{Arc, Mutex};
use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider};
use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, TempZip, region_divider};
use crate::filehandler::{bam_ispaired, read_bedfile, read_gtffile, chrombounds_from_bam, is_bed_or_gtf};
use crate::calc::{median, calc_ratio, deseq_scalefactors};
use crate::bamcompare::ParsedBamFile;
@@ -184,17 +184,17 @@ pub fn r_mbams(
.flat_map(|c| {
let readers: Vec<_> = c.par_iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
let mut _matvec: Vec<Vec<f32>> = Vec::new();
let mut _regions: Vec<(String, u32, u32)> = Vec::new();
let mut _regions: Vec<(String, String, String)> = Vec::new();
for mut _l in (TempZip { iterators: readers }) {
// unwrap all lines in _l
let lines: Vec<_> = _l
.par_iter_mut()
.map(|x| x.as_mut().unwrap())
.map(|x| x.split('\t').collect())
.map(|x: Vec<&str> | ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap() ) )
.map(|x: Vec<&str> | ( x[0].to_string(), x[1].to_string(), x[2].to_string(), x[3].parse::<f32>().unwrap() ) )
.collect();
let counts = lines.par_iter().map(|x| x.3).collect::<Vec<_>>();
let regions: (String, u32, u32) = (lines[0].0.clone(), lines[0].1, lines[0].2);
let regions: (String, String, String) = (lines[0].0.clone(), lines[0].1.clone(), lines[0].2.clone());
_matvec.push(counts);
_regions.push(regions);
}
@@ -267,20 +267,5 @@ pub fn r_mbams(
if verbose {
println!("Matrix written.");
}

Ok(())
}

struct TempZip<I>
where I: Iterator {
iterators: Vec<I>
}

impl<I, T> Iterator for TempZip<I>
where I: Iterator<Item=T> {
type Item = Vec<T>;
fn next(&mut self) -> Option<Self::Item> {
let o: Option<Vec<T>> = self.iterators.iter_mut().map(|x| x.next()).collect();
o
}
}
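
The r_mbams change above is the counterpart to the metagene handling added in src/covcalc.rs: region start and end are no longer parsed as u32, because for Revalue::V regions the temp files now carry comma-separated exon coordinates, which only survive as strings. A minimal sketch of the relaxed parsing rule, with made-up lines rather than real output:

// Minimal sketch of why start/end are now kept as strings (illustrative only).
fn parse_line(line: &str) -> (String, String, String, f32) {
    let fields: Vec<&str> = line.split('\t').collect();
    (
        fields[0].to_string(),             // chromosome
        fields[1].to_string(),             // start, or "start1,start2,..." for exons
        fields[2].to_string(),             // end, or "end1,end2,..." for exons
        fields[3].parse::<f32>().unwrap(), // per-region count
    )
}

fn main() {
    // A plain bin still round-trips fine ...
    println!("{:?}", parse_line("chr1\t0\t10000\t42.0"));
    // ... and a metagene region keeps its exon structure instead of panicking
    // the old parse::<u32>() on the comma-separated coordinates.
    println!("{:?}", parse_line("chr2\t100,250,400\t180,320,480\t17.0"));
}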
