Skip to content

Commit 989ead1

Browse files
committed
Fix bug in deduplicating aligned ONT BAM
(CHRRY-PICK & follow up to PR 406)
1 parent 0d61935 commit 989ead1

File tree

5 files changed

+102
-76
lines changed

5 files changed

+102
-76
lines changed

docker/lr-utils/remove_duplicate_ont_aln.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def main():
2121
guilty_dict_per_chr.setdefault(chrom, set())
2222
guilty_dict_per_chr[chrom].add(name)
2323

24+
print("chromosomes on which there are duplicate records:")
25+
print(f" {guilty_dict_per_chr}")
26+
2427
# Silence message about the .bai file not being found.
2528
pysam.set_verbosity(0)
2629

@@ -36,12 +39,13 @@ def main():
3639

3740
chrom = read.reference_name
3841
n = read.query_name
39-
if n in guilty_dict_per_chr[chrom]:
42+
if chrom in guilty_dict_per_chr and n in guilty_dict_per_chr[chrom]:
4043

4144
mq = read.mapping_quality
4245
sam_flag = read.flag
4346
pos = read.reference_start
44-
signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-"
47+
cigar = read.cigarstring
48+
signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-{cigar}"
4549

4650
if current_position != pos: # new position, let's write and reset
4751
out.write(read)

wdl/pipelines/ONT/Preprocessing/ONTFlowcellFromMultipleBasecalls.wdl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ version 1.0
22

33
import "../../../tasks/Utility/Utils.wdl" as Utils
44
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
5+
import "../../../tasks/Utility/ONTUtils.wdl"
56
import "../../../tasks/Utility/Finalize.wdl" as FF
67

78
import "../../../tasks/QC/SampleLevelAlignedMetrics.wdl" as COV
@@ -40,7 +41,7 @@ workflow ONTFlowcellFromMultipleBasecalls {
4041
File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]])
4142
File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]])
4243
if (bams_suspected_to_contain_dup_record) {
43-
call Utils.DeduplicateBam as RemoveDuplicates {
44+
call ONTUtils.DeduplicateBam as RemoveDuplicates {
4445
input: aligned_bam = bam, aligned_bai = bai, same_name_as_input = true
4546
}
4647
}

wdl/pipelines/TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ version 1.0
22

33
import "../../../tasks/Utility/Utils.wdl" as Utils
44
import "../../../tasks/Utility/PBUtils.wdl" as PB
5+
import "../../../tasks/Utility/ONTUtils.wdl" as ONT
56
import "../../../tasks/Utility/BAMutils.wdl" as BU
67

78
import "../../../tasks/Utility/Finalize.wdl" as FF
@@ -71,7 +72,7 @@ workflow Work {
7172
###########################################################
7273
# ont specific: sometimes there are duplicate reads
7374
if (is_ont && bams_suspected_to_contain_dup_record) {
74-
call Utils.DeduplicateBam as RemoveONTDuplicates {
75+
call ONT.DeduplicateBam as RemoveONTDuplicates {
7576
input: aligned_bam = bam, aligned_bai = bai
7677
}
7778
}

wdl/tasks/Utility/ONTUtils.wdl

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,3 +229,95 @@ task PartitionManifest {
229229
}
230230
}
231231

232+
task DeduplicateBam {
233+
234+
meta {
235+
description: "Utility to drop (occationally happening) literal duplicate records in input BAM"
236+
}
237+
238+
parameter_meta {
239+
aligned_bam: "input BAM file (must be coordinate sorted)"
240+
aligned_bai: "input BAM index file"
241+
same_name_as_input: "if true, output BAM will have the same name as input BAM, otherwise it will have the input basename with .dedup suffix"
242+
runtime_attr_override: "override default runtime attributes"
243+
}
244+
245+
input {
246+
File aligned_bam
247+
File aligned_bai
248+
249+
Boolean same_name_as_input = true
250+
251+
RuntimeAttr? runtime_attr_override
252+
}
253+
254+
Int disk_size = 3 * ceil(size(aligned_bam, "GB"))
255+
256+
String base = basename(aligned_bam, ".bam")
257+
String prefix = if (same_name_as_input) then base else (base + ".dedup")
258+
259+
command <<<
260+
set -eux
261+
262+
samtools view -H "~{aligned_bam}" | grep "@HD" > hd.line
263+
if ! grep -qF "SO:coordinate" hd.line;
264+
then
265+
echo "BAM must be coordinate sorted!" && echo && cat hd.line && exit 1
266+
fi
267+
echo "==========================================================="
268+
echo "collecting duplicate information"
269+
time \
270+
samtools view -@ 1 "~{aligned_bam}" | \
271+
awk -F "\t" 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | \
272+
sort | uniq -d \
273+
> "~{aligned_bam}".duplicates.txt
274+
275+
cnt=$(wc -l "~{aligned_bam}".duplicates.txt | awk '{print $1}')
276+
if [[ ${cnt} -eq 0 ]];
277+
then
278+
echo "No duplicates found"
279+
if ! ~{same_name_as_input} ;
280+
then
281+
mv "~{aligned_bam}" "~{prefix}.bam"
282+
mv "~{aligned_bai}" "~{prefix}.bam.bai"
283+
fi
284+
exit 0
285+
fi
286+
echo "==========================================================="
287+
echo "de-duplicating"
288+
time python3 /opt/remove_duplicate_ont_aln.py \
289+
"~{aligned_bam}" \
290+
--prefix "~{prefix}" \
291+
--annotations "~{aligned_bam}".duplicates.txt
292+
echo "==========================================================="
293+
echo "DONE"
294+
samtools index "~{prefix}.bam"
295+
>>>
296+
297+
output {
298+
File corrected_bam = "~{prefix}.bam"
299+
File corrected_bai = "~{prefix}.bam.bai"
300+
File duplicate_record_signatures = "~{prefix}.duplicate.signatures.txt"
301+
}
302+
303+
#########################
304+
RuntimeAttr default_attr = object {
305+
cpu_cores: 4,
306+
mem_gb: 16,
307+
disk_gb: disk_size,
308+
boot_disk_gb: 10,
309+
preemptible_tries: 0,
310+
max_retries: 1,
311+
docker: "us.gcr.io/broad-dsp-lrma/lr-bam-dedup:0.1.2"
312+
}
313+
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
314+
runtime {
315+
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
316+
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
317+
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
318+
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
319+
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
320+
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
321+
docker: select_first([runtime_attr.docker, default_attr.docker])
322+
}
323+
}

wdl/tasks/Utility/Utils.wdl

Lines changed: 0 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1961,78 +1961,6 @@ task FilterBamOnTag {
19611961
}
19621962
}
19631963

1964-
task DeduplicateBam {
1965-
1966-
meta {
1967-
description: "Utility to drop (occationally happening) duplicate records in input BAM"
1968-
}
1969-
1970-
parameter_meta {
1971-
aligned_bam: "input BAM file"
1972-
aligned_bai: "input BAM index file"
1973-
same_name_as_input: "if true, output BAM will have the same name as input BAM, otherwise it will have the input basename with .dedup suffix"
1974-
runtime_attr_override: "override default runtime attributes"
1975-
}
1976-
1977-
input {
1978-
File aligned_bam
1979-
File aligned_bai
1980-
1981-
Boolean same_name_as_input = true
1982-
1983-
RuntimeAttr? runtime_attr_override
1984-
}
1985-
1986-
Int disk_size = 3 * ceil(size(aligned_bam, "GB"))
1987-
1988-
String base = basename(aligned_bam, ".bam")
1989-
String prefix = if (same_name_as_input) then base else (base + ".dedup")
1990-
1991-
command <<<
1992-
echo "==========================================================="
1993-
echo "collecting duplicate information"
1994-
time \
1995-
samtools view -@ 1 "~{aligned_bam}" | \
1996-
awk -F "\t" 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5}' | \
1997-
sort | uniq -d \
1998-
> "~{aligned_bam}".duplicates.txt
1999-
echo "==========================================================="
2000-
echo "de-duplicating"
2001-
time python3 /opt/remove_duplicate_ont_aln.py \
2002-
--prefix "~{prefix}" \
2003-
--annotations "~{aligned_bam}".duplicates.txt \
2004-
"~{aligned_bam}"
2005-
echo "==========================================================="
2006-
echo "DONE"
2007-
samtools index "~{prefix}.bam"
2008-
>>>
2009-
2010-
output {
2011-
File corrected_bam = "~{prefix}.bam"
2012-
File corrected_bai = "~{prefix}.bam.bai"
2013-
}
2014-
2015-
#########################
2016-
RuntimeAttr default_attr = object {
2017-
cpu_cores: 4,
2018-
mem_gb: 16,
2019-
disk_gb: disk_size,
2020-
boot_disk_gb: 10,
2021-
preemptible_tries: 0,
2022-
max_retries: 1,
2023-
docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.10"
2024-
}
2025-
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
2026-
runtime {
2027-
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
2028-
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
2029-
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
2030-
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
2031-
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
2032-
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
2033-
docker: select_first([runtime_attr.docker, default_attr.docker])
2034-
}
2035-
}
20361964

20371965
task Cat {
20381966

0 commit comments

Comments
 (0)