Skip to content

Commit 8fce954

Browse files
committed
New utility tasks to
* verify primrose was run on PacBio BAM * get basecall model from ONT BAM
1 parent 989ead1 commit 8fce954

File tree

2 files changed

+86
-0
lines changed

2 files changed

+86
-0
lines changed

wdl/tasks/Utility/ONTUtils.wdl

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,46 @@ task DeduplicateBam {
321321
docker: select_first([runtime_attr.docker, default_attr.docker])
322322
}
323323
}
324+
325+
task GetBasecallModel {
    meta {
        description: "Getting the basecall model string of an ONT BAM"
    }
    parameter_meta {
        bam: {
            description: "BAM to operate on",
            localization_optional: true
        }
        runid_2_model: "The basecall model for each run."
    }
    input {
        File bam
    }
    output {
        # One row per distinct (runid, basecall_model) pair found in the @RG DS tags.
        Map[String, String] runid_2_model = read_map("results.tsv")
    }

    command <<<
        set -eux

        # Token exported for samtools, which is handed the BAM path as-is
        # (the input is marked localization_optional, so it may be a remote path).
        export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)

        # Fails the task (grep exits non-zero) when the header has no @RG line at all.
        samtools view -H ~{bam} | grep "^@RG" > one_rg_per_line.txt

        # Make sure the intermediate file exists even if every read group is skipped.
        touch all_pairs.tsv
        while IFS= read -r line
        do
            # Explode the DS tag ("key=value key=value ...") into one key=value per line.
            echo "$line" | tr '\t' '\n' | grep "^DS:" | sed "s/^DS://" | tr ' ' '\n' > tmp.txt
            runid=$(grep "^runid=" tmp.txt | awk -F '=' '{print $2}')
            model=$(grep "^basecall_model=" tmp.txt | awk -F '=' '{print $2}')

            # A row with an empty column is not a valid two-column row for read_map,
            # so read groups lacking either value are reported and skipped.
            if [ -z "${runid}" ] || [ -z "${model}" ]; then
                echo "Skipping read group without runid/basecall_model in DS: ${line}" >&2
                continue
            fi
            echo -e "${runid}\t${model}" >> all_pairs.tsv
        done < one_rg_per_line.txt

        # Several read groups can share one runid; collapse identical rows so the
        # map file does not carry duplicate keys.
        sort -u all_pairs.tsv > results.tsv
    >>>

    runtime {
        cpu:            1
        memory:         "4 GiB"
        disks:          "local-disk 10 HDD"
        preemptible:    2
        maxRetries:     1
        docker:         "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}

wdl/tasks/Utility/PBUtils.wdl

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,3 +1280,46 @@ task SummarizePBI {
12801280
docker: select_first([runtime_attr.docker, default_attr.docker])
12811281
}
12821282
}
1283+
1284+
# TODO: primrose has been rebranded as jasmine; extend this task to recognize jasmine @PG records as well.
1285+
task VerifyPacBioBamHasAppropriatePrimroseRuns {
    meta {
        description: "Verify that a PacBio's BAM has primrose run on all its read groups"
    }
    parameter_meta {
        bam: "Path to the BAM to check; declared as a String, so the path is handed to samtools as-is."
        readgroups_missing_primrose: "Movie names (PU tags of the @RG lines) that do not appear in any primrose @PG command line; empty when every read group was processed."
    }
    input {
        String bam
    }

    output {
        Array[String] readgroups_missing_primrose = read_lines("movies_without_primrose.txt")
    }

    command <<<
        set -eux

        # Token exported for samtools, which is handed the BAM path as-is.
        export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
        samtools view -H ~{bam} > header.txt

        # get read groups' movies
        grep "^@RG" header.txt | tr '\t' '\n' | grep "^PU:" | awk -F ':' '{print $2}' | sort > readgroup.movies.txt
        cat readgroup.movies.txt

        # get primrose PG lines
        # A header without any primrose @PG record makes the trailing grep exit 1;
        # under `set -e` that would abort the task instead of reporting every movie
        # as missing, hence the `|| true`.
        grep "^@PG" header.txt | grep -F 'ID:primrose' | tr '\t' '\n' | grep '^CL:' > primrose.pg.lines.txt || true
        tr ' ' '\n' < primrose.pg.lines.txt

        # The output file must exist even when no movie is missing primrose.
        touch movies_without_primrose.txt
        while IFS= read -r readgroup; do
            # -F: match the movie name literally rather than as a regular expression.
            if ! grep -qF -- "${readgroup}" primrose.pg.lines.txt; then
                echo "${readgroup}" >> movies_without_primrose.txt
            fi
        done < readgroup.movies.txt
    >>>

    runtime {
        cpu:            1
        memory:         "4 GiB"
        disks:          "local-disk 10 HDD"
        preemptible:    2
        maxRetries:     1
        docker:         "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}

0 commit comments

Comments
 (0)