-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #338 from CGATOxford/AC_strand_script
Ac strand script
- Loading branch information
Showing
6 changed files
with
203 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
"""bam2libtype.py - determine the library type of a bam file | ||
============================================================ | ||
Author: Adam Cribbs | ||
Purpose | ||
------- | ||
This tool determines the library type of a BAM file. The naming | ||
convention used is from the salmon documentation: | ||
http://salmon.readthedocs.io/en/latest/library_type.html. | ||
BAM files need to have a corresponding index file i.e. example.bam | ||
and example.bam.bai | ||
Usage | ||
----- | ||
cat example.bam | cgat bam2libtype > out.tsv | ||
options | ||
------- | ||
There are no options for this script, just pass the script a bam file | ||
as the stdin and an outfile as the stdout. | ||
Type:: | ||
python bam2bed.py --help | ||
for command line help. | ||
Command line options | ||
-------------------- | ||
""" | ||
|
||
import sys | ||
import pysam | ||
import CGAT.Experiment as E | ||
|
||
|
||
def main(argv=None): | ||
"""script main. | ||
parses command line options in sys.argv, unless *argv* is given. | ||
""" | ||
|
||
if not argv: | ||
argv = sys.argv | ||
|
||
# setup command line parser | ||
parser = E.OptionParser( | ||
version="%prog version: $Id$", usage=globals()["__doc__"]) | ||
|
||
(options, args) = E.Start(parser, argv=argv) | ||
|
||
samfile = pysam.AlignmentFile(options.stdin, "rb") | ||
outfile = options.stdout | ||
|
||
# initialise counts for each library type | ||
MSR = 0 | ||
MSF = 0 | ||
ISF = 0 | ||
ISR = 0 | ||
OSF = 0 | ||
OSR = 0 | ||
SR = 0 | ||
SF = 0 | ||
|
||
for read in samfile: | ||
|
||
# to handle paired end reads: | ||
if read.is_paired and read.is_proper_pair: | ||
|
||
# get attributes of read | ||
read_start = read.reference_start | ||
read_end = read.reference_end | ||
read_neg = read.is_reverse | ||
|
||
# specify which read is R1 and which is R2: | ||
# specify which read is R1 and which is R2: | ||
if read.is_read1 is True: | ||
R1_is_reverse = read.is_reverse | ||
R1_reference_start = read.reference_start | ||
|
||
R2_is_reverse = read.mate_is_reverse | ||
R2_reference_start = read.next_reference_start | ||
else: | ||
R1_is_reverse = read.mate_is_reverse | ||
R1_reference_start = read.next_reference_start | ||
|
||
R2_is_reverse = read.is_reverse | ||
R2_reference_start = read.reference_start | ||
|
||
# Decision tree to specify strandness: | ||
# potential to convert this to a machine learning | ||
# decision tree algorithm in the future: | ||
if R1_is_reverse is True: | ||
|
||
if R2_is_reverse is True: | ||
|
||
MSF += 1 | ||
else: | ||
if R2_reference_start - R1_reference_start >= 0: | ||
OSR += 1 | ||
else: | ||
ISR += 1 | ||
|
||
else: | ||
|
||
if R2_is_reverse is True: | ||
|
||
if R1_reference_start - R2_reference_start >= 0: | ||
|
||
OSF += 1 | ||
else: | ||
ISF += 1 | ||
else: | ||
MSR += 1 | ||
else: | ||
if read.is_reverse: | ||
SR += 1 | ||
else: | ||
SF += 1 | ||
|
||
total = MSR + ISR + OSR + ISF + MSF + OSF + SF + SR | ||
|
||
def total_percent(strand, total): | ||
return float(strand)/float(total)*100 | ||
|
||
MSR_total = total_percent(MSR, total) | ||
ISR_total = total_percent(ISR, total) | ||
OSR_total = total_percent(OSR, total) | ||
ISF_total = total_percent(ISF, total) | ||
MSF_total = total_percent(MSF, total) | ||
OSF_total = total_percent(OSF, total) | ||
SF_total = total_percent(SF, total) | ||
SR_total = total_percent(SR, total) | ||
|
||
outfile.write("MSR\tISR\tOSR\tISF\tMSF\tOSF\tSF\tSR\n") | ||
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % | ||
(int(MSR_total), int(ISR_total), int(OSR_total), | ||
int(ISF_total), int(MSF_total), | ||
int(OSF_total), int(SF_total), int(SR_total))) | ||
|
||
E.Stop() | ||
|
||
if __name__ == "__main__": | ||
sys.exit(main(sys.argv)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../data/paired.bam |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# output generated by bam2libtype | ||
# job started at Thu Jun 22 19:03:14 2017 on cgat057.anat.ox.ac.uk -- 5bab66f9-4752-4eae-b579-033086913e56 | ||
# pid: 4943, system: Linux 2.6.32-696.1.1.el6.x86_64 #1 SMP Tue Apr 11 08:56:50 CDT 2017 x86_64 | ||
# loglevel : 1 | ||
# random_seed : None | ||
# short_help : None | ||
# stderr : <open file '<stderr>', mode 'w' at 0x2b75db3511e0> | ||
# stdin : <open file '<stdin>', mode 'r' at 0x2b75db3510c0> | ||
# stdlog : <open file '<stdout>', mode 'w' at 0x2b75db351150> | ||
# stdout : <open file '<stdout>', mode 'w' at 0x2b75db351150> | ||
# timeit_file : None | ||
# timeit_header : None | ||
# timeit_name : all | ||
MSR ISR OSR ISF MSF OSF SF SR | ||
0 49 0 47 0 0 1 1 | ||
# job finished in 0 seconds at Thu Jun 22 19:03:14 2017 -- 0.43 0.60 0.00 0.00 -- 5bab66f9-4752-4eae-b579-033086913e56 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# output generated by bam2libtype | ||
# job started at Thu Jun 22 19:03:53 2017 on cgat057.anat.ox.ac.uk -- f3083ee5-cbde-4e32-a43e-eafdb5ba42ec | ||
# pid: 4965, system: Linux 2.6.32-696.1.1.el6.x86_64 #1 SMP Tue Apr 11 08:56:50 CDT 2017 x86_64 | ||
# loglevel : 1 | ||
# random_seed : None | ||
# short_help : None | ||
# stderr : <open file '<stderr>', mode 'w' at 0x2b86133961e0> | ||
# stdin : <open file '<stdin>', mode 'r' at 0x2b86133960c0> | ||
# stdlog : <open file '<stdout>', mode 'w' at 0x2b8613396150> | ||
# stdout : <open file '<stdout>', mode 'w' at 0x2b8613396150> | ||
# timeit_file : None | ||
# timeit_header : None | ||
# timeit_name : all | ||
MSR ISR OSR ISF MSF OSF SF SR | ||
0 0 0 0 0 0 51 48 | ||
# job finished in 0 seconds at Thu Jun 22 19:03:53 2017 -- 1.27 0.71 0.00 0.01 -- f3083ee5-cbde-4e32-a43e-eafdb5ba42ec |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
|
||
version: | ||
stdin: null | ||
outputs: [stdout] | ||
references: [] | ||
options: --version | ||
|
||
paired_lib: | ||
stdin: paired.bam | ||
outputs: [stdout] | ||
references: [paired_IU.txt] | ||
options: | ||
|
||
single_lib: | ||
stdin: single.bam | ||
outputs: [stdout] | ||
references: [single_U.txt] | ||
options: | ||
|