-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfasta_processor_motif_finder.py
43 lines (27 loc) · 1.07 KB
/
fasta_processor_motif_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# This script parses a FASTA file and finds the N-glycosylation motif(s) (N-x-S/T, where x is not P) in each protein sequence.
# For every match, the output file lists the UniProtKB accession and entry name (e.g. P04278|SHBG_HUMAN), the matched motif, and the 1-based starting position of the motif.
# The script works on FASTA file(s) downloaded from UniProtKB, e.g. input file: https://www.uniprot.org/uniprot/P04278.fasta
import re
fasta_file = '/Users/rahinavelkar/Desktop/scripts/P04278.fasta'
outfile = '/Users/rahinavelkar/Desktop/scripts/N_motifs.txt'

# N-x-S/T sequon where x is any word character except P.  The motif sits
# inside a lookahead so each match consumes no characters; that way
# overlapping motifs (e.g. "NNTS" contains both "NNT" and "NTS") are all
# reported, which a plain finditer over 'N\wS|N\wT' would not do.
N_GLYC_MOTIF = re.compile(r'(?=(N(?!P)\w[ST]))')


def read_fasta(lines):
    """Collect FASTA records from an iterable of text lines.

    Args:
        lines: iterable of strings (an open text file works), one FASTA
            line per item.

    Returns:
        dict mapping record name to its concatenated sequence, in file
        order.  The name is the first whitespace-delimited token of the
        header with its first four characters removed, so the header
        ">sp|P04278|SHBG_HUMAN ..." yields "P04278|SHBG_HUMAN".  If a name
        occurs twice, the later record replaces the earlier one.

    Raises:
        ValueError: if sequence data appears before any '>' header line.
    """
    chunks = {}
    name = None
    for raw in lines:
        line = raw.rstrip()
        if not line:
            # Blank lines (commonly a trailing one) carry no data.
            continue
        if line.startswith('>'):
            name = line.split()[0][4:]
            chunks[name] = []
        elif name is None:
            raise ValueError('FASTA sequence data found before any ">" header line')
        else:
            chunks[name].append(line)
    return {key: ''.join(parts) for key, parts in chunks.items()}


def find_n_glyc_motifs(seq):
    """Return every N-glycosylation motif in *seq*, overlapping ones included.

    Args:
        seq: protein sequence as a string.

    Returns:
        list of (motif, start) tuples in sequence order, where motif is
        the three-character match and start is its 1-based position.
    """
    return [(m.group(1), m.start() + 1) for m in N_GLYC_MOTIF.finditer(seq)]


def main(fasta_path=fasta_file, out_path=outfile):
    """Scan the FASTA file for motifs and append the hits to the output file.

    Each hit is written as one line: "name motif start_position".

    Args:
        fasta_path: path of the FASTA file to read.
        out_path: path of the report file; opened in append mode, and only
            when there is at least one hit to write.
    """
    with open(fasta_path) as fh:
        fasta = read_fasta(fh)

    hits = []
    for name, seq in fasta.items():
        for motif, motif_start_pos in find_n_glyc_motifs(seq):
            hits.append((name, motif, motif_start_pos))

    if hits:
        # Open once for all hits instead of reopening the file per match.
        with open(out_path, 'a') as source_file:
            for hit in hits:
                print(*hit, file=source_file)
    print('Done')


if __name__ == '__main__':
    main()