Skip to content

Commit 494672b

Browse files
authored
new file (find N motifs from UniProtKB sequences)
1 parent 8e9660a commit 494672b

1 file changed

Lines changed: 43 additions & 0 deletions

File tree

fasta_processor_motif_finder.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#The script parses a FASTA file and finds the N-glycosylation motif(s) (N-X-S/T, where X is any residue except proline) in each protein sequence.
2+
#The output file lists, one match per line, the sequence identifier taken from the FASTA header, the matched motif and the 1-based starting position of the matched motif.
3+
#The script works on fasta file(s) downloaded from UniProtKB. e.g. input file: https://www.uniprot.org/uniprot/P04278.fasta
4+
5+
import re
6+
7+
8+
fasta_file = '/Users/rahinavelkar/Desktop/scripts/P04278.fasta'
outfile = '/Users/rahinavelkar/Desktop/scripts/N_motifs.txt'

# N-glycosylation sequon: N, then any word character except P, then S or T.
# The whole motif sits inside a zero-width lookahead so the scan advances one
# residue at a time; a consuming pattern would skip overlapping sequons
# (in "NNST" both "NNS" and "NST" are valid sites).
N_MOTIF = re.compile(r'(?=(N(?!P)\w[ST]))')


def parse_fasta(lines):
    """Collect FASTA records from an iterable of text lines.

    Args:
        lines: Iterable of strings (an open text file works).

    Returns:
        dict mapping each record identifier to its full sequence string.
        The identifier is the first whitespace-delimited word of the header
        line with its first four characters removed, e.g. the header
        '>sp|P04278|SHBG_HUMAN ...' yields 'P04278|SHBG_HUMAN'.

    Raises:
        ValueError: if sequence data appears before any header line.
    """
    chunks = {}
    name = None
    for line in lines:
        line = line.rstrip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no data.
            continue
        if line.startswith('>'):
            name = line.split()[0][4:]
            # A repeated identifier restarts its sequence.
            chunks[name] = []
        elif name is None:
            raise ValueError('sequence data found before the first FASTA header')
        else:
            chunks[name].append(line)
    # Join once per record instead of concatenating on every line.
    return {key: ''.join(parts) for key, parts in chunks.items()}


def find_motifs(seq):
    """Return every N-glycosylation motif in *seq*, overlapping ones included.

    Args:
        seq: Protein sequence string.

    Returns:
        List of (motif, start) tuples in sequence order, where *start* is the
        1-based position of the motif's first residue.
    """
    return [(match.group(1), match.start() + 1)
            for match in N_MOTIF.finditer(seq)]


def main():
    """Scan the sequences in ``fasta_file`` and append the hits to ``outfile``."""
    with open(fasta_file) as fh:
        fasta = parse_fasta(fh)

    records = []
    for name, seq in fasta.items():
        for motif, motif_start_pos in find_motifs(seq):
            records.append((name, motif, motif_start_pos))

    # The output file is only touched when there is something to write, and
    # it is opened in append mode so results from earlier runs are kept.
    if records:
        with open(outfile, 'a') as out:
            for record in records:
                print(*record, file=out)

    print('Done')


if __name__ == '__main__':
    main()
40+
41+
42+
43+

0 commit comments

Comments
 (0)