-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiamond_get_amp_prototype.py
65 lines (56 loc) · 1.8 KB
/
diamond_get_amp_prototype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import time
import os
from nltk import edit_distance
from pyclustering.cluster.kmedoids import kmedoids
import numpy as np
import pandas as pd
import argparse
from Bio.Blast.Applications import NcbiblastpCommandline
from io import StringIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import networkx as nx
import matplotlib.pyplot as plt
import zipfile
with zipfile.ZipFile("diamond_distance_matrix.zip","r") as zip_ref:
zip_ref.extractall(".")
mat = np.load('diamond_distance_matrix.npy')
mat = mat + mat.T
## get the adjaceny matrix ###
adj = np.zeros(mat.shape)
for i in range(mat.shape[0]):
for j in range(mat.shape[1]):
if mat[i,j]==1e+100:
adj[i,j]=0
else:
adj[i,j]=1
if i==j:
adj[i,j]=0
## get the connected components ##
G = nx.from_numpy_matrix(adj)
sort=sorted(nx.connected_components(G), key = len, reverse=True)
## get AMP prototypes ###
prototype_id_all = []
for i in range(len(sort)):
temp_mat = mat[list(sort[i]), :]
temp_mat = temp_mat[:, list(sort[i])]
## get medoid for this connected components
init_medoids = np.random.randint(temp_mat.shape[0], size=1)
km = kmedoids(temp_mat, init_medoids, data_type='distance_matrix')
km.process()
centers = km.get_medoids()
# get the id of center wrt the distance matrix
prototype_id = list(sort[i])[centers[0]]
prototype_id_all.append(prototype_id)
### get the corresponding AMPs from the prototype id ###
dataset = pd.read_csv('D1.csv')
dataset.keys()
amp = dataset.loc[dataset['Label'] == 1]
amp = amp.Sequences.tolist()
text_file = open("amp_prototype_diamond_distance.txt", "w")
for i in prototype_id_all:
amp_prototype = amp[i]
text_file.write(str(amp_prototype) + '\n')
text_file.close()