-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcanopyClustering.py
145 lines (121 loc) · 5.48 KB
/
canopyClustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from scipy import io, sparse
import csv
# Get organism ID from protein id
def get_organism(s):
head = s.split('.')[0].rstrip('0123456789')
return head
# If cosine distance less than this threshold, cluster proteins together
clustering_threshold = 0.6
# Format input and output
data_folder = '/Users/matthewthompson/Documents/UAMS_SURF/K-mer_testing/CSV_files/staph/'
input_description = 'staph_3mer_top_9'
output_description = input_description + '_' + str(clustering_threshold)
# Read in k-mer count matrix (output from kmerSelector.py)
sparse_matrix = io.mmread(data_folder + input_description + '.mtx')
print("Converting to csr_matrix")
df = sparse.csr_matrix(sparse_matrix)
# Proteins are index, kmers are columns
df = df.transpose()
# Read in proteins (output from kmerSelector.py)
proteins = pd.read_csv(data_folder + input_description + '_protein_list.csv')
# Calculate and format distance matrix
print("Pairwise distance calculation")
pairwise_distances = pd.DataFrame(pairwise_distances(df, metric='cosine'))
pairwise_distances.columns = proteins.columns
pairwise_distances.index = proteins.columns
elligible = list(pairwise_distances.columns)
# Make a list of precluster centers
# Make a sub dataframe with 100 proteins
subset_columns = elligible[0:101]
# Reduce the distance matrix to the 100 proteins
subset_distances = pairwise_distances[subset_columns]
# Isolate the 100% duplicate rows across 100 proteins
subset_duplicates = subset_distances[subset_distances.duplicated(keep = False)]
# List of 100% identical proteins
eligible_proteins = list(subset_duplicates.index)
# Reduce duplicate rows to unique points
cluster_centers = subset_duplicates.drop_duplicates()
used_points = []
preclusters = dict()
cluster_centers = list(cluster_centers.index)
i = 1
print("Preclustering exact duplicates")
while len(cluster_centers) != 0:
cluster_center = cluster_centers.pop()
sorted_distances = pairwise_distances.loc[cluster_center].sort_values()
available_points = sorted_distances[~sorted_distances.index.isin(used_points)]
identical_points = list(available_points[available_points == 0].index)
points_under_threshold = list(available_points[available_points < clustering_threshold].index)
used_points = used_points + points_under_threshold
used_points.append(cluster_center)
used_points = list(set(used_points))
cluster_centers = list(set(cluster_centers) - set(used_points))
preclusters['clus_' + str(i)] = points_under_threshold
i+=1
points_not_in_precluster = list(set(elligible) - set(used_points))
# Store these variables for reference
points_without_duplicates = points_not_in_precluster
points_with_duplicates = used_points
clusters = dict()
possible_cluster_centers = points_not_in_precluster
print("Clustering non-duplicate genes")
while len(possible_cluster_centers) != 0:
cluster_center = possible_cluster_centers.pop()
sorted_distances = pairwise_distances.loc[cluster_center].sort_values()
available_points = sorted_distances[~sorted_distances.index.isin(used_points)]
points_under_clustering_threshold = list(available_points[available_points < clustering_threshold].index)
used_points = used_points + points_under_clustering_threshold
#used_points.append(cluster_center)
used_points = list(set(used_points))
possible_cluster_centers = list(set(possible_cluster_centers) - set(used_points))
clusters['clus_' + str(i)] = points_under_clustering_threshold
i+=1
# Remove empty clusters
non_empty_clusters = dict()
for cluster_center in list(clusters.keys()):
cluster = clusters[cluster_center]
if len(cluster) > 0:
non_empty_clusters[cluster_center] = cluster
total_clusters = {**preclusters, **non_empty_clusters}
'''
organism_count = dict()
for cluster_center in total_clusters.keys():
cluster = total_clusters[cluster_center]
org_list = [get_organism(protein) for protein in cluster]
org_list = list(set(org_list))
organism_count[cluster_center] = pd.Series(org_list).value_counts()
organism_count = pd.DataFrame(organism_count)
df_100 = organism_count.dropna(axis = 1, thresh = 14)
df_93 = organism_count.dropna(axis = 1, thresh = 13)
df_85 = organism_count.dropna(axis = 1, thresh = 12)
'''
grouping_list = dict()
i = 1
# Reformat clusters for output
print("Formatting clusters")
for cluster_center in total_clusters.keys():
formatted_clusters = dict()
formatted_clusters['c'] = i
formatted_clusters['points'] = total_clusters[cluster_center]
clusters[i] = formatted_clusters
for protein in formatted_clusters['points']:
grouping_list[protein] = i
i += 1
# Get ordering of genes (output from kmerCounter.R) to assist with data import into FindMyFriends (grouping.R)
ordered_gene_list = list(pd.read_csv(data_folder + 'find_my_friends_gene_ordering_list.csv')['x'])
# Create list of cluster membership for each protein for output
out_list = []
for entry in ordered_gene_list:
out_list.append(str(grouping_list[str(entry).strip()]))
df_out = pd.DataFrame()
df_out['gene'] = ordered_gene_list
df_out['clust'] = out_list
# Write cluster membership for each protein (input for grouping.R)
df_out.to_csv(data_folder + output_description + '_grouping_list.csv', index = None)
# Write clusters out for comparison with UCLUST
with open(data_folder + output_description + '_clusters.txt', 'w') as csv_file:
writer = csv.writer(csv_file)
for key, value in total_clusters.items():
writer.writerow([key, value])