-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcore.py
390 lines (258 loc) · 11 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
import numpy as np
import pandas as pd
import scanpy as sc
import random
import anndata as ad
import time
from treasmo.moran_vec import Moran
import treasmo.lee_vec as lee_vec
from libpysal.weights import W
import scipy as sp
from scipy import stats
#from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multitest import fdrcorrection
__author__ = "Chaozhong Liu"
__email__ = "[email protected]"
def Morans_I(mudata, mods=['rna','atac'], seed=1, max_RAM=16):
"""
Function to calculate Moran's I for all the features in multiome data
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
seed: int
random seed to make the results reproducible
max_RAM: int
maximum limitation of memory usage (Gb)
Returns
--------------
MuData
added with var['Morans.I']
"""
# Check if mods exist in MuData
if sum([i in mudata.mod.keys() for i in mods]) == len(mods):
pass
else:
print("Not all mods provided are in MuData! Running stopped.")
return
# Start
random.seed(seed)
np.random.seed(seed)
start = time.time()
# Construct connection matrix
cong_mtx = mudata.obsp['connectivities'].toarray()
#cong_mtx = cong_mtx/cong_mtx.sum(axis=1)[:,np.newaxis]
neighbors = {}
weights = {}
for i in range(cong_mtx.shape[0]):
neighbors[i] = np.nonzero(cong_mtx[i])[0].tolist()
weights[i] = cong_mtx[i][np.nonzero(cong_mtx[i])[0]].tolist()
w = W(neighbors, weights)
mi = Moran(mudata.n_obs, w, permutations=0)
mudata.var['Morans.I'] = np.NaN
for mod in mods:
mi.calc_i(mudata.mod[mod].X, seed=seed, max_RAM=max_RAM)
mudata.mod[mod].var['Morans.I'] = mi.I
mudata.var['Morans.I'] = np.concatenate([mudata.mod[mod].var['Morans.I'].to_numpy() for mod in mudata.mod.keys()])
#print(mi.I)
print("Finished calculating Moran's I. %.3fs past"%(time.time()-start))
return mudata
def _check_array(mtx):
if sp.sparse.issparse(mtx): #isinstance(mtx, sp.spmatrix):
return mtx.toarray()
elif isinstance(mtx, np.ndarray):
return mtx
else:
raise Exception("Omics Data should be either numpy array or scipy sparse matrix.")
def _prepare_mtx(anndat, features):
index_df = pd.DataFrame({'index':np.arange(anndat.n_vars)})
index_df.index = anndat.var.index
feature_index = index_df.loc[features,'index'].to_numpy()
return _check_array(anndat.X[:,feature_index])
def Global_L(mudata, pairsDf, mods=['rna','atac'], permutations=0, percent=0.1, seed=1, max_RAM=16):
"""
Function to calculate the global L index (mean of correlation strength index) for all the pairs in multiome data
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
pairsDf: pandas.DataFrame
gene-peak pair DataFrame containing pairs to be calculated
self-prepared or called from treasmo.tl.peaks_within_distance / TFBS_match
permutations: int
Number of permutations for significance test.
Default is 0, meaning no significance test
999 is a good choice for most cases, but it might take a long time (hours) to finish depending on the number of pairs
percent: float
percentage of cells to shuffle during permutation.
For most of the time, default 0.1 is already a good choice.
seed: int
random seed to make the results reproducible
max_RAM: int
maximum limitation of memory usage(Gb)
Returns
--------------
DataFrame
pairsDf added with extra columns:
Global L results
QC metrics (feature sparsity)
"""
random.seed(seed)
np.random.seed(seed)
start = time.time()
print("Calculating KNN graph-based global correlation...")
cong_mtx = mudata.obsp['connectivities'].toarray()
np.fill_diagonal(cong_mtx, 1.0)
eSP = lee_vec.Spatial_Pearson(cong_mtx, permutations=permutations)
genes = pairsDf['Gene']
peaks = pairsDf['Peak']
gene_X = _prepare_mtx(mudata.mod[mods[0]], genes)
peak_X = _prepare_mtx(mudata.mod[mods[1]], peaks)
eSP = eSP.fit(gene_X, peak_X, percent=percent, seed=seed, max_RAM=max_RAM)
pairsDf['L'] = eSP.association_
pairsDf['L.p_value'] = eSP.significance_ if permutations else np.NaN
if permutations:
_,pairsDf['L.FDR'] = fdrcorrection(peaks_nearby['L.p_value'],
alpha=0.05, method='indep')
else:
pairsDf['L.FDR'] = np.NaN
pairsDf['gene.pct'] = mudata.mod[mods[0]].var.loc[genes,'Frac.all'].to_numpy()
pairsDf['peak.pct'] = mudata.mod[mods[1]].var.loc[peaks,'Frac.all'].to_numpy()
print("Finished calculating correlation. %.3fs past"%(time.time()-start))
return pairsDf
def _calculate_LL(mudata, genes, peaks,
mods=['rna','atac'],
seed=1, max_RAM=16):
random.seed(seed)
np.random.seed(seed)
cong_mtx = mudata.obsp['connectivities'].toarray()
np.fill_diagonal(cong_mtx, 1.0)
gene_X = _prepare_mtx(mudata.mod[mods[0]], genes)
peak_X = _prepare_mtx(mudata.mod[mods[1]], peaks)
eSP2 = lee_vec.Spatial_Pearson_Local(cong_mtx, permutations=0)
eSP2 = eSP2.fit(gene_X,peak_X,seed=seed, max_RAM=max_RAM)
'''
if permutations:
local_p_df = pd.DataFrame(eSP2.significance_)
local_p_df.columns = pair_names
'''
L_mtx = eSP2.associations_
pair_names = pd.Series(genes) + '~' + pd.Series(peaks)
pair_names = pair_names.to_numpy()
return L_mtx, pair_names
def Local_L(mudata, genes, peaks,
mods=['rna','atac'],
rm_dropout=False,
seed=1, max_RAM=16):
"""
Function to calculate the single-cell gene-peak correlation strength index for all the pairs in multiome data
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
genes, peaks: List, numpy.array
one-to-one lists containing gene and peak names
rm_dropout: bool
make correlation strength index to be 0 if feature value is 0 (dropout)
seed: int
random seed to make the results reproducible
max_RAM: int
maximum limitation of memory usage (Gb)
Returns
--------------
MuData
added with Local L matrix and gene-peak pair names in ``mudata.uns``
"""
random.seed(seed)
np.random.seed(seed)
start = time.time()
print("Inferring KNN graph-based local correlation...")
L_mtx, L_mtx_names = _calculate_LL(mudata, genes, peaks,
mods=mods,
seed=1, max_RAM=16)
if rm_dropout:
print("Setting local correlation to 0 for cells with no expression on either feature of a certain pair...")
GP_names = L_mtx_names
Dropout_mtx = _dropout_filter(mudata, GP_names, mods)
L_mtx = L_mtx * Dropout_mtx
mudata.uns['Local_L'] = L_mtx
mudata.uns['Local_L_names'] = L_mtx_names
print("Following changes made to the AnnData object:")
print("\tKNN graph-based local correlation results saved in uns['Local_L']")
print("\tGene-peak pair names saved in uns['Local_L_names']")
print("Finished Inferring local correlation. %.3fs past"%(time.time()-start))
return mudata
def _dropout_filter(mudata, GP_names, mods):
# Gene dropout
index_df = pd.DataFrame({'index':np.arange(len(mudata.mod[mods[0]].var_names))})
index_df.index = mudata.mod[mods[0]].var_names
GP_G = [gp.split('~')[0] for gp in GP_names]
GP_genes_index = index_df.loc[GP_G,:]['index'].to_numpy()
E_mtx = _check_array(mudata.mod[mods[0]].X)
E_mtx_dropout_value = np.zeros(E_mtx.shape)
Dropout_mtx = (~np.isclose(E_mtx_dropout_value,E_mtx, 1e-3)).astype(int)
Dropout_mtx_G = Dropout_mtx[:,GP_genes_index]
# Peak dropout
index_df = pd.DataFrame({'index':np.arange(len(mudata.mod[mods[1]].var_names))})
index_df.index = mudata.mod[mods[1]].var_names
GP_P = [gp.split('~')[1] for gp in GP_names]
GP_peaks_index = index_df.loc[GP_P,:]['index'].to_numpy()
E_mtx = _check_array(mudata.mod[mods[1]].X)
E_mtx_dropout_value = np.zeros(E_mtx.shape)
Dropout_mtx = (~np.isclose(E_mtx_dropout_value,E_mtx, 1e-3)).astype(int)
Dropout_mtx_P = Dropout_mtx[:,GP_peaks_index]
Dropout_mtx = Dropout_mtx_G * Dropout_mtx_P
return Dropout_mtx
#=========================================================================================
# Pearson correlation function for benchmark purpose
#=========================================================================================
def Pearsonr(mudata, genes, peaks,
mods=['rna','atac'],
p_value=False, seed=1):
"""
Function to calculate the Pearson correlation between genes and peaks
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
genes, peaks: List, numpy.array
one-to-one lists containing gene and peak names
p_value: bool
perform significance testing or not
seed: int
random seed to make the results reproducible
Returns
--------------
DataFrame
containing Pearson correlation r, p-value and FDR for all gene-peak pairs
"""
random.seed(seed)
np.random.seed(seed)
gene_X = _prepare_mtx(mudata.mod[mods[0]], genes)
peak_X = _prepare_mtx(mudata.mod[mods[1]], peaks)
gene_X = pd.DataFrame(gene_X, index=np.arange(gene_X.shape[0]), columns=np.arange(gene_X.shape[1]))
peak_X = pd.DataFrame(peak_X, index=np.arange(peak_X.shape[0]), columns=np.arange(gene_X.shape[1]))
global_P_df = pd.DataFrame(peak_X.corrwith(gene_X, method='pearson', axis=0))
global_P_df.index = pd.Series(genes) + '~' + pd.Series(peaks)
global_P_df.columns = ['r']
if p_value:
print("Calculating p-values...")
p_list = []
for i in range(global_P_df.shape[0]):
p_list.append(stats.pearsonr(gene_X.iloc[:,i].to_numpy(), peak_X.iloc[:,i].to_numpy())[1])
global_P_df['r.p_value'] = p_list
print("Calculating FDR...")
_, global_P_df['r.FDR'] = fdrcorrection(global_P_df['r.p_value'],
alpha=0.05, method='indep')
else:
global_P_df['r.p_value'] = np.NaN
global_P_df['r.FDR'] = np.NaN
return global_P_df