-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path1_nb_aggregation.py
executable file
·155 lines (128 loc) · 4.69 KB
/
1_nb_aggregation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
# Author: Giulia Muzio
'''
Script for generating the neighborhood file on
the non-permuted setting. The neighborhood file
consists in two columns, for example:
snp set
Chrom_1_23235 AT1G01040
Chrom_1_24466 AT1G01040
Chrom_2_11575494 AT1G01040
Chrom_2_11575607 AT1G01040
Chrom_1_33481 AT1G01060
Chrom_3_22272106 AT1G01060
Chrom_5_26882325 AT1G01060
which means that the 1-hop neighborhood of AT1G01040
comprises the SNPs named Chrom_1_23235, Chrom_1_24466,
Chrom_2_11575494, and Chrom_2_11575607. To give another
example, the SNPs Chrom_1_33481, Chrom_3_22272106, and
Chrom_5_26882325 can be mapped onto AT1G01060 and its
1-hop neighbors.
'''
import os
import csv
import argparse
import numpy as np
import pandas as pd
from utils import *
def main(args):
# Loading files
gene2snps = load_file('{}/{}'.format(args.i, args.g2s))
bim = pd.read_csv('{}/{}'.format(args.i, args.bim), sep = '\t',
names = ['chrom', 'snp', '-', 'pos', 'a1', 'a2'])
network = load_file('{}/{}'.format(args.i, args.nw))
# output folders creation
if(not os.path.exists(args.o)):
os.makedirs(args.o)
# 1-hop neighborhoods construction
neighbourhood_file(gene2snps, bim, network, args)
def neighbourhood_file(gene2snps, bim, A, args):
'''
Function for writing the neighborhood file where the
first column represents the neighborhoods, and the
second the SNPs belonging to the neighborhoods, as
in this example:
snp set
Chrom_1_23235 AT1G01040
Chrom_1_24466 AT1G01040
Chrom_2_11575494 AT1G01040
Chrom_2_11575607 AT1G01040
Chrom_1_33481 AT1G01060
Chrom_3_22272106 AT1G01060
Chrom_5_26882325 AT1G01060
which means that the 1-hop neighborhood of AT1G01040
comprises the SNPs named Chrom_1_23235, Chrom_1_24466,
Chrom_2_11575494, and Chrom_2_11575607. To give another
example, the SNPs Chrom_1_33481, Chrom_3_22272106, and
Chrom_5_26882325 can be mapped onto AT1G01060 and its
1-hop neighbors.
Input
-----------
gene2snps: dictionary where the genes (Ng) are the keys and the
values are boolean vectors of size #SNPs: True
if the SNP is mapped onto the gene that is the key
bim: Plink bim file, with #SNPs rows
A: Ng x Ng pandas dataframe that represents the adjacency
matrix of the PPI network. Index and columns' names
are the genes.
args: arguments
Output
-----------
'''
SNPs = bim['snp'].values
genes = np.array(A.index)
# open the output neighborhood file
with open('{}/{}'.format(args.o, args.nbs), "w") as f:
f.write('snp set\n')
for gene in genes:
conn_idx = (A[gene].values).astype(bool)
snps_idx = gene2snps[gene]
# Finding the neighbors on the network
connected = genes[conn_idx]
for conn in connected:
snps_idx = np.logical_or(snps_idx, gene2snps[conn])
snps = SNPs[snps_idx]
# writing the neighborhood
csv.writer(f, delimiter = ' ').writerows(np.c_[snps, np.full(len(snps), gene)])
def parse_arguments():
'''
Definition of the command line arguments
Input
---------
Output
---------
args.i: input folder
args.o: output folder
args.g2s: name of the gene2snps mapping file (pickle); it is a
dictionary where the genes (Ng) are the keys and the
values are boolean vectors of size #SNPs: True
if the SNP is mapped onto the gene that is the key
args.bim: name of the Plink bim file; Plink bim file, with
#SNPs rows.
args.nw: name of the adjacency matrix file (pickle); Ng x Ng
pandas dataframe that represents the adjacency
matrix of the PPI network. Index and columns' names
are the genes.
args.nbs: name of the output text file where to save the 1-hop
neighborhoods in the form of name of SNPs
'''
parser = argparse.ArgumentParser()
parser.add_argument('--i', required = False, default = 'data',
help = 'input folder')
parser.add_argument('--o', required = False, default = 'results/settings',
help = 'output folder')
parser.add_argument('--g2s', required = False, default = 'gene_snps_index.pkl',
help = 'mapping between the genes and the snps')
parser.add_argument('--bim', required = False, default = 'genotype.bim',
help = 'bim file, where to get the snp names')
parser.add_argument('--nw', required = False, default = 'PPI_adj.pkl',
help = 'adjacency matrix of the PPI network')
parser.add_argument('--nbs', required = False, default = 'neighborhoods.txt',
help = 'name of the file containing the 1-hop\
neighborhoods on the non-permuted setting\
to save in the output folder')
args = parser.parse_args()
return args
if __name__ == '__main__':
arguments = parse_arguments()
main(arguments)