-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_control_expression.py
139 lines (114 loc) · 4.06 KB
/
generate_control_expression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
### Imports ###
import numpy as np
import pandas as pd
from collections import Counter
from pathlib import Path
from typing import Callable, Optional, Sequence
### Functions ###
def resample_reads_control(
series: pd.Series,
rng: np.random.Generator,
) -> pd.Series:
"""
Generates a control expression pandas Series by sampling from the
distribution of genes until the same number of reads is reached.
Currently too slow to be used in practice.
Parameters
----------
series : pd.Series
pandas Series with gene expressions
rng : np.random.Generator
numpy random Generator object to be used
Returns
-------
pd.Series
pandas Series with control gene expression
"""
target_n = series.sum()
probs = series.values / target_n
selection = rng.choice(series.index, target_n, p=probs)
counts = Counter(selection)
res_series = pd.Series(counts.values(), index=counts.keys()
).reindex_like(series).fillna(0).astype(int)
return res_series
def adjust_reads_control(
series: pd.Series,
rng: np.random.Generator,
proportion: float = 0.1,
) -> pd.Series:
"""
Generates a control expression pandas Series by perturbing the
number of reads through multiplying by a number drawn from a normal
distribution.
Parameters
----------
series : pd.Series
pandas Series with gene expressions
rng : np.random.Generator
numpy random Generator object to be used
proportion : float, optional
Standard deviation for the normal distribution that will be used
to perturb the gene expression data, by default 0.1
Returns
-------
pd.Series
pandas Series with control gene expression
"""
offsets = series.apply(lambda x: round(rng.normal(0, x * proportion)))
adj_series = series + offsets
adj_series = adj_series.apply(lambda x: x if x > 0 else 1)
return adj_series
def generate_control_expression(
expression: Path,
output: Sequence[Path],
n_repeats: int,
random_seed: Optional[int] = None,
pert_function: Callable = adjust_reads_control,
) -> None:
"""
Generates a control gene expression for multiple samples in multiple
replicates by perturbing the given distribution of genes.
Parameters
----------
expression : Path
Path to the file containing the expression data
output : Sequence[Path]
Paths to the files where the control expression data will be
saved
n_repeats : int
Number of repeats to generate
random_seed : Optional[int], optional
Seed for the random number generator or None to use the system
entropy, by default None
pert_function : Callable, optional
Function to be used to generate control gene expression,
by default adjust_reads_control
"""
rng = np.random.default_rng(random_seed)
df = pd.read_csv(expression, sep='\t', index_col=0, header=None)
for attempt in range(n_repeats):
temp = {}
for pos,col in enumerate(df.columns):
temp[pos] = pert_function(df[col], rng)
df_temp = pd.DataFrame({i: temp[i] for i in range(df.shape[1])})
df_temp.to_csv(output[attempt], sep='\t', header=False)
### Main body ###
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument('-e', '--expression', dest='expression',
help='file to load the gene expression from', metavar='FILE')
parser.add_argument('-o', '--output', dest='output', nargs='+',
help='files to save the control gene expression into', metavar='FILE')
parser.add_argument('-nr', '--n-repeats', dest='n_repeats', type=int,
help='number of repeats', metavar='INT')
parser.add_argument('-rs', '--random-seed', dest='random_seed', type=int,
help='random seed', metavar='INT', default=None)
args = parser.parse_args()
generate_control_expression(
args.expression,
args.output,
args.n_repeats,
args.random_seed,
)