    "genome": str,
    "target": str,
    "found_in": str,
+    "source_samples": str,
    "taxonomy": str,
}

SUMMARY_COLUMNS = {
    "match_percent": float,
}

-def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins):
+def evaluate(target_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins):
    print(f"Polars using {str(pl.threadpool_size())} threads")

    if len(recovered_otu_table) == 0:
        empty_output = pl.DataFrame(schema=OUTPUT_COLUMNS)
        return empty_output, empty_output, pl.DataFrame(schema=SUMMARY_COLUMNS)

-    # Load otu table of unbinned sequences and get unique id for each sequence (to match sequences to target id)
-    unbinned_otu_table = unbinned_otu_table.select([
+    # Load otu table of target sequences and get unique id for each sequence (to match sequences to target id)
+    relevant_target_otu_table = target_otu_table.select([
        "gene", "sequence",
        pl.first("target").over(["gene", "sequence"]).cast(str),
        pl.first("taxonomy").over(["gene", "sequence"]),
@@ -46,7 +47,7 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
        "coassembly"
    ).explode("samples")

-    coassembly_edges = elusive_edges.with_columns(
+    elusive_edges = elusive_edges.with_columns(
        pl.col("sample1").str.replace(r"\.1$", ""),
        pl.col("sample2").str.replace(r"\.1$", ""),
    ).join(
@@ -58,29 +59,52 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
    ).with_columns(
        pl.col("target_ids").str.split(",").alias("target")
    ).explode("target"
-    ).select(["target", "coassembly"]
+    )
+
+    coassembly_edges = elusive_edges.select(["target", "coassembly"]
    ).unique()

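The split-and-explode above flattens the comma-joined target_ids field into one row per (target, coassembly) pair before deduplication. A minimal sketch with hypothetical data, using the same pre-1.0 Polars API as the script:

import polars as pl

edges = pl.DataFrame({
    "coassembly": ["co_0", "co_0"],
    "target_ids": ["1,2,3", "2,4"],
})
flat = edges.with_columns(
    pl.col("target_ids").str.split(",").alias("target")  # string -> list column
).explode("target")  # one row per list element
print(flat.select(["target", "coassembly"]).unique())
# 4 unique pairs remain; ("2", "co_0") appeared twice and is deduplicated
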
-    # Create otu table with original sequence, cluster id, target id and associated coassemblies
+    # Create otu table with original sequence, samples present, cluster id, target id and associated coassemblies
+    sample_edges = elusive_edges.melt(
+        id_vars=["coassembly", "target"],
+        value_vars=["sample1", "sample2"],
+        value_name="sample"
+    ).groupby([
+        "coassembly", "target"
+    ]).agg([
+        pl.col("sample").unique().sort().str.concat(",").alias("source_samples")
+    ])
+
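The melt/groupby step above is what builds the new source_samples column: every sample that touches a (coassembly, target) pair is collected, sorted, and comma-joined. A minimal sketch with hypothetical data:

import polars as pl

edges = pl.DataFrame({
    "coassembly": ["co_0", "co_0"],
    "target": ["1", "1"],
    "sample1": ["s1", "s2"],
    "sample2": ["s2", "s3"],
})
print(
    edges.melt(
        id_vars=["coassembly", "target"],  # kept as-is on every output row
        value_vars=["sample1", "sample2"],  # stacked into one "sample" column
        value_name="sample",
    ).groupby(["coassembly", "target"]).agg([
        pl.col("sample").unique().sort().str.concat(",").alias("source_samples")
    ])
)
# One row: ("co_0", "1", "s1,s2,s3")
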
    elusive_otu_table = coassembly_edges.join(
-        unbinned_otu_table, on="target", how="left"
+        relevant_target_otu_table, on="target", how="left"
    ).select(
-        "gene", "sequence", "taxonomy",
+        "gene", "sequence", "coassembly", "taxonomy",
        pl.lit(None).cast(str).alias("found_in"),
-        "coassembly", "target",
+        "target",
+    ).join(
+        sample_edges, on=["coassembly", "target"], how="left"
    )

    # Add binned otu table to above with target NA
-    nontarget_otu_table = binned_otu_table.select([
+    nontarget_otu_table = pl.concat([
+        binned_otu_table,
+        target_otu_table
+            .join(elusive_otu_table, on=["gene", "sequence"], how="anti")
+            .drop("target")
+            .with_columns(pl.lit(None).cast(str).alias("found_in"))
+    ]).select([
        pl.col("sample").str.replace(r"\.1$", ""),
        "gene", "sequence", "taxonomy", "found_in"
    ]).join(
        sample_coassemblies, left_on="sample", right_on="samples", how="left"
-    ).drop("sample"
    ).drop_nulls("coassembly"
-    ).unique(
-    ).with_columns(
-        pl.lit(None).cast(str).alias("target")
+    ).groupby(["gene", "sequence", "coassembly"]
+    ).agg([
+        pl.first("taxonomy"),
+        pl.first("found_in"),
+        pl.lit(None).cast(str).alias("target"),
+        pl.col("sample").unique().sort().str.concat(",").alias("source_samples")
+    ]).unique(
    )

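The anti join in this hunk keeps only target rows whose (gene, sequence) key is absent from elusive_otu_table, so sequences already assigned to a coassembly target are not double-counted as nontarget. A minimal sketch with hypothetical frames:

import polars as pl

target = pl.DataFrame({
    "gene": ["g1", "g1"],
    "sequence": ["A", "B"],
    "target": ["1", "2"],
})
elusive = pl.DataFrame({"gene": ["g1"], "sequence": ["A"]})
# anti join: keep left rows with NO match in the right frame
print(target.join(elusive, on=["gene", "sequence"], how="anti"))
# Only ("g1", "B", "2") survives; ("g1", "A") matched and is dropped
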
    haystack_otu_table = pl.concat([elusive_otu_table, nontarget_otu_table])
@@ -94,7 +118,7 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
    combined_otu_table = recovered_otu_table.join(
        haystack_otu_table, on=["coassembly", "gene", "sequence"], how="outer", suffix="old"
    ).select(
-        "coassembly", "gene", "sequence", "genome", "target", "found_in",
+        "coassembly", "gene", "sequence", "genome", "target", "found_in", "source_samples",
        pl.when(pl.col("taxonomy").is_null())
        .then(pl.col("taxonomyold"))
        .otherwise(pl.col("taxonomy"))
@@ -106,11 +130,11 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
    )

    matches = combined_otu_table.filter(
-        ~pl.all(pl.col(["target", "found_in"]).is_null())
+        ~pl.all(pl.col(["target", "found_in", "source_samples"]).is_null())
    )

    unmatched = combined_otu_table.filter(
-        (pl.col("target").is_null()) & (pl.col("found_in").is_null())
+        pl.all(pl.col(["target", "found_in", "source_samples"]).is_null())
    )

    # Summarise recovery stats
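In the filters above, pl.all over a multi-column selection reduces row-wise, so matches keeps rows where at least one of target, found_in, or source_samples is set, and unmatched keeps rows where all three are null. Newer Polars spells this pl.all_horizontal. A minimal sketch with hypothetical rows:

import polars as pl

df = pl.DataFrame({
    "target": ["1", None, None],
    "found_in": [None, "bin_1", None],
    "source_samples": [None, None, None],
})
# Row-wise AND over the three null checks (pl.all_horizontal in newer Polars)
all_null = pl.all(pl.col(["target", "found_in", "source_samples"]).is_null())
print(df.filter(~all_null))  # first two rows: at least one marker present
print(df.filter(all_null))   # last row only: no marker at all
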
@@ -145,14 +169,26 @@ def summarise_stats(matches, combined_otu_table, recovered_bins):
        ).groupby([
            "coassembly", "status"
        ]).agg(
-            pl.col("sequence").len().alias("nontarget_sequences")
+            pl.col("sequence").len().alias("nontarget_bin_sequences")
+        ),
+        on=["coassembly", "status"], how="outer"
+    ).join(
+        # Duplicate sequences are counted multiple times to give a proportion at bin level
+        recovered_hits.with_columns(
+            pl.when(
+                pl.all(pl.col(["target", "found_in"]).is_null()) & (pl.col("source_samples").is_not_null())
+            ).then("match").otherwise("nonmatch").alias("status")
+        ).groupby([
+            "coassembly", "status"
+        ]).agg(
+            pl.col("sequence").len().alias("nontarget_unbin_sequences")
        ),
        on=["coassembly", "status"], how="outer"
    ).join(
        # Duplicate sequences are counted multiple times to give a proportion at bin level
        recovered_hits.with_columns(
            pl.when(
-                (pl.col("found_in").is_null()) & (pl.col("target").is_null())
+                pl.all(pl.col(["target", "found_in", "source_samples"]).is_null())
            ).then("match").otherwise("nonmatch").alias("status")
        ).groupby([
            "coassembly", "status"
@@ -206,7 +242,7 @@ def summarise_stats(matches, combined_otu_table, recovered_bins):
os.environ["POLARS_MAX_THREADS"] = str(snakemake.threads)
import polars as pl

-unbinned_path = snakemake.params.unbinned_otu_table
+target_path = snakemake.params.target_otu_table
binned_path = snakemake.params.binned_otu_table
elusive_clusters_path = snakemake.params.elusive_clusters
elusive_edges_path = snakemake.params.elusive_edges
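A note on the ordering above: Polars sizes its thread pool once, when the library is first imported, so POLARS_MAX_THREADS must already be in the environment when the import runs. A minimal sketch (the thread count here is hypothetical; the script takes it from snakemake.threads):

import os
os.environ["POLARS_MAX_THREADS"] = "4"  # must precede the import to take effect
import polars as pl
print(pl.threadpool_size())  # reports the pinned pool size
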
@@ -216,13 +252,13 @@ def summarise_stats(matches, combined_otu_table, recovered_bins):
novel_hits_path = snakemake.output.novel_hits
summary_stats_path = snakemake.output.summary_stats

-unbinned_otu_table = pl.read_csv(unbinned_path, separator="\t")
+target_otu_table = pl.read_csv(target_path, separator="\t")
binned_otu_table = pl.read_csv(binned_path, separator="\t")
elusive_clusters = pl.read_csv(elusive_clusters_path, separator="\t")
elusive_edges = pl.read_csv(elusive_edges_path, separator="\t")
recovered_otu_table = pl.read_csv(recovered_otu_table_path, separator="\t")

-matches, unmatched, summary = evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins)
+matches, unmatched, summary = evaluate(target_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins)
# Export hits matching elusive targets
matches.write_csv(matched_hits_path, separator="\t")
# Export non-elusive sequence hits