Skip to content

Commit 3ecee3b

Browse files
authored
Merge pull request #16 from AroneyS/add-nontarget-unbinned-to-evaluate
Account for nontarget unbinned sequences in evaluate script
2 parents caa3bcb + a40d7aa commit 3ecee3b

File tree

5 files changed

+216
-100
lines changed

5 files changed

+216
-100
lines changed

ibis/workflow/evaluate.smk

+1-1
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ rule evaluate:
146146
novel_hits = output_dir + "/evaluate/novel_hits.tsv",
147147
summary_stats = output_dir + "/evaluate/summary_stats.tsv",
148148
params:
149-
unbinned_otu_table=config["targets"],
149+
target_otu_table=config["targets"],
150150
binned_otu_table=config["binned"],
151151
elusive_edges=config["elusive_edges"],
152152
elusive_clusters=config["elusive_clusters"],

ibis/workflow/scripts/evaluate.R

+8-7
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,12 @@ target_summary <- summary_stats %>%
110110
pivot_wider(id_cols = coassembly, names_from = statistic, values_from = match)
111111

112112
target_totals <- summary_stats %>%
113-
filter(!statistic %in% c("novel_sequences", "taxonomy")) %>%
113+
filter(!statistic %in% c("nontarget_unbin_sequences", "novel_sequences", "taxonomy")) %>%
114114
pivot_wider(id_cols = coassembly, names_from = statistic, values_from = total) %>%
115-
rename(total_targets = sequences, total_bins = bins, total_recovered = nontarget_sequences)
115+
rename(total_targets = sequences, total_bins = bins, total_recovered = nontarget_bin_sequences)
116116

117117
target_percentage <- summary_stats %>%
118-
filter(!statistic %in% c("bins", "nontarget_sequences", "novel_sequences", "taxonomy")) %>%
118+
filter(!statistic %in% c("bins", "nontarget_bin_sequences", "nontarget_unbin_sequences", "novel_sequences", "taxonomy")) %>%
119119
pivot_wider(id_cols = coassembly, names_from = statistic, values_from = match_percent) %>%
120120
rename(perc_targets = sequences)
121121

@@ -144,25 +144,26 @@ summary_table <- coassemble_summary %>%
144144
total_size = total_size / 10**9,
145145
unmapped_size = unmapped_size / 10**9,
146146
) %>%
147-
select(coassembly, length, total_size, unmapped_size, bins, sequences, nontarget_sequences, novel_sequences, novel_clusters) %>%
147+
select(coassembly, length, total_size, unmapped_size, bins, sequences, nontarget_bin_sequences, nontarget_unbin_sequences, novel_sequences, novel_clusters) %>%
148148
gt() %>%
149149
tab_spanner(
150150
label = "Gbp",
151151
columns = c(total_size, unmapped_size)
152152
) %>%
153153
tab_spanner(
154154
label = "Recovered sequences",
155-
columns = c(sequences, nontarget_sequences, novel_sequences)
155+
columns = c(sequences, nontarget_bin_sequences, nontarget_unbin_sequences, novel_sequences)
156156
) %>%
157-
fmt_integer(c(length, total_size, unmapped_size, nontarget_sequences, novel_sequences, novel_clusters)) %>%
157+
fmt_integer(c(length, total_size, unmapped_size, nontarget_bin_sequences, nontarget_unbin_sequences, novel_sequences, novel_clusters)) %>%
158158
cols_label(
159159
coassembly = "coassembly",
160160
length = "samples",
161161
total_size = "size",
162162
unmapped_size = "unmap",
163163
bins = "target bins",
164164
sequences = "targets",
165-
nontarget_sequences = "nontargets",
165+
nontarget_unbin_sequences = "non-targets",
166+
nontarget_bin_sequences = "prior binned",
166167
novel_sequences = "novel",
167168
novel_clusters = "novel clusters"
168169
) %>%

ibis/workflow/scripts/evaluate.py

+58-22
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"genome": str,
1414
"target": str,
1515
"found_in": str,
16+
"source_samples": str,
1617
"taxonomy": str,
1718
}
1819
SUMMARY_COLUMNS = {
@@ -25,16 +26,16 @@
2526
"match_percent": float,
2627
}
2728

28-
def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins):
29+
def evaluate(target_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins):
2930

3031
print(f"Polars using {str(pl.threadpool_size())} threads")
3132

3233
if len(recovered_otu_table) == 0:
3334
empty_output = pl.DataFrame(schema=OUTPUT_COLUMNS)
3435
return empty_output, empty_output, pl.DataFrame(schema=SUMMARY_COLUMNS)
3536

36-
# Load otu table of unbinned sequences and get unique id for each sequence (to match sequences to target id)
37-
unbinned_otu_table = unbinned_otu_table.select([
37+
# Load otu table of target sequences and get unique id for each sequence (to match sequences to target id)
38+
relevant_target_otu_table = target_otu_table.select([
3839
"gene", "sequence",
3940
pl.first("target").over(["gene", "sequence"]).cast(str),
4041
pl.first("taxonomy").over(["gene", "sequence"]),
@@ -46,7 +47,7 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
4647
"coassembly"
4748
).explode("samples")
4849

49-
coassembly_edges = elusive_edges.with_columns(
50+
elusive_edges = elusive_edges.with_columns(
5051
pl.col("sample1").str.replace(r"\.1$", ""),
5152
pl.col("sample2").str.replace(r"\.1$", ""),
5253
).join(
@@ -58,29 +59,52 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
5859
).with_columns(
5960
pl.col("target_ids").str.split(",").alias("target")
6061
).explode("target"
61-
).select(["target", "coassembly"]
62+
)
63+
64+
coassembly_edges = elusive_edges.select(["target", "coassembly"]
6265
).unique()
6366

64-
# Create otu table with original sequence, cluster id, target id and associated coassemblies
67+
# Create otu table with original sequence, samples present, cluster id, target id and associated coassemblies
68+
sample_edges = elusive_edges.melt(
69+
id_vars=["coassembly", "target"],
70+
value_vars=["sample1", "sample2"],
71+
value_name="sample"
72+
).groupby([
73+
"coassembly", "target"
74+
]).agg([
75+
pl.col("sample").unique().sort().str.concat(",").alias("source_samples")
76+
])
77+
6578
elusive_otu_table = coassembly_edges.join(
66-
unbinned_otu_table, on="target", how="left"
79+
relevant_target_otu_table, on="target", how="left"
6780
).select(
68-
"gene", "sequence", "taxonomy",
81+
"gene", "sequence", "coassembly", "taxonomy",
6982
pl.lit(None).cast(str).alias("found_in"),
70-
"coassembly", "target",
83+
"target",
84+
).join(
85+
sample_edges, on=["coassembly", "target"], how="left"
7186
)
7287

7388
# Add binned otu table to above with target NA
74-
nontarget_otu_table = binned_otu_table.select([
89+
nontarget_otu_table = pl.concat([
90+
binned_otu_table,
91+
target_otu_table
92+
.join(elusive_otu_table, on=["gene", "sequence"], how="anti")
93+
.drop("target")
94+
.with_columns(pl.lit(None).cast(str).alias("found_in"))
95+
]).select([
7596
pl.col("sample").str.replace(r"\.1$", ""),
7697
"gene", "sequence", "taxonomy", "found_in"
7798
]).join(
7899
sample_coassemblies, left_on="sample", right_on="samples", how="left"
79-
).drop("sample"
80100
).drop_nulls("coassembly"
81-
).unique(
82-
).with_columns(
83-
pl.lit(None).cast(str).alias("target")
101+
).groupby(["gene", "sequence", "coassembly"]
102+
).agg([
103+
pl.first("taxonomy"),
104+
pl.first("found_in"),
105+
pl.lit(None).cast(str).alias("target"),
106+
pl.col("sample").unique().sort().str.concat(",").alias("source_samples")
107+
]).unique(
84108
)
85109

86110
haystack_otu_table = pl.concat([elusive_otu_table, nontarget_otu_table])
@@ -94,7 +118,7 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
94118
combined_otu_table = recovered_otu_table.join(
95119
haystack_otu_table, on=["coassembly", "gene", "sequence"], how="outer", suffix="old"
96120
).select(
97-
"coassembly", "gene", "sequence", "genome", "target", "found_in",
121+
"coassembly", "gene", "sequence", "genome", "target", "found_in", "source_samples",
98122
pl.when(pl.col("taxonomy").is_null())
99123
.then(pl.col("taxonomyold"))
100124
.otherwise(pl.col("taxonomy"))
@@ -106,11 +130,11 @@ def evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edg
106130
)
107131

108132
matches = combined_otu_table.filter(
109-
~pl.all(pl.col(["target", "found_in"]).is_null())
133+
~pl.all(pl.col(["target", "found_in", "source_samples"]).is_null())
110134
)
111135

112136
unmatched = combined_otu_table.filter(
113-
(pl.col("target").is_null()) & (pl.col("found_in").is_null())
137+
pl.all(pl.col(["target", "found_in", "source_samples"]).is_null())
114138
)
115139

116140
# Summarise recovery stats
@@ -145,14 +169,26 @@ def summarise_stats(matches, combined_otu_table, recovered_bins):
145169
).groupby([
146170
"coassembly", "status"
147171
]).agg(
148-
pl.col("sequence").len().alias("nontarget_sequences")
172+
pl.col("sequence").len().alias("nontarget_bin_sequences")
173+
),
174+
on=["coassembly", "status"], how="outer"
175+
).join(
176+
# Duplicate sequences are counted multiple times to give a proportion at bin level
177+
recovered_hits.with_columns(
178+
pl.when(
179+
pl.all(pl.col(["target", "found_in"]).is_null()) & (pl.col("source_samples").is_not_null())
180+
).then("match").otherwise("nonmatch").alias("status")
181+
).groupby([
182+
"coassembly", "status"
183+
]).agg(
184+
pl.col("sequence").len().alias("nontarget_unbin_sequences")
149185
),
150186
on=["coassembly", "status"], how="outer"
151187
).join(
152188
# Duplicate sequences are counted multiple times to give a proportion at bin level
153189
recovered_hits.with_columns(
154190
pl.when(
155-
(pl.col("found_in").is_null()) & (pl.col("target").is_null())
191+
pl.all(pl.col(["target", "found_in", "source_samples"]).is_null())
156192
).then("match").otherwise("nonmatch").alias("status")
157193
).groupby([
158194
"coassembly", "status"
@@ -206,7 +242,7 @@ def summarise_stats(matches, combined_otu_table, recovered_bins):
206242
os.environ["POLARS_MAX_THREADS"] = str(snakemake.threads)
207243
import polars as pl
208244

209-
unbinned_path = snakemake.params.unbinned_otu_table
245+
target_path = snakemake.params.target_otu_table
210246
binned_path = snakemake.params.binned_otu_table
211247
elusive_clusters_path = snakemake.params.elusive_clusters
212248
elusive_edges_path = snakemake.params.elusive_edges
@@ -216,13 +252,13 @@ def summarise_stats(matches, combined_otu_table, recovered_bins):
216252
novel_hits_path = snakemake.output.novel_hits
217253
summary_stats_path = snakemake.output.summary_stats
218254

219-
unbinned_otu_table = pl.read_csv(unbinned_path, separator="\t")
255+
target_otu_table = pl.read_csv(target_path, separator="\t")
220256
binned_otu_table = pl.read_csv(binned_path, separator="\t")
221257
elusive_clusters = pl.read_csv(elusive_clusters_path, separator="\t")
222258
elusive_edges = pl.read_csv(elusive_edges_path, separator="\t")
223259
recovered_otu_table = pl.read_csv(recovered_otu_table_path, separator="\t")
224260

225-
matches, unmatched, summary = evaluate(unbinned_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins)
261+
matches, unmatched, summary = evaluate(target_otu_table, binned_otu_table, elusive_clusters, elusive_edges, recovered_otu_table, recovered_bins)
226262
# Export hits matching elusive targets
227263
matches.write_csv(matched_hits_path, separator="\t")
228264
# Export non-elusive sequence hits

test/test_evaluate.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,22 @@ def test_evaluate(self):
7878
]),
7979
"\t".join([
8080
"coassembly_0",
81-
"nontarget_sequences",
81+
"nontarget_bin_sequences",
8282
"recovery",
8383
"1",
8484
"2",
8585
"3",
8686
"33.33",
8787
]),
88+
"\t".join([
89+
"coassembly_0",
90+
"nontarget_unbin_sequences",
91+
"recovery",
92+
"0",
93+
"3",
94+
"3",
95+
"0.0",
96+
]),
8897
"\t".join([
8998
"coassembly_0",
9099
"novel_sequences",

0 commit comments

Comments
 (0)