Skip to content

Commit

Permalink
summarise: taxonomic_level_coverage_table: Print per-sample for scala…
Browse files Browse the repository at this point in the history
…bility.
  • Loading branch information
wwood committed Oct 10, 2024
1 parent af43d09 commit f9045bb
Showing 1 changed file with 41 additions and 37 deletions.
78 changes: 41 additions & 37 deletions singlem/summariser.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,43 +551,47 @@ def write_taxonomic_level_coverage_table(**kwargs):

logging.info("Writing taxonomic level coverage table")
# Read the taxonomic profile
all_profiles = []
for profile_file in input_taxonomic_profiles:
with open(profile_file) as f:
for profile in CondensedCommunityProfile.each_sample_wise(f):
name_to_coverage = {}
for node in profile.breadth_first_iter():
node_level = node.calculate_level()
if node_level == 0:
continue
if node_level not in name_to_coverage:
name_to_coverage[node_level] = 0.
name_to_coverage[node_level] += node.coverage
all_profiles.append(pl.DataFrame({
'level': list(name_to_coverage.keys()),
'coverage': list(name_to_coverage.values())
}).with_columns(pl.lit(profile.sample).alias('sample')).with_columns(
((pl.col('coverage') / pl.col('coverage').sum()).alias('relative_abundance') * 100).round(2),
))
# Concatenate all profiles
all_profiles = pl.concat(all_profiles)

if len(all_profiles.select(pl.col('level')).group_by('level').count()) in [7, 8]:
# If there's 7 or 8 (including 0) levels, then assume that this is a regular taxonomy going on.
levels = ['root','domain','phylum','class','order','family','genus','species']
level_id_to_level_name = {i: levels[i] for i in range(len(levels))}
all_profiles = all_profiles.with_columns(
level = pl.col('level').replace_strict(level_id_to_level_name, return_dtype=pl.Utf8)
)

all_profiles = all_profiles.select([
'sample',
'level',
pl.col('coverage').round(2),
pl.col('relative_abundance').alias('relative abundance (%)'),
])

all_profiles.write_csv(output_taxonomic_level_coverage_table, separator='\t')
first = True
with open(output_taxonomic_level_coverage_table, 'w') as output:
for profile_file in input_taxonomic_profiles:
with open(profile_file) as f:
for profile in CondensedCommunityProfile.each_sample_wise(f):
name_to_coverage = {}
for node in profile.breadth_first_iter():
node_level = node.calculate_level()
if node_level == 0:
continue
if node_level not in name_to_coverage:
name_to_coverage[node_level] = 0.
name_to_coverage[node_level] += node.coverage
result = pl.DataFrame({
'level': list(name_to_coverage.keys()),
'coverage': list(name_to_coverage.values())
}).with_columns(pl.lit(profile.sample).alias('sample')).with_columns(
((pl.col('coverage') / pl.col('coverage').sum()).alias('relative_abundance') * 100).round(2),
)

if len(result.select(pl.col('level')).group_by('level').count()) in [7, 8]:
# If there's 7 or 8 (including 0) levels, then assume that this is a regular taxonomy going on.
levels = ['root','domain','phylum','class','order','family','genus','species']
level_id_to_level_name = {i: levels[i] for i in range(len(levels))}
result = result.with_columns(
level = pl.col('level').replace_strict(level_id_to_level_name, return_dtype=pl.Utf8)
)

result = result.select([
'sample',
'level',
pl.col('coverage').round(2),
pl.col('relative_abundance').alias('relative abundance (%)'),
])

if first:
first = False
# Write the header
print("\t".join(result.columns), file=output)
for row in result.rows():
print("\t".join([str(cell) for cell in row]), file=output)

def write_taxonomic_profile(input_taxonomic_profiles, output_taxonomic_profile_io):
'''Write a taxonomic profile to a file'''
Expand Down

0 comments on commit f9045bb

Please sign in to comment.