Skip to content

Commit 8f7c357

Browse files
authored
Update handling of NCBI Taxonomy ranks in ComparaTree (#1104)
* Update NCBI Taxonomy rank order in ComparaTree
* Treat taxa of rank clade as having no rank
* Take next most basal rank for unranked taxon
* Prevent infinite loop when fetching closest hierarchical rank
* Add comment on NCBI Taxonomy depth cutoff
* Add clarifying note about rank order in ComparaTree
1 parent 4da1c69 commit 8f7c357

File tree

1 file changed

+81
-7
lines changed

1 file changed

+81
-7
lines changed

modules/EnsEMBL/Web/Component/Gene/ComparaTree.pm

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,71 @@ sub collapsed_nodes {
488488
}
489489
} elsif ($action =~ /rank_(\w+)/) {
490490
my $asked_rank = $1;
491-
my @rank_order = qw(subspecies species subgenus genus subfamily family superfamily parvorder infraorder suborder order superorder infraclass subclass class superclass subphylum phylum superphylum subkingdom kingdom superkingdom);
491+
492+
# Rank order info as described in
493+
# Schoch et al. (2020) NCBI Taxonomy: a comprehensive update on curation, resources and tools.
494+
# <https://europepmc.org/article/MED/32761142>,
495+
# with some updates based on NCBI Insights (2024-06-04) Upcoming changes to NCBI Taxonomy classifications.
496+
# <https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/>
497+
# and NCBI Insights (2025-04-25) NCBI Taxonomy updates to virus classification.
498+
# <https://ncbiinsights.ncbi.nlm.nih.gov/2025/04/25/ncbi-taxonomy-updates-virus-classification-april-2025/>.
499+
# Note that a handful of taxonomic lineages have ranks inconsistent with this ordering
500+
# (e.g. taxon 200324 of rank 'forma specialis' has parent taxon 860303 of rank 'varietas').
501+
# As of release 116, all such cases are below the species level, and species is the lowest taxonomic
502+
# rank in the collapse-by-rank dropdown, so these inconsistencies should not be an issue in practice.
503+
my @rank_order = (
504+
'isolate',
505+
'strain',
506+
'serotype',
507+
'biotype',
508+
'genotype',
509+
'serogroup',
510+
'pathogroup',
511+
'forma',
512+
'subvariety',
513+
'varietas',
514+
'form',
515+
'morph',
516+
'subspecies',
517+
'forma specialis',
518+
'special form',
519+
'species',
520+
'species subgroup',
521+
'species group',
522+
'subseries',
523+
'series',
524+
'subsection',
525+
'section',
526+
'subgenus',
527+
'genus',
528+
'subtribe',
529+
'tribe',
530+
'subfamily',
531+
'family',
532+
'superfamily',
533+
'parvorder',
534+
'infraorder',
535+
'suborder',
536+
'order',
537+
'superorder',
538+
'subcohort',
539+
'cohort',
540+
'infraclass',
541+
'subclass',
542+
'class',
543+
'superclass',
544+
'infraphylum',
545+
'subphylum',
546+
'phylum',
547+
'superphylum',
548+
'subkingdom',
549+
'kingdom',
550+
'domain',
551+
'realm',
552+
'cellular root',
553+
'acellular root',
554+
);
555+
492556
my %rank_pos = map {$rank_order[$_] => $_} 0..(scalar(@rank_order)-1);
493557
my @nodes_to_check = ($tree);
494558
while (@nodes_to_check) {
@@ -497,15 +561,25 @@ sub collapsed_nodes {
497561
next unless $internal_node->species_tree_node;
498562
my $taxon = $internal_node->species_tree_node->taxon;
499563
my $this_rank = $taxon->rank;
500-
if ($this_rank eq 'no rank') {
501-
# We traverse the taxonomy upwards until we find a rank, and get
502-
# the rank just below instead
503-
while ($this_rank eq 'no rank') {
564+
# Rank 'clade' is assigned to recognised groups without a formal rank.
565+
# See Schoch et al. (2020) NCBI Taxonomy: a comprehensive update on curation, resources and tools.
566+
# <https://europepmc.org/article/MED/32761142>.
567+
if ($this_rank eq 'no rank' || $this_rank eq 'clade') {
568+
# We traverse the taxonomy upwards until we find a rank
569+
my $i = 0;
570+
# We short-circuit the traversal if the number of steps exceeds the depth cutoff.
571+
# This cutoff must be greater than the maximum depth of the NCBI Taxonomy (~40),
572+
# with some headroom so we don't short-circuit a genuine lineage of unranked taxa.
573+
# The Hardy–Ramanujan number (1729) was chosen because it is a very interesting
574+
# number that happens to satisfy these criteria.
575+
my $depth_cutoff = 1729;
576+
do {
504577
$taxon = $taxon->parent;
505578
last unless $taxon;
506579
$this_rank = $taxon->rank;
507-
}
508-
$this_rank = $rank_pos{$this_rank}-1;
580+
$i += 1;
581+
} while (($this_rank eq 'no rank' || $this_rank eq 'clade') && $i < $depth_cutoff);
582+
$this_rank = $rank_pos{$this_rank};
509583
#warn sprintf("Mapped 'no rank' %s to %s\n", $internal_node->species_tree_node->taxon->name, $rank_order[$this_rank]);
510584
} else {
511585
$this_rank = $rank_pos{$this_rank};

0 commit comments

Comments
 (0)