Skip to content

Commit

Permalink
added squirrel MPXV analysis (#6666)
Browse files Browse the repository at this point in the history
* added squirrel tool MPXV analysis

* added shed.yml

* updated test data

* Update tools/squirrel/squirrel-phylo.xml

Co-authored-by: Björn Grüning <[email protected]>

* Update tools/squirrel/squirrel-phylo.xml

Co-authored-by: Björn Grüning <[email protected]>

* Update tools/squirrel/squirrel-phylo.xml

Co-authored-by: Björn Grüning <[email protected]>

* Update tools/squirrel/squirrel-phylo.xml

Co-authored-by: Björn Grüning <[email protected]>

* Update tools/squirrel/squirrel-qc.xml

Co-authored-by: Björn Grüning <[email protected]>

* added masking options

* removed masking, added more macros, changed tests

* updated test data to be smaller

* updated help

* moved masking to phylo wrapper, and changed help

* added conditional for advanced params

* removed category

* removed category

* changed from conditional to section

* readded html outputs to tests

* added test asserts for images

* removed html output, not useful

* updated test data

* added asserts for tree files

* smaller test files

* added outgroup param, changed tests to match

* renamed test data

* minified test data

* updated test data

* removed input oddities - READY?

* minor changes

* typo

---------

Co-authored-by: Björn Grüning <[email protected]>
  • Loading branch information
ammaraziz and bgruening authored Jan 16, 2025
1 parent ec96d43 commit ed19e40
Show file tree
Hide file tree
Showing 13 changed files with 10,772 additions and 0 deletions.
14 changes: 14 additions & 0 deletions tools/squirrel/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Tool Shed repository metadata for the squirrel wrappers.
name: squirrel
owner: iuc
description: 'QC and Phylogenetic analysis of MPXV'
# Literal block scalar: the line break between the two lines is preserved.
long_description: |
  Squirrel provides a rapid way of producing reliable alignments for MPXV
  and also enable maximum-likelihood phylogenetics pipeline tree estimation.
homepage_url: https://github.com/aineniamh/squirrel
remote_repository_url: https://github.com/aineniamh/squirrel
type: unrestricted
categories:
  - Phylogenetics
# Templates applied per tool XML in this directory; the placeholders must
# stay quoted because a bare "{{" would be parsed as a YAML flow mapping.
auto_tool_repositories:
  name_template: '{{ tool_id }}'
  description_template: 'Wrapper for : {{ tool_name }}'
27 changes: 27 additions & 0 deletions tools/squirrel/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Shared definitions imported by the squirrel tool wrappers in this directory. -->
<macros>
    <!-- Version of the squirrel package, and the wrapper revision appended after "+galaxy". -->
    <token name="@TOOL_VERSION@">1.0.12</token>
    <token name="@VERSION_SUFFIX@">0</token>

    <!-- Package dependency, pinned to @TOOL_VERSION@. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">squirrel</requirement>
        </requirements>
    </xml>

    <!-- Command used to report the installed squirrel version. -->
    <xml name="version_command">
        <version_command>squirrel --version</version_command>
    </xml>

    <!-- Citation for the upstream GitHub repository. -->
    <xml name="citations">
        <citations>
            <citation type="bibtex">
                @misc{githubsquirrel,
                author = {O'Toole, Áine},
                year = {2025},
                title = {Squirrel: Some QUIck Reconstruction to Resolve Evolutionary Links},
                publisher = {GitHub},
                journal = {GitHub},
                url = {https://github.com/aineniamh/squirrel},
                }
            </citation>
        </citations>
    </xml>
</macros>
211 changes: 211 additions & 0 deletions tools/squirrel/squirrel-phylo.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
<tool id="squirrel_phylo" name="Squirrel Phylo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Phylogenetic and APOBEC3 analysis of MPXV (Mpox virus)</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="version_command"/>

    <!--
        Runs squirrel in phylogenetics mode on one FASTA dataset.
        Always produces an alignment and a tree; when "apobec3" is enabled the
        APOBEC3 reconstruction tables and tree figures are collected as well.
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## The wrapper collects outputs named after the input file stem, so the
        ## dataset is linked to a fixed name ("input") to keep them predictable.
        #set $alignment_output = 'input.aln.fasta'
        #set $tree_output = 'input.tree'
        #if $apobec3
            #set $aa_recon_output = 'input.tree.amino_acid.reconstruction.csv'
            #set $branch_snps_output = 'input.tree.branch_snps.reconstruction.csv'
            #set $svg_output = 'input.tree.svg'
            #set $png_output = 'input.tree.png'
        #end if

        ln -s '$sequences' input.fasta &&

        squirrel
            #if $apobec3
                --run-apobec3-phylo
                ## Both figure sizes are optional: only pass a flag when a value
                ## is present, otherwise the flag would be emitted without an
                ## argument. str() is used because 0 is a permitted value.
                #if str($other_settings.fig_height) != ''
                    --fig-height $other_settings.fig_height
                #end if
                #if str($other_settings.fig_width) != ''
                    --fig-width $other_settings.fig_width
                #end if
            #else
                --run-phylo
            #end if
            --clade '$clade'
            #if $other_settings.mask_file
                --additional-mask '$other_settings.mask_file'
            #end if
            ## Without a user supplied background, fall back to the background
            ## set that ships with squirrel for the selected clade.
            #if $other_settings.bg_file
                --background-file '$other_settings.bg_file'
            #else
                --include-background
            #end if
            #if str($other_settings.out_group).strip() != ''
                --outgroups '$other_settings.out_group'
            #end if
            $other_settings.no_mask
            $other_settings.no_iter_mask
            --threads \${GALAXY_SLOTS:-1}
            input.fasta &&

        mv '$alignment_output' '$alignment' &&
        mv '$tree_output' '$tree'
        #if $apobec3
            && mv '$aa_recon_output' '$aa_recon'
            && mv '$branch_snps_output' '$branch_snps'
            && mv '$svg_output' '$svg'
            && mv '$png_output' '$png'
        #end if
    ]]></command>

    <inputs>
        <param name="sequences"
               type="data"
               format="fasta"
               label="Sequences in fasta format"
               help="Query MPXV genome sequences to align and place in the phylogeny. Upload a FASTA file to the history to select it here."/>
        <param name="apobec3"
               type="boolean"
               checked="false"
               label="Run additional APOBEC3-mutation reconstruction pipeline"/>
        <param name="clade"
               type="select"
               label="Select MPXV Clade">
            <option value="cladei">Clade I</option>
            <option value="cladeia">Clade Ia</option>
            <option value="cladeib">Clade Ib</option>
            <option value="cladeii">Clade II</option>
            <option value="cladeiia">Clade IIa</option>
            <option value="cladeiib">Clade IIb</option>
        </param>
        <!-- Everything in this section is referenced as $other_settings.NAME in the command. -->
        <section name="other_settings" expanded="false" title="Additional Settings">
            <param name="no_mask"
                   type="boolean"
                   truevalue="--no-mask"
                   falsevalue=""
                   label="SKIP masking repeat regions?"
                   help="Set to True to Skip masking of repetitive regions. Default: masks repeat regions."/>
            <param name="no_iter_mask"
                   type="boolean"
                   truevalue="--no-itr-mask"
                   falsevalue=""
                   label="SKIP masking of end ITR?"
                   help="Set to True to skip masking of end ITR. Default: masks ITR"/>
            <param name="mask_file"
                   type="data"
                   format="csv"
                   optional="true"
                   label="Mask additional sites"
                   help="Run squirrel in alignment with QC to generate the SNP mask file."/>
            <param name="bg_file"
                   type="data"
                   format="fasta"
                   optional="true"
                   label="Background file - leave empty for automatic background sequences."
                   help="Include a default background set of sequences for the phylogenetics pipeline. The set will be determined by previous 'clade' setting"/>
            <param name="out_group"
                   type="text"
                   label="Specify outgroup(s)"
                   help="Specify which MPXV outgroup(s) in the alignment to use in the phylogeny. These will get pruned out from the final tree."/>
            <param name="fig_height"
                   type="integer"
                   min="0"
                   value="25"
                   optional="true"
                   label="Overwrite tree figure default height"
                   help="Only used when the APOBEC3-mutation reconstruction pipeline is enabled."/>
            <param name="fig_width"
                   type="integer"
                   min="0"
                   value="40"
                   optional="true"
                   label="Overwrite tree figure default width"
                   help="Only used when the APOBEC3-mutation reconstruction pipeline is enabled."/>
        </section>
    </inputs>

    <outputs>
        <!-- standard outputs -->
        <!-- The tree file starts with a #NEXUS header (asserted in the tests), so it is typed as Nexus. -->
        <data name="tree" format="nex" label="${tool.name} - phylogenetic tree"/>
        <data name="alignment" format="fasta" label="${tool.name} - aligned sequences"/>
        <!-- apobec3 outputs -->
        <data name="svg" format="svg" label="${tool.name} - phylotree svg image">
            <filter>apobec3</filter>
        </data>
        <data name="png" format="png" label="${tool.name} - phylotree png image">
            <filter>apobec3</filter>
        </data>
        <!-- Both reconstruction outputs are CSV tables, not images. -->
        <data name="aa_recon" format="csv" label="${tool.name} - aa mutations ancestral reconstruction">
            <filter>apobec3</filter>
        </data>
        <data name="branch_snps" format="csv" label="${tool.name} - apobec3 nt mutations">
            <filter>apobec3</filter>
        </data>
    </outputs>

    <tests>
        <!-- Plain phylogeny: only the alignment and the tree are produced. -->
        <test expect_num_outputs="2">
            <param name="sequences" value="test-sequences.fasta"/>
            <param name="apobec3" value="false"/>
            <section name="other_settings">
                <param name="bg_file" value="test-background.fasta"/>
                <param name="out_group" value="KJ642615"/>
            </section>
            <output name="alignment" file="sequences.aln.fasta"/>
            <output name="tree">
                <assert_contents>
                    <has_line_matching expression="#NEXUS"/>
                </assert_contents>
            </output>
        </test>

        <!-- APOBEC3 reconstruction: adds two figures and two CSV tables. -->
        <test expect_num_outputs="6">
            <param name="sequences" value="test-sequences.fasta"/>
            <param name="apobec3" value="true"/>
            <section name="other_settings">
                <param name="bg_file" value="test-background.fasta"/>
                <param name="out_group" value="KJ642615"/>
            </section>
            <output name="alignment" file="sequences.aln.fasta"/>
            <output name="tree">
                <assert_contents>
                    <has_line_matching expression="#NEXUS"/>
                </assert_contents>
            </output>
            <output name="svg">
                <assert_contents>
                    <has_text text="svg xmlns:"/>
                    <has_text text="DQ011155"/>
                </assert_contents>
            </output>
            <output name="png" file="sequences.tree.png" ftype="png" compare="sim_size" delta="1000"/>
            <output name="aa_recon" file="sequences.tree.amino_acid.reconstruction.csv" ftype="csv"/>
            <output name="branch_snps" file="sequences.tree.branch_snps.reconstruction.csv" ftype="csv"/>
        </test>
    </tests>

    <help><![CDATA[
squirrel allows for rapidly producing reliable alignments for MPXV and also enables maximum-likelihood phylogenetics pipeline tree estimation.

Ensure your input sequences are of a singular clade and not mixed CladeI/CladeII. CladeI and CladeIa/b are fine to combine.

**Alignment**

Squirrel maps each query genome in the input file against a reference genome specific to each clade using minimap2. Using gofasta, the mapping file is then converted into a multiple sequence alignment.

For Clade II, the reference used is NC_063383 and for Clade I, we use NC_003310. This means that all coordinates within an alignment will be relative to these references. A benefit of this is that within a clade, alignment files can be combined without having to recalculate the alignment. Note however that insertions relative to the reference sequence will not be included in the alignment.

Squirrel by default creates a single alignment fasta file. Using the genbank coordinates for NC_063383 it also has the ability to extract the aligned coding sequences either as separate records or as a concatenated alignment. This can facilitate codon-aware phylogenetic or sequence analysis.

**APOBEC3**

Enrichment of APOBEC3-mutations in the MPXV population are a signature of sustained human-to-human transmission. Identifying APOBEC3-like mutations in MPXV genomes from samples in a new outbreak can be a piece of evidence to support sustained human transmission of mpox. Squirrel can run an APOBEC3-reconstruction and map these mutations onto the phylogeny.

**Default Masking**

Squirrel performs masking (replacement with N) on low-complexity or repetitive regions that have been characterised for Clade I and II. These regions are defined in to_mask.cladeii.csv and to_mask.cladei.csv (see github: https://github.com/aineniamh/squirrel/blob/main/squirrel/data/).

**Additional Masking**

Additional mask file can be provided to mask sites in addition to default masking. To generate additional masking file, run the galaxy tool *squirrel-qc*
    ]]></help>

    <expand macro="citations"/>
</tool>
79 changes: 79 additions & 0 deletions tools/squirrel/squirrel-qc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
<tool id="squirrel_qc" name="Squirrel QC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>QC of MPXV (Mpox virus) sequences</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="version_command"/>

    <!--
        Runs squirrel in sequence QC mode on one FASTA dataset and collects the
        two CSV reports: sites suggested for masking and sequences suggested
        for exclusion.
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## The dataset is linked to a fixed name so the report file names
        ## collected below are predictable.
        ## NOTE(review): the mask report carries the "input." prefix while the
        ## exclusion report does not; kept as in the original wrapper, confirm
        ## against squirrel @TOOL_VERSION@ if the file names ever change.
        #set $mask_output = 'input.suggested_mask.csv'
        #set $exclude_output = 'suggested_to_exclude.csv'

        ln -s '$sequences' input.fasta &&

        squirrel
            --seq-qc
            --clade '$clade'
            --threads \${GALAXY_SLOTS:-1}
            input.fasta &&

        mv '$mask_output' '$mask' &&
        mv '$exclude_output' '$exclude'
    ]]></command>

    <inputs>
        <param name="sequences"
               type="data"
               format="fasta"
               label="Sequences in FASTA format"
               help="MPXV genome sequences to run quality control on."/>
        <param name="clade"
               type="select"
               label="Select MPXV Clade">
            <option value="cladei">Clade I</option>
            <option value="cladeia">Clade Ia</option>
            <option value="cladeib">Clade Ib</option>
            <option value="cladeii">Clade II</option>
            <option value="cladeiia">Clade IIa</option>
            <option value="cladeiib">Clade IIb</option>
        </param>
    </inputs>

    <outputs>
        <!-- standard outputs -->
        <data name="mask" format="csv" label="${tool.name} - flagged mutations to mask"/>
        <data name="exclude" format="csv" label="${tool.name} - flagged sequences to exclude"/>
    </outputs>

    <tests>
        <test expect_num_outputs="2">
            <param name="sequences" value="test-sequences.fasta"/>
            <param name="clade" value="cladeii"/>
            <output name="mask" file="sequences.suggested_mask.csv"/>
            <output name="exclude" file="suggested_to_exclude.csv"/>
        </test>
    </tests>

    <help><![CDATA[
Squirrel in QC mode can run quality control (QC) on the alignment and flag certain sites to the user that may need to be masked. Squirrel can flag potential issues in the MPXV sequences that have been provided for alignment (e.g. SNPs near tracts of N, clusters of unique SNPs, reversions to reference alleles and convergent mutations) and outputs these in a mask file for investigation.

It is recommended that the user looks at these sites in an alignment viewer to judge whether the sites should be masked or not.

Squirrel will check within the alignment for:

- Mutations that are adjacent to N bases

  The rationale for this is that N sites are usually a product of low coverage regions. Mutations that occur directly adjacent to low coverage regions may be a result of mis-alignment prior to the low coverage masking and may not be real SNPs. In squirrel, non-majority alleles that are present next to an N are flagged as potential sites for masking

- Unique mutations that clump together

  If mutations are observed in only a single sequence in the genome, they are classed as unique mutations. Usually mutations do not clump closely together and may suggest an alignment or assembly issue. If these mutations are not shared with any other sequences, they are flagged for masking.

- Sequences with a high N content

  Sequences that have many ambiguous bases in them are flagged that they may want to be excluded in further analysis. This may not always be appropriate, often genomes that have a lot of ambiguity can still be informative, however if there is something unusual about a sequence, having lots of ambiguities can be a flag for wider problems (like low read count during assembly).
    ]]></help>

    <expand macro="citations"/>
</tool>
6 changes: 6 additions & 0 deletions tools/squirrel/test-data/sequences.aln.fasta

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tools/squirrel/test-data/sequences.suggested_mask.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Name,Minimum,Maximum,Length,present_in,note
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
site,gene,direction,snp,dimer,apobec,aa_position,parent,parent_codon,parent_aa,child,child_codon,child_aa,mutation_category,score,prediction,homoplasy,occurrence
1355,OPG001_CDS_180,reverse,G->T,,False,2,Node1,ACT,T,DQ011155,AAT,N,nonsynonymous,65,moderately conservative,False,1
6600,NA,NA,G->T,,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
8973,NA,NA,C->A,,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
11363,OPG023_CDS_173,reverse,T->C,,False,2,Node1,TAT,Y,DQ011155,TGT,C,nonsynonymous,194,radical,False,1
21633,OPG034_CDS_165,reverse,C->T,AC,False,3,Node1,TTG,L,DQ011155,TTA,L,synonymous,NA,NA,False,1
35994,OPG052_CDS_148,reverse,C->G,,False,2,Node1,AGT,S,DQ011155,ACT,T,nonsynonymous,58,moderately conservative,False,1
37989,OPG054_CDS_146,reverse,T->C,,False,2,Node1,CAA,Q,DQ011155,CGA,R,nonsynonymous,43,conservative,False,1
44147,OPG062_CDS_138,forward,A->C,,False,1,Node1,AGA,R,DQ011155,CGA,R,synonymous,NA,NA,False,1
58283,OPG074_CDS_127,reverse,A->G,,False,3,Node1,GTT,V,DQ011155,GTC,V,synonymous,NA,NA,False,1
73148,OPG089_CDS_112,forward,G->A,GA,True,1,Node1,GAG,E,DQ011155,AAG,K,nonsynonymous,56,moderately conservative,False,1
83797,OPG105_CDS_96,forward,C->T,GC,False,3,Node1,AGC,S,DQ011155,AGT,S,synonymous,NA,NA,False,1
97065,OPG115_CDS_86,forward,C->T,GC,False,1,Node1,CAT,H,DQ011155,TAT,Y,nonsynonymous,83,moderately conservative,False,1
110532,OPG127_CDS_74,reverse,G->A,GT,False,1,Node1,CAT,H,DQ011155,TAT,Y,nonsynonymous,83,moderately conservative,False,1
132943,OPG151_CDS_50,forward,C->T,GC,False,3,Node1,CGC,R,DQ011155,CGT,R,synonymous,NA,NA,False,1
137411,NA,NA,G->A,GC,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
140036,OPG155_CDS_47,reverse,C->A,,False,2,Node1,AGA,R,DQ011155,ATA,I,nonsynonymous,97,moderately conservative,False,1
143163,OPG161_CDS_41,forward,C->T,CC,False,3,Node1,GCC,A,DQ011155,GCT,A,synonymous,NA,NA,False,1
159492,OPG185_CDS_24,forward,T->C,,False,3,Node1,ATT,I,DQ011155,ATC,I,synonymous,NA,NA,False,1
165893,OPG191_CDS_19,forward,G->A,GT,False,3,Node1,ATG,M,DQ011155,ATA,I,nonsynonymous,10,conservative,False,1
171132,OPG199_CDS_14,forward,G->A,GC,False,3,Node1,ATG,M,DQ011155,ATA,I,nonsynonymous,10,conservative,False,1
178028,NA,NA,C->T,GC,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
183889,OPG210_CDS_7,forward,A->C,,False,3,Node1,ACA,T,DQ011155,ACC,T,synonymous,NA,NA,False,1
241,NA,NA,A->C,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
826,NA,NA,C->T,TC,True,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
1788,OPG002_CDS_179,reverse,G->A,GC,False,3,Node1,TGC,C,JX878413,TGT,C,synonymous,NA,NA,False,1
6100,OPG015_CDS_177,reverse,A->G,,False,2,Node1,GTA,V,JX878413,GCA,A,nonsynonymous,64,moderately conservative,False,1
8477,NA,NA,T->C,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
11156,OPG023_CDS_173,reverse,G->A,GC,False,2,Node1,GCC,A,JX878413,GTC,V,nonsynonymous,64,moderately conservative,False,1
16178,OPG027_CDS_170,reverse,C->A,,False,3,Node1,GAG,E,JX878413,GAT,D,nonsynonymous,45,conservative,False,1
20627,NA,NA,A->G,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
27593,NA,NA,C->T,TC,True,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
49030,OPG066_CDS_134,reverse,C->T,CC,False,3,Node1,AAG,K,JX878413,AAA,K,synonymous,NA,NA,False,1
51979,OPG068_CDS_133,forward,T->C,,False,3,Node1,TTT,F,JX878413,TTC,F,synonymous,NA,NA,False,1
54162,OPG071_CDS_130,reverse,C->T,AC,False,3,Node1,CTG,L,JX878413,CTA,L,synonymous,NA,NA,False,1
87349,OPG106_CDS_95,reverse,C->T,AC,False,3,Node1,GTG,V,JX878413,GTA,V,synonymous,NA,NA,False,1
89411,OPG109_CDS_92,reverse,T->C,,False,3,Node1,AAA,K,JX878413,AAG,K,synonymous,NA,NA,False,1
118195,OPG134_CDS_67,forward,C->T,AC,False,1,Node1,CTA,L,JX878413,TTA,L,synonymous,NA,NA,False,1
129252,OPG148_CDS_53,forward,T->C,,False,3,Node1,TGT,C,JX878413,TGC,C,synonymous,NA,NA,False,1
138312,OPG153_CDS_49,reverse,G->A,GG,False,2,Node1,CCC,P,JX878413,CTC,L,nonsynonymous,98,moderately conservative,False,1
142201,OPG160_CDS_42,reverse,G->T,,False,3,Node1,TCC,S,JX878413,TCA,S,synonymous,NA,NA,False,1
153104,NA,NA,G->A,GT,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
154113,NA,NA,G->A,GT,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
157592,NA,NA,C->A,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
168929,NA,NA,G->T,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
173151,OPG201_CDS_12,forward,C->T,AC,False,2,Node1,ACA,T,JX878413,ATA,I,nonsynonymous,89,moderately conservative,False,1
176367,OPG205_CDS_10,forward,C->T,AC,False,3,Node1,AAC,N,JX878413,AAT,N,synonymous,NA,NA,False,1
177394,OPG205_CDS_10,forward,C->T,TC,True,1,Node1,CGT,R,JX878413,TGT,C,nonsynonymous,180,radical,False,1
186579,OPG210_CDS_7,forward,G->T,,False,2,Node1,AGA,R,JX878413,ATA,I,nonsynonymous,97,moderately conservative,False,1
187075,NA,NA,C->G,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
187738,NA,NA,C->T,TC,True,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
190234,OPG016_CDS_5,forward,G->A,GC,False,3,Node1,TTG,L,JX878413,TTA,L,synonymous,NA,NA,False,1
190759,OPG015_CDS_4,forward,T->C,,False,2,Node1,GTA,V,JX878413,GCA,A,nonsynonymous,64,moderately conservative,False,1
Loading

0 comments on commit ed19e40

Please sign in to comment.