Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added squirrel MPXV analysis #6666

Merged
merged 30 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8f6d310
added squirrel tool MPXV analysis
ammaraziz Jan 9, 2025
7e6a3b0
added shed.yml
ammaraziz Jan 9, 2025
7f6194d
updated test data
ammaraziz Jan 9, 2025
a38052a
Update tools/squirrel/squirrel-phylo.xml
ammaraziz Jan 9, 2025
ea58467
Update tools/squirrel/squirrel-phylo.xml
ammaraziz Jan 9, 2025
0f1ddaa
Update tools/squirrel/squirrel-phylo.xml
ammaraziz Jan 9, 2025
4f0ab36
Update tools/squirrel/squirrel-phylo.xml
ammaraziz Jan 9, 2025
18c814a
Update tools/squirrel/squirrel-qc.xml
ammaraziz Jan 9, 2025
72c9098
added masking options
ammaraziz Jan 11, 2025
fd09c29
removed masking, added more macros, changed tests
ammaraziz Jan 11, 2025
be05a9b
updated test data to be smaller
ammaraziz Jan 11, 2025
e012ac3
updated help
ammaraziz Jan 11, 2025
29c52cd
moved masking to phylo wrapper, and changed help
ammaraziz Jan 11, 2025
6725e7e
added conditional for advanced params
ammaraziz Jan 11, 2025
26949ce
removed category
ammaraziz Jan 11, 2025
03e7815
removed category
ammaraziz Jan 11, 2025
ba0b36d
changed from conditional to section
ammaraziz Jan 14, 2025
644743a
readded html outputs to tests
ammaraziz Jan 14, 2025
705e6e9
added test asserts for images
ammaraziz Jan 14, 2025
1ed3da8
removed html output, not useful
ammaraziz Jan 14, 2025
67b7eb0
updated test data
ammaraziz Jan 14, 2025
0f3314f
added asserts for tree files
ammaraziz Jan 14, 2025
9b4703f
smaller test files
ammaraziz Jan 15, 2025
95c3704
added outgroup param, changed tests to match
ammaraziz Jan 15, 2025
0aaacab
renamed test data
ammaraziz Jan 15, 2025
ceb954b
minified test data
ammaraziz Jan 15, 2025
1107d31
updated test data
ammaraziz Jan 15, 2025
d73eeec
removed input oddities - READY?
ammaraziz Jan 15, 2025
5fa0224
minor changes
ammaraziz Jan 16, 2025
d23daa5
typo
ammaraziz Jan 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions tools/squirrel/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Tool Shed metadata for the squirrel wrappers.
name: squirrel
owner: iuc
description: "QC and Phylogenetic analysis of MPXV"
# The block scalar body must be indented, otherwise the text is not part of long_description.
long_description: |
  Squirrel provides a rapid way of producing reliable alignments for MPXV
  and also enable maximum-likelihood phylogenetics pipeline tree estimation.
# Upstream project page.
homepage_url: https://github.com/aineniamh/squirrel
# Location of the wrapper source (distinct from the upstream project above).
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/squirrel
type: unrestricted
categories:
  - Phylogenetics
# The templates are children of auto_tool_repositories and must be nested under it.
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Wrapper for : {{ tool_name }}"
27 changes: 27 additions & 0 deletions tools/squirrel/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0"?>
<macros>
    <!-- Version of the wrapped squirrel package, and the wrapper revision appended to it in each tool's version attribute. -->
    <token name="@TOOL_VERSION@">1.0.12</token>
    <token name="@VERSION_SUFFIX@">0</token>

    <!-- Package requirement, pinned to @TOOL_VERSION@. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">squirrel</requirement>
        </requirements>
    </xml>

    <!-- Command used to report the installed squirrel version. -->
    <xml name="version_command">
        <version_command>squirrel --version</version_command>
    </xml>

    <!-- BibTeX entry pointing at the squirrel GitHub repository. -->
    <xml name="citations">
        <citations>
            <citation type="bibtex">
                @misc{githubsquirrel,
                author = {O'Toole, Áine},
                year = {2025},
                title = {Squirrel: Some QUIck Reconstruction to Resolve Evolutionary Links},
                publisher = {GitHub},
                journal = {GitHub},
                url = {https://github.com/aineniamh/squirrel},
                }
            </citation>
        </citations>
    </xml>
</macros>
211 changes: 211 additions & 0 deletions tools/squirrel/squirrel-phylo.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
<tool id="squirrel_phylo" name="Squirrel Phylo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
<description>Phylogenetic and APOBEC3 analysis of MPXV (Mpox virus)</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="requirements"/>
<expand macro="version_command"/>

<command detect_errors="exit_code"><![CDATA[
    ## The input is staged as input.fasta; the output files are collected under
    ## names starting with "input".
    #set $alignment_output = 'input.aln.fasta'
    #set $tree_output = 'input.tree'

    ## Extra files that are only collected when the APOBEC3 pipeline is requested.
    #if $apobec3
        #set $aa_recon_output = "input.tree.amino_acid.reconstruction.csv"
        #set $branch_snps_output = "input.tree.branch_snps.reconstruction.csv"
        #set $svg_output = "input.tree.svg"
        #set $png_output = "input.tree.png"
    #end if

    ln -s '${sequences}' input.fasta &&

    squirrel
    #if $apobec3
        --run-apobec3-phylo
        ## Figure sizes are optional: only pass a flag when a value is present,
        ## so a cleared field never yields a flag without an argument.
        #if str($other_settings.fig_height).strip()
            --fig-height $other_settings.fig_height
        #end if
        #if str($other_settings.fig_width).strip()
            --fig-width $other_settings.fig_width
        #end if
    #else
        --run-phylo
    #end if

    --clade '$clade'

    ## Everything below lives in the "other_settings" section and is addressed
    ## with its fully qualified name.
    #if $other_settings.mask_file
        --additional-mask '$other_settings.mask_file'
    #end if

    ## Without a user supplied background, fall back to the built-in background set.
    #if $other_settings.bg_file
        --background-file '$other_settings.bg_file'
    #else
        --include-background
    #end if

    ## Free text from the user: always quoted.
    #if str($other_settings.out_group).strip()
        --outgroups '$other_settings.out_group'
    #end if

    ## Boolean switches render as their flag or as an empty string.
    $other_settings.no_mask
    $other_settings.no_iter_mask

    --threads \${GALAXY_SLOTS:-1}
    input.fasta &&

    mv '${alignment_output}' '$alignment' &&
    mv '${tree_output}' '$tree'

    #if $apobec3
        && mv '${aa_recon_output}' '$aa_recon' &&
        mv '${branch_snps_output}' '$branch_snps' &&
        mv '${svg_output}' '$svg' &&
        mv '${png_output}' '$png'
    #end if
]]></command>

<inputs>
    <!-- Query genomes; staged as input.fasta by the command. -->
    <param name="sequences"
           type="data"
           format="fasta"
           label="Sequences in FASTA format" />
    <!-- Switches the command from run-phylo to run-apobec3-phylo and enables the four extra outputs. -->
    <param name="apobec3"
           type="boolean"
           checked="false"
           label="Run additional APOBEC3-mutation reconstruction pipeline" />
    <param name="clade"
           type="select"
           label="Select MPXV Clade">
        <option value="cladei">Clade I</option>
        <option value="cladeia">Clade Ia</option>
        <option value="cladeib">Clade Ib</option>
        <option value="cladeii">Clade II</option>
        <option value="cladeiia">Clade IIa</option>
        <option value="cladeiib">Clade IIb</option>
    </param>
    <section name="other_settings" expanded="false" title="Additional Settings">
        <param name="no_mask"
               type="boolean"
               truevalue="--no-mask"
               falsevalue=""
               label="SKIP masking repeat regions?"
               help="Set to True to Skip masking of repetitive regions. Default: masks repeat regions." />
        <param name="no_iter_mask"
               type="boolean"
               truevalue="--no-itr-mask"
               falsevalue=""
               label="SKIP masking of end ITR?"
               help="Set to True to skip masking of end ITR. Default: masks ITR" />
        <param name="mask_file"
               type="data"
               format="csv"
               optional="true"
               label="Mask additional sites"
               help="CSV of additional sites to mask, for example the mask file produced by the Squirrel QC tool." />
        <!-- When left empty the command passes the include-background flag instead. -->
        <param name="bg_file"
               type="data"
               format="fasta"
               optional="true"
               label="Background file - leave empty for automatic background sequences."
               help="FASTA file of background sequences for the phylogenetics pipeline. If left empty, a default background set is included; the set is determined by the 'clade' setting." />
        <param name="out_group"
               type="text"
               label="Specify outgroup(s)"
               help="Specify which MPXV outgroup(s) in the alignment to use in the phylogeny. These will get pruned out from the final tree." />
        <!-- The two figure sizes are only passed on when the APOBEC3 pipeline is enabled. -->
        <param name="fig_height"
               label="Overwrite tree figure default height"
               type="integer"
               min="0"
               value="25"
               optional="true"
               help="Only used when the APOBEC3 pipeline is enabled." />
        <param name="fig_width"
               label="Overwrite tree figure default width"
               type="integer"
               min="0"
               value="40"
               optional="true"
               help="Only used when the APOBEC3 pipeline is enabled." />
    </section>
</inputs>

<outputs>
    <!-- Standard outputs, always produced. -->
    <!-- NOTE(review): the tests assert a "#NEXUS" header in the tree file; confirm that "newick" is the intended datatype. -->
    <data name="tree" format="newick" label="${tool.name} - phylogenetic tree" />
    <data name="alignment" format="fasta" label="${tool.name} - aligned sequences" />
    <!-- APOBEC3 outputs, only created when the apobec3 switch is on. -->
    <data name="svg" format="svg" label="${tool.name} - phylotree svg image">
        <filter>apobec3</filter>
    </data>
    <data name="png" format="png" label="${tool.name} - phylotree png image">
        <filter>apobec3</filter>
    </data>
    <!-- The two reconstruction tables are CSV files (the command moves *.reconstruction.csv into them). -->
    <data name="aa_recon" format="csv" label="${tool.name} - aa mutations ancestral reconstruction">
        <filter>apobec3</filter>
    </data>
    <data name="branch_snps" format="csv" label="${tool.name} - apobec3 nt mutations">
        <filter>apobec3</filter>
    </data>
</outputs>

<tests>
    <!-- Plain phylogeny: only the alignment and the tree are expected. -->
    <test expect_num_outputs="2">
        <param name="sequences" value="test-sequences.fasta" />
        <param name="apobec3" value="false" />
        <!-- Section members are addressed through their section, mirroring the inputs. -->
        <section name="other_settings">
            <param name="bg_file" value="test-background.fasta" />
            <param name="out_group" value="KJ642615" />
        </section>
        <output name="alignment" file="sequences.aln.fasta" />
        <output name="tree">
            <assert_contents>
                <has_line_matching expression="#NEXUS"/>
            </assert_contents>
        </output>
    </test>

    <!-- APOBEC3 run: the four additional outputs are expected as well. -->
    <test expect_num_outputs="6">
        <param name="sequences" value="test-sequences.fasta" />
        <param name="apobec3" value="true" />
        <section name="other_settings">
            <param name="bg_file" value="test-background.fasta" />
            <param name="out_group" value="KJ642615" />
        </section>
        <output name="alignment" file="sequences.aln.fasta" />
        <output name="tree">
            <assert_contents>
                <has_line_matching expression="#NEXUS"/>
            </assert_contents>
        </output>
        <output name="svg">
            <assert_contents>
                <has_text text="svg xmlns:"/>
                <has_text text="DQ011155"/>
            </assert_contents>
        </output>
        <!-- The image is compared by size, within the given delta. -->
        <output name="png" file="sequences.tree.png" ftype="png" compare="sim_size" delta="1000" />
        <output name="aa_recon" file="sequences.tree.amino_acid.reconstruction.csv" />
        <output name="branch_snps" file="sequences.tree.branch_snps.reconstruction.csv" />
    </test>
</tests>
<help><![CDATA[
Squirrel rapidly produces reliable alignments for MPXV and also enables maximum-likelihood phylogenetic tree estimation.

Ensure your input sequences are of a singular clade and not mixed CladeI/CladeII. CladeI and CladeIa/b are fine to combine.

**Alignment**
Squirrel maps each query genome in the input file against a reference genome specific to each clade using minimap2. Using gofasta, the mapping file is then converted into a multiple sequence alignment.

For Clade II, the reference used is NC_063383 and for Clade I, we use NC_003310. This means that all coordinates within an alignment will be relative to these references. A benefit of this is that within a clade, alignment files can be combined without having to recalculate the alignment. Note however that insertions relative to the reference sequence will not be included in the alignment.

Squirrel by default creates a single alignment fasta file. Using the genbank coordinates for NC_063383 it also has the ability to extract the aligned coding sequences either as separate records or as a concatenated alignment. This can facilitate codon-aware phylogenetic or sequence analysis.

**APOBEC3**
Enrichment of APOBEC3-mutations in the MPXV population are a signature of sustained human-to-human transmission. Identifying APOBEC3-like mutations in MPXV genomes from samples in a new outbreak can be a piece of evidence to support sustained human transmission of mpox. Squirrel can run an APOBEC3-reconstruction and map these mutations onto the phylogeny.

**Default Masking**
Squirrel performs masking (replacement with N) on low-complexity or repetitive regions that have been characterised for Clade I and II. These regions are defined in to_mask.cladeii.csv and to_mask.cladei.csv (see github: https://github.com/aineniamh/squirrel/blob/main/squirrel/data/).

**Additional Masking**
An additional mask file can be provided to mask sites in addition to the default masking. To generate an additional mask file, run the Galaxy tool *Squirrel QC*.


]]></help>

<expand macro="citations" />
</tool>
79 changes: 79 additions & 0 deletions tools/squirrel/squirrel-qc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
<tool id="squirrel_qc" name="Squirrel QC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
<description>QC of MPXV (Mpox virus) sequences</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="requirements"/>
<expand macro="version_command"/>

<command detect_errors="exit_code"><![CDATA[
    ## Stage the input under a fixed name so the report paths below are predictable.
    ln -s '${sequences}' input.fasta &&

    squirrel
        --seq-qc
        --clade $clade
        --threads \${GALAXY_SLOTS:-1}
        input.fasta &&

    ## Move the two QC reports into the Galaxy outputs.
    mv 'input.suggested_mask.csv' '$mask' &&
    mv 'suggested_to_exclude.csv' '$exclude'
]]></command>

<inputs>
    <!-- Sequences to check; staged as input.fasta by the command. -->
    <param name="sequences" type="data" format="fasta" label="Sequences in FASTA format" />
    <!-- Value handed to the clade option of squirrel. -->
    <param name="clade" type="select" label="Select MPXV Clade">
        <option value="cladei">Clade I</option>
        <option value="cladeia">Clade Ia</option>
        <option value="cladeib">Clade Ib</option>
        <option value="cladeii">Clade II</option>
        <option value="cladeiia">Clade IIa</option>
        <option value="cladeiib">Clade IIb</option>
    </param>
</inputs>

<outputs>
    <!-- Both QC reports are CSV files and are always produced. -->
    <data name="mask"
          format="csv"
          label="${tool.name} - flagged mutations to mask" />
    <data name="exclude"
          format="csv"
          label="${tool.name} - flagged sequences to exclude" />
</outputs>

<tests>
    <!-- Single QC run on the shared test sequences with Clade II selected; both reports are compared to stored files. -->
    <test expect_num_outputs="2">
        <param name="sequences" value="test-sequences.fasta"/>
        <param name="clade" value="cladeii"/>
        <output name="mask" file="sequences.suggested_mask.csv"/>
        <output name="exclude" file="suggested_to_exclude.csv"/>
    </test>
</tests>
<help><![CDATA[
Squirrel in QC mode can run quality control (QC) on the alignment and flag certain sites to the user that may need to be masked. Squirrel can flag potential issues in the MPXV sequences that have been provided for alignment (e.g. SNPs near tracts of N, clusters of unique SNPs, reversions to reference alleles and convergent mutations) and outputs these in a mask file for investigation.

It is recommended that the user looks at these sites in an alignment viewer to judge whether the sites should be masked or not.

Squirrel will check within the alignment for:

- Mutations that are adjacent to N bases

The rationale for this is that N sites are usually a product of low coverage regions. Mutations that occur directly adjacent to low coverage regions may be a result of mis-alignment prior to the low coverage masking and may not be real SNPs. In squirrel, non-majority alleles that are present next to an N are flagged as potential sites for masking

- Unique mutations that clump together

If mutations are observed in only a single sequence in the genome, they are classed as unique mutations. Usually mutations do not clump closely together and may suggest an alignment or assembly issue. If these mutations are not shared with any other sequences, they are flagged for masking.

- Sequences with a high N content

Sequences that have many ambiguous bases in them are flagged that they may want to be excluded in further analysis. This may not always be appropriate, often genomes that have a lot of ambiguity can still be informative, however if there is something unusual about a sequence, having lots of ambiguities can be a flag for wider problems (like low read count during assembly).
]]></help>
<expand macro="citations"/>
</tool>
6 changes: 6 additions & 0 deletions tools/squirrel/test-data/sequences.aln.fasta

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tools/squirrel/test-data/sequences.suggested_mask.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Name,Minimum,Maximum,Length,present_in,note
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
site,gene,direction,snp,dimer,apobec,aa_position,parent,parent_codon,parent_aa,child,child_codon,child_aa,mutation_category,score,prediction,homoplasy,occurrence
1355,OPG001_CDS_180,reverse,G->T,,False,2,Node1,ACT,T,DQ011155,AAT,N,nonsynonymous,65,moderately conservative,False,1
6600,NA,NA,G->T,,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
8973,NA,NA,C->A,,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
11363,OPG023_CDS_173,reverse,T->C,,False,2,Node1,TAT,Y,DQ011155,TGT,C,nonsynonymous,194,radical,False,1
21633,OPG034_CDS_165,reverse,C->T,AC,False,3,Node1,TTG,L,DQ011155,TTA,L,synonymous,NA,NA,False,1
35994,OPG052_CDS_148,reverse,C->G,,False,2,Node1,AGT,S,DQ011155,ACT,T,nonsynonymous,58,moderately conservative,False,1
37989,OPG054_CDS_146,reverse,T->C,,False,2,Node1,CAA,Q,DQ011155,CGA,R,nonsynonymous,43,conservative,False,1
44147,OPG062_CDS_138,forward,A->C,,False,1,Node1,AGA,R,DQ011155,CGA,R,synonymous,NA,NA,False,1
58283,OPG074_CDS_127,reverse,A->G,,False,3,Node1,GTT,V,DQ011155,GTC,V,synonymous,NA,NA,False,1
73148,OPG089_CDS_112,forward,G->A,GA,True,1,Node1,GAG,E,DQ011155,AAG,K,nonsynonymous,56,moderately conservative,False,1
83797,OPG105_CDS_96,forward,C->T,GC,False,3,Node1,AGC,S,DQ011155,AGT,S,synonymous,NA,NA,False,1
97065,OPG115_CDS_86,forward,C->T,GC,False,1,Node1,CAT,H,DQ011155,TAT,Y,nonsynonymous,83,moderately conservative,False,1
110532,OPG127_CDS_74,reverse,G->A,GT,False,1,Node1,CAT,H,DQ011155,TAT,Y,nonsynonymous,83,moderately conservative,False,1
132943,OPG151_CDS_50,forward,C->T,GC,False,3,Node1,CGC,R,DQ011155,CGT,R,synonymous,NA,NA,False,1
137411,NA,NA,G->A,GC,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
140036,OPG155_CDS_47,reverse,C->A,,False,2,Node1,AGA,R,DQ011155,ATA,I,nonsynonymous,97,moderately conservative,False,1
143163,OPG161_CDS_41,forward,C->T,CC,False,3,Node1,GCC,A,DQ011155,GCT,A,synonymous,NA,NA,False,1
159492,OPG185_CDS_24,forward,T->C,,False,3,Node1,ATT,I,DQ011155,ATC,I,synonymous,NA,NA,False,1
165893,OPG191_CDS_19,forward,G->A,GT,False,3,Node1,ATG,M,DQ011155,ATA,I,nonsynonymous,10,conservative,False,1
171132,OPG199_CDS_14,forward,G->A,GC,False,3,Node1,ATG,M,DQ011155,ATA,I,nonsynonymous,10,conservative,False,1
178028,NA,NA,C->T,GC,False,NA,Node1,NA,NA,DQ011155,NA,NA,intergenic,NA,NA,False,1
183889,OPG210_CDS_7,forward,A->C,,False,3,Node1,ACA,T,DQ011155,ACC,T,synonymous,NA,NA,False,1
241,NA,NA,A->C,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
826,NA,NA,C->T,TC,True,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
1788,OPG002_CDS_179,reverse,G->A,GC,False,3,Node1,TGC,C,JX878413,TGT,C,synonymous,NA,NA,False,1
6100,OPG015_CDS_177,reverse,A->G,,False,2,Node1,GTA,V,JX878413,GCA,A,nonsynonymous,64,moderately conservative,False,1
8477,NA,NA,T->C,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
11156,OPG023_CDS_173,reverse,G->A,GC,False,2,Node1,GCC,A,JX878413,GTC,V,nonsynonymous,64,moderately conservative,False,1
16178,OPG027_CDS_170,reverse,C->A,,False,3,Node1,GAG,E,JX878413,GAT,D,nonsynonymous,45,conservative,False,1
20627,NA,NA,A->G,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
27593,NA,NA,C->T,TC,True,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
49030,OPG066_CDS_134,reverse,C->T,CC,False,3,Node1,AAG,K,JX878413,AAA,K,synonymous,NA,NA,False,1
51979,OPG068_CDS_133,forward,T->C,,False,3,Node1,TTT,F,JX878413,TTC,F,synonymous,NA,NA,False,1
54162,OPG071_CDS_130,reverse,C->T,AC,False,3,Node1,CTG,L,JX878413,CTA,L,synonymous,NA,NA,False,1
87349,OPG106_CDS_95,reverse,C->T,AC,False,3,Node1,GTG,V,JX878413,GTA,V,synonymous,NA,NA,False,1
89411,OPG109_CDS_92,reverse,T->C,,False,3,Node1,AAA,K,JX878413,AAG,K,synonymous,NA,NA,False,1
118195,OPG134_CDS_67,forward,C->T,AC,False,1,Node1,CTA,L,JX878413,TTA,L,synonymous,NA,NA,False,1
129252,OPG148_CDS_53,forward,T->C,,False,3,Node1,TGT,C,JX878413,TGC,C,synonymous,NA,NA,False,1
138312,OPG153_CDS_49,reverse,G->A,GG,False,2,Node1,CCC,P,JX878413,CTC,L,nonsynonymous,98,moderately conservative,False,1
142201,OPG160_CDS_42,reverse,G->T,,False,3,Node1,TCC,S,JX878413,TCA,S,synonymous,NA,NA,False,1
153104,NA,NA,G->A,GT,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
154113,NA,NA,G->A,GT,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
157592,NA,NA,C->A,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
168929,NA,NA,G->T,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
173151,OPG201_CDS_12,forward,C->T,AC,False,2,Node1,ACA,T,JX878413,ATA,I,nonsynonymous,89,moderately conservative,False,1
176367,OPG205_CDS_10,forward,C->T,AC,False,3,Node1,AAC,N,JX878413,AAT,N,synonymous,NA,NA,False,1
177394,OPG205_CDS_10,forward,C->T,TC,True,1,Node1,CGT,R,JX878413,TGT,C,nonsynonymous,180,radical,False,1
186579,OPG210_CDS_7,forward,G->T,,False,2,Node1,AGA,R,JX878413,ATA,I,nonsynonymous,97,moderately conservative,False,1
187075,NA,NA,C->G,,False,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
187738,NA,NA,C->T,TC,True,NA,Node1,NA,NA,JX878413,NA,NA,intergenic,NA,NA,False,1
190234,OPG016_CDS_5,forward,G->A,GC,False,3,Node1,TTG,L,JX878413,TTA,L,synonymous,NA,NA,False,1
190759,OPG015_CDS_4,forward,T->C,,False,2,Node1,GTA,V,JX878413,GCA,A,nonsynonymous,64,moderately conservative,False,1
Loading
Loading