galaxyproject · mthang · Aug 27, 2025 · Sep 22, 2025 · Sep 23, 2025 · Sep 26, 2025
diff --git a/tools/autocycler/.shed.yml b/tools/autocycler/.shed.yml
@@ -0,0 +1,23 @@
+---
+auto_tool_repositories:  # one Tool Shed repository per tool XML in this directory
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }} from the autocycler suite"
+categories:
+  - Assembly
+description: Autocycler is a tool for generating consensus long-read assemblies for bacterial genomes. It is the successor to Trycycler.
+exclude:  # planemo test reports must not be uploaded with the wrappers
+  - tool_test_output.html
+  - tool_test_output.json
+homepage_url: https://github.com/rrwick/Autocycler
+long_description: >
+  Autocycler is a tool for generating consensus long-read assemblies for bacterial genomes.
+name: autocycler
+owner: iuc
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/autocycler  # wrapper source for owner iuc
+suite:  # umbrella repository grouping the individual autocycler tools
+  name: suite_autocycler
+  description: >
+    Autocycler is a tool for generating consensus long-read assemblies for bacterial genomes.
+  long_description: >
+    Autocycler is a tool for generating consensus long-read assemblies for bacterial genomes.
+type: unrestricted
diff --git a/tools/autocycler/autocycler_clean.xml b/tools/autocycler/autocycler_clean.xml
@@ -0,0 +1,91 @@
+<tool id="autocycler_clean" name="AutoCycler Clean" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+    <description>clean and modify assembly graph by removing or duplicating segments</description>
+    <macros>
+          <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Expose the input graph under a fixed, .gfa-suffixed name in the
+        ## working directory and run autocycler clean on that link.
+        ln -s '$in_gfa' consensus_assembly.gfa &&
+        autocycler clean
+        --in_gfa consensus_assembly.gfa
+        --out_gfa '$cleaned_gfa'
+        ## Each tig list is passed only when the user filled it in.
+        #if $remove:
+            --remove '$remove'
+        #end if
+        #if $duplicate:
+            --duplicate '$duplicate'
+        #end if
+    ]]></command>
+    <inputs>
+        <!-- Graph to edit; the command symlinks it as consensus_assembly.gfa. -->
+        <param name="in_gfa" type="data" format="gfa1" label="Input GFA file"
+               help="Assembly graph to clean/modify"/>
+        <!-- Both tig lists are optional: the command emits each flag only when a value is set. -->
+        <param argument="--remove" type="text" optional="true" label="Segments to remove"
+               help="Comma-separated list of tig numbers to remove (e.g., '1,5,8')"/>
+        <param argument="--duplicate" type="text" optional="true" label="Segments to duplicate"
+               help="Comma-separated list of tig numbers to duplicate (e.g., '2,3,7')"/>
+    </inputs>
+    <outputs>
+        <!-- Edited graph; the command passes this dataset to autocycler clean as its out_gfa target. -->
+        <data format="gfa1" name="cleaned_gfa" label="${tool.name} on ${on_string}: cleaned assembly"/>
+    </outputs>
+    <tests>
+        <test><!-- Removes tigs 743 and 782 from the unfiltered consensus graph. -->
+            <param name="in_gfa" value="consensus_assembly_unfiltered.gfa"/>
+            <param name="remove" value="743,782"/>
+            <output name="cleaned_gfa" ftype="gfa1" file="clean.gfa">
+                <assert_contents>
+                    <has_n_lines n="171"/>
+                    <has_size value="46397" delta="1000"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**AutoCycler Clean Tool**
+Autocycler clean allows users to remove or duplicate sequences in a consensus assembly produced by Autocycler combine.
+
+**Inputs**
+
+- Input GFA file (assembly graph)
+- Optional lists of segments to remove or duplicate
+
+**Parameters**
+
+- Segments to remove: Comma-separated list of tig numbers to remove from graph
+- Segments to duplicate: Comma-separated list of tig numbers to duplicate in graph
+
+**Outputs**
+
+- Cleaned/modified GFA file
+
+**Notes**
+
+- Viewing consensus_assembly.gfa in Bandage can be useful for determining which tigs should be kept/deleted.
+- Depth values in consensus_assembly.gfa represent the number of input sequences that contributed to each sequence. Users will typically delete lower-depth tigs to prioritise resolving higher-depth tigs.
+- The output GFA created by Autocycler clean is also a valid input. This means that you can clean in multiple stages, using the output of one round of cleaning as the input for the next.
+- The values for -r and -d can contain spaces if they are enclosed in quotes. For example: -r "7, 8". This allows for copy-pasting from Bandage's 'selected nodes' list.
+- If -r and -d are not specified, no sequences will be removed, and the input graph will remain unchanged.
+- If any of the specified tigs in -r or -d do not exist in the input graph, Autocycler clean will return an error and terminate. Ensure tig IDs match those in the GFA.
+- The only tigs which can be duplicated are those which contain exactly two links to other tigs. Each copy of the duplicated tig will keep one of the links.
+- If an invalid tig ID is specified in -d (e.g., a tig with more than two links), Autocycler will return an error and terminate. Verify tig IDs and their properties in Bandage before running the command.
+- Useful for manual curation of assembly graphs
+- Can remove problematic segments or amplify important ones
+- Both removal and duplication can be performed in one operation
+- Input tig numbers should match those in your GFA file
+
+.. class:: infomark
+
+**More Information**
+
+- `Autocycler clean <https://github.com/rrwick/Autocycler/wiki/Autocycler-clean>`_
+
+**Citations**
+
+    ]]></help>
+	    <expand macro="citations"/>
+</tool>
diff --git a/tools/autocycler/autocycler_cluster.xml b/tools/autocycler/autocycler_cluster.xml
@@ -0,0 +1,182 @@
+<tool id="autocycler_cluster" name="AutoCycler Cluster" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+    <description>cluster assemblies based on compressed representations</description>
+    <macros>
+          <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Stage the collection elements in one directory, each linked as
+        ## <element identifier>.yaml or <element identifier>.gfa.
+        #import re
+        #set input_directory = 'assemblies'
+        mkdir -p out_dir '$input_directory' &&
+
+        #for $input_file in $autocycler_dir:
+            ## Identifiers are user-controlled: keep only characters that are
+            ## safe inside the single-quoted link target below.
+            #set $link_name = re.sub('[^\w.-]', '_', str($input_file.element_identifier))
+            #if $input_file.is_of_type('yaml'):
+                ln -s '$input_file' '$input_directory/${link_name}.yaml' &&
+            #elif $input_file.is_of_type('gfa1'):
+                ln -s '$input_file' '$input_directory/${link_name}.gfa' &&
+            #else
+                echo '${link_name} is neither a yaml nor a gfa file' >&2 &&
+            #end if
+        #end for
+
+        ## Run the clustering on the staged directory.
+        autocycler cluster
+        --autocycler_dir '$input_directory'
+        --cutoff '$cutoff'
+        ## The flag is only passed for values above 0.
+        #if $min_assemblies and int($min_assemblies) > 0:
+            --min_assemblies '$min_assemblies'
+        #end if
+        --max_contigs '$max_contigs'
+        #if $manual:
+            --manual '$manual'
+        #end if
+
+        ## Copy the clustering directory found inside the staged directory
+        ## into out_dir, where the output collections are discovered.
+        && cp -r '$input_directory'/clustering out_dir/
+
+        ## clustering.newick/.yaml/.tsv share one basename and would end up with
+        ## the same element identifier, so the extension is folded into the name.
+        && mv out_dir/clustering/clustering.newick out_dir/clustering/clustering_newick.newick
+        && mv out_dir/clustering/clustering.yaml out_dir/clustering/clustering_yaml.yaml
+        && mv out_dir/clustering/clustering.tsv out_dir/clustering/clustering_tsv.tsv
+
+        ## Same for 1_untrimmed.yaml next to 1_untrimmed.gfa in every qc_pass and
+        ## qc_fail cluster. The -e test skips a glob that matched nothing (it stays
+        ## literal), so a missing or empty qc directory does not fail the job.
+        && for file in out_dir/clustering/qc_pass/*/1_untrimmed.yaml out_dir/clustering/qc_fail/*/1_untrimmed.yaml; do
+            if [ -e "\$file" ]; then
+                mv "\$file" "\${file%/*}/1_untrimmed_yaml.yaml";
+            fi;
+        done
+    ]]></command>
+    <inputs>
+	<!-- Main input directory -->
+	<param name="autocycler_dir" type="data_collection" collection_type="list"  format="yaml,gfa1" label="GFA and Yaml File" 
+	       help="Directory containing input_assemblies.gfa file (from compress step)"/>
+
+        <!-- Clustering cutoff -->
+        <param name="cutoff" type="float" min="0" max="1" value="0.2" label="Clustering cutoff"
+               help="Distance threshold for hierarchical clustering (0-1)"/>
+
+        <!-- Minimum assemblies -->
+	<param name="min_assemblies" type="integer" min="0" value="0" label="Minimum assemblies per cluster" />
+
+
+        <!-- Max contigs -->
+        <param name="max_contigs" type="integer" min="1" value="25" label="Maximum contigs per assembly"
+               help="Refuse to run if mean contigs per assembly exceeds this value"/>
+
+        <!-- Manual clustering  -->
+        <param name="manual" type="text" value="" label="Cluster "/>
+    </inputs>
+    <outputs>
+        <collection name="cluster_results" type="list" label="${tool.name} on ${on_string}: clustering results"><!-- discovered from out_dir/clustering -->
+            <discover_datasets directory="out_dir/clustering" format="phylip" pattern="(?P&lt;designation&gt;.+)\.phylip$"/>
+            <discover_datasets directory="out_dir/clustering" format="newick" pattern="(?P&lt;designation&gt;.+)\.newick$"/>
+            <discover_datasets directory="out_dir/clustering" format="yaml" pattern="(?P&lt;designation&gt;.+)\.yaml$"/>
+            <discover_datasets directory="out_dir/clustering" format="tsv" pattern="(?P&lt;designation&gt;.+)\.tsv$"/>
+        </collection>
+        <collection name="qc_pass" type="list:list" label="${tool.name} on ${on_string}: clustering QC pass"><!-- outer identifier: leading path component, inner identifier: file stem -->
+            <discover_datasets directory="out_dir/clustering/qc_pass/" recurse="true" match_relative_path="true" format="yaml" pattern="(?P&lt;identifier_0&gt;.*?)/(?P&lt;identifier_1&gt;.+)\.yaml$"/>
+            <discover_datasets directory="out_dir/clustering/qc_pass/" recurse="true" match_relative_path="true" format="gfa1" pattern="(?P&lt;identifier_0&gt;.*?)/(?P&lt;identifier_1&gt;.+)\.gfa$"/>
+        </collection>
+        <collection name="qc_fail" type="list:list" label="${tool.name} on ${on_string}: clustering QC fail"><!-- same layout as qc_pass -->
+            <discover_datasets directory="out_dir/clustering/qc_fail/" recurse="true" match_relative_path="true" format="gfa1" pattern="(?P&lt;identifier_0&gt;.*?)/(?P&lt;identifier_1&gt;.+)\.gfa$"/>
+            <discover_datasets directory="out_dir/clustering/qc_fail/" recurse="true" match_relative_path="true" format="yaml" pattern="(?P&lt;identifier_0&gt;.*?)/(?P&lt;identifier_1&gt;.+)\.yaml$"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test><!-- Default cutoff and max_contigs with only input_assemblies.gfa in the collection. -->
+            <param name="autocycler_dir">
+                <collection type="list">
+                    <element name="input_assemblies" value="input_assemblies.gfa" ftype="gfa1"/>
+                </collection>
+            </param>
+            <param name="cutoff" value="0.2"/>
+            <param name="max_contigs" value="25"/>
+            <output_collection name="cluster_results" type="list">
+                <element name="pairwise_distances" ftype="phylip" file="pairwise_distances.phylip">
+                    <assert_contents>
+                        <has_n_lines n="20"/>
+                        <has_size value="4609" delta="100"/>
+                        <has_text text="flye_01.fasta contig_1"/>
+                        <has_text text="flye_04.fasta contig_6"/>
+                    </assert_contents>
+                </element>
+                <element name="clustering_newick" ftype="newick" file="clustering_newick.newick" compare="diff" lines_diff="10">
+                    <assert_contents>
+                        <has_n_lines n="1"/>
+                        <has_size value="1500" delta="200"/>
+                    </assert_contents>
+                </element>
+                <element name="clustering_yaml" ftype="yaml" file="clustering_yaml.yaml">
+                    <assert_contents>
+                        <has_n_lines n="9"/>
+                        <has_size value="303" delta="100"/>
+                        <has_text text="pass_cluster_count: 1"/>
+                        <has_text text="fail_cluster_count: 17"/>
+                    </assert_contents>
+                </element>
+                <element name="clustering_tsv" ftype="tsv" file="clustering_tsv.tsv">
+                    <assert_contents>
+                        <has_n_lines n="20"/>
+                        <has_size value="1740" delta="100"/>
+                        <has_text text="1__flye_01.fasta__contig_1__8538_bp"/>
+                        <has_text text="2__flye_01.fasta__contig_2__9319_bp"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="qc_pass" type="list:list">
+                <element name="cluster_012">
+                    <element name="1_untrimmed" ftype="gfa1" file="qc_pass/cluster_012/1_untrimmed.gfa"/>
+                </element>
+            </output_collection>
+            <output_collection name="qc_fail" type="list:list">
+                <element name="cluster_001">
+                    <element name="1_untrimmed" ftype="gfa1" file="qc_fail/cluster_001/1_untrimmed.gfa"/>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+**AutoCycler Cluster Tool**
+
+The autocycler cluster command groups input contigs into clusters. A cluster is a group of contigs which represent the same genomic sequence. It also decides which of these clusters should be included in the final assembly (QC-pass) and which should not (QC-fail).
+
+**Inputs**
+
+- AutoCycler directory (must contain input_assemblies.gfa from compress step)
+
+**Parameters**
+
+- Clustering cutoff: Distance threshold for hierarchical clustering (0-1, default: 0.2)
+- Minimum assemblies: Exclude clusters with fewer than this many assemblies (auto or manual)
+- Maximum contigs: Refuse to run if mean contigs per assembly exceeds this value (default: 25)
+- Manual clustering: Optionally specify exact cluster nodes from the tree
+
+**Outputs**
+
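+- Clustering results: collection with the pairwise distance matrix (PHYLIP), the clustering tree (Newick) and the clustering summaries (YAML and TSV)
+- QC pass and QC fail: nested collections (one sub-collection per cluster) with the untrimmed GFA and YAML files of each cluster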
+
+**Notes**
+
+- The --max_contigs option exists to catch obviously bad input data. If the mean number of contigs per input assembly exceeds this value (default of 25), Autocycler cluster will refuse to run and display an error message. For example, if you give Autocycler 10 input assemblies with a total of 1000 contigs, that is an average of 100 contigs per assembly, which almost certainly means that they are fragmented or contaminated and thus not appropriate for Autocycler.
+
+.. class:: infomark
+
+**More Information**
+
+- `Autocycler cluster <https://github.com/rrwick/Autocycler/wiki/Autocycler-cluster>`_
+
+**Citations**
+    ]]></help>
+	    <expand macro="citations"/>
+</tool>