galaxyproject · mthang · Aug 27, 2025 · Sep 22, 2025 · Sep 23, 2025 · Sep 26, 2025
diff --git a/tools/autocycler/.shed.yml b/tools/autocycler/.shed.yml
@@ -0,0 +1,12 @@
+name: autocycler
+owner: iuc
+description: "Autocycler is a tool for generating consensus long-read assemblies for bacterial genomes."
+long_description: "Autocycler is a tool for generating consensus long-read assemblies for bacterial genomes. It is the successor to Trycycler."
+categories:
+  - Assembly
+homepage_url: https://github.com/rrwick/Autocycler
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/autocycler
+type: unrestricted
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }} from the autocycler suite"
diff --git a/tools/autocycler/autocycler_clean.xml b/tools/autocycler/autocycler_clean.xml
@@ -0,0 +1,98 @@
+<tool id="autocycler_clean" name="AutoCycler Clean" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+    <description>and modify assembly graph by removing or duplicating segments</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Expose the input under a sanitised file name that ends in .gfa
+        #import re
+        #set $identifier = re.sub('[^\w\-_\.]', '_', $in_gfa.element_identifier)
+        ln -s '$in_gfa' '${identifier}.gfa' &&
+
+        autocycler clean
+        --in_gfa '${identifier}.gfa'
+        --out_gfa '$cleaned_gfa'
+        #if $remove:
+        --remove '$remove'
+        #end if
+        #if $duplicate:
+        --duplicate '$duplicate'
+        #end if
+    ]]></command>
+    <inputs>
+        <param argument="--in_gfa" type="data" format="gfa1" label="Input GFA file"
+            help="Assembly graph to clean/modify"/>
+        <!-- Both segment lists are optional; the validators accept digits separated by single commas, no spaces -->
+        <param argument="--remove" type="text" optional="true" label="Segments to remove"
+            help="Comma-separated list of tig numbers to remove (e.g., '1,5,8')">
+            <validator type="regex" message="Enter a comma-separated list of positive integers (e.g., 1,4)">^\d+(,\d+)*$</validator>
+        </param>
+        <param argument="--duplicate" type="text" optional="true" label="Segments to duplicate"
+            help="Comma-separated list of tig numbers to duplicate (e.g., '2,3,7')">
+            <validator type="regex" message="Enter a comma-separated list of positive integers (e.g., 1,4)">^\d+(,\d+)*$</validator>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="cleaned_gfa" format="gfa1"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="in_gfa" value="consensus_assembly_unfiltered.gfa"/>
+            <param name="remove" value="743,782"/>
+            <output name="cleaned_gfa" file="clean.gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_size value="46397"/>
+                    <!-- GFA columns are tab-separated, so match on any whitespace rather than on literal spaces -->
+                    <has_text_matching expression="S\s+1\s"/>
+                    <has_text_matching expression="S\s+50\s"/>
+                    <has_n_lines n="171"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**AutoCycler Clean Tool**
+
+Autocycler clean allows users to remove or duplicate sequences in a consensus assembly produced by Autocycler combine.
+
+**Inputs**
+
+- Input GFA file (assembly graph)
+- Optional lists of segments to remove or duplicate
+
+**Parameters**
+
+- Segments to remove: Comma-separated list of tig (contig) numbers to remove from the graph
+- Segments to duplicate: Comma-separated list of tig (contig) numbers to duplicate in the graph
+
+**Outputs**
+
+- Cleaned/modified GFA file
+
+**Notes**
+
+- Viewing consensus_assembly.gfa in Bandage can be useful for determining which tigs should be kept/deleted.
+- Depth values in consensus_assembly.gfa represent the number of input sequences that contributed to each sequence. Users will typically delete lower-depth tigs (contigs) to prioritise resolving higher-depth tigs (contigs).
+- The output GFA created by Autocycler clean is also a valid input. This means that you can clean in multiple stages, using the output of one round of cleaning as the input for the next.
+- On the command line, the values for -r and -d can contain spaces if they are enclosed in quotes (for example: -r "7, 8"), which allows for copy-pasting from Bandage's 'selected nodes' list. In this Galaxy tool the lists must be entered without spaces (for example: 7,8).
+- If -r and -d are not specified, no sequences will be removed, and the input graph will remain unchanged.
+- If any of the specified tigs in -r or -d do not exist in the input graph, Autocycler clean will return an error and terminate. Ensure tig (contig) IDs match those in the GFA.
+- The only tigs (contigs) which can be duplicated are those which contain exactly two links to other tigs. Each copy of the duplicated tig will keep one of the links.
+- If an invalid tig (contig) ID is specified in -d (e.g., a contig with more than two links), Autocycler will return an error and terminate. Verify tig (contig) IDs and their properties in Bandage before running the command.
+- Useful for manual curation of assembly graphs
+- Can remove problematic segments or amplify important ones
+- Both removal and duplication can be performed in one operation
+- Input tig (contig) numbers should match those in your GFA file
+
+.. class:: infomark
+
+**More Information**
+
+- `Autocycler clean <https://github.com/rrwick/Autocycler/wiki/Autocycler-clean>`_
+
+**Citations**
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/autocycler/autocycler_cluster.xml b/tools/autocycler/autocycler_cluster.xml
@@ -0,0 +1,236 @@
+<tool id="autocycler_cluster" name="AutoCycler Cluster" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+    <description>cluster assemblies based on compressed representations</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        #import re
+
+        mkdir assemblies_dir &&
+        ## link every collection element into assemblies_dir under a sanitised name ending in .gfa
+        #for $input_file in $autocycler_dir:
+            #set $identifier = re.sub('[^\w\-_\.]', '_', $input_file.element_identifier)
+            ln -s '$input_file' 'assemblies_dir/${identifier}.gfa' &&
+        #end for
+
+        autocycler cluster
+        --autocycler_dir assemblies_dir
+        --cutoff '$cutoff'
+        --min_assemblies '$min_assemblies'
+        --max_contigs '$max_contigs'
+        #if $manual :
+            --manual '$manual'
+        #end if
+        &&
+
+        ## qc_pass: move each cluster's 1_untrimmed.gfa/.yaml up one level, named after the cluster directory, then remove that directory
+        if [ -d "assemblies_dir/clustering/qc_pass" ]; then
+            for cluster in assemblies_dir/clustering/qc_pass/*; do
+                mv "\$cluster/1_untrimmed.gfa" "\$cluster.gfa" &&
+                mv "\$cluster/1_untrimmed.yaml" "\$cluster.yaml" &&
+                rmdir "\$cluster";
+            done;
+        fi
+
+        &&
+        ## qc_fail: same flattening; the directory test skips this step when no qc_fail directory exists
+        if [ -d "assemblies_dir/clustering/qc_fail" ]; then
+            for cluster in assemblies_dir/clustering/qc_fail/*; do
+                mv "\$cluster/1_untrimmed.gfa" "\$cluster.gfa" &&
+                mv "\$cluster/1_untrimmed.yaml" "\$cluster.yaml" &&
+                rmdir "\$cluster";
+            done;
+        fi
+
+        ]]></command>
+    <inputs>
+        <!-- Input collection; the command links each element as its sanitised identifier plus .gfa -->
+        <param argument="--autocycler_dir" type="data_collection" collection_type="list" format="gfa1" label="GFA Files"
+            help="Collection holding the input_assemblies.gfa file from the compress step. Each element is made available to Autocycler as its element identifier followed by .gfa"/>
+
+        <!-- Clustering cutoff -->
+        <param argument="--cutoff" type="float" min="0.1" max="1" value="0.2" label="Clustering cutoff"
+            help="Distance threshold for hierarchical clustering (0.1-1)"/>
+
+        <!-- Minimum assemblies -->
+        <param argument="--min_assemblies" type="integer" min="1" value="1" label="Minimum assemblies per cluster"/>
+
+        <!-- Max contigs -->
+        <param argument="--max_contigs" type="integer" min="1" value="25" label="Maximum contigs per assembly"
+            help="Refuse to run if mean contigs per assembly exceeds this value"/>
+
+        <!-- Manual clustering; the validator also accepts an empty value, in which case the command omits the flag -->
+        <param argument="--manual" type="text" value="" label="Manual clusters" help="Specify which clades of the tree should be used to define clusters, as a comma-separated list of integers (e.g., 1,4). Leave empty to run without --manual">
+            <validator type="regex" message="Enter a comma-separated list of positive integers (e.g., 1,4)">^(|\d+(,\d+)*)$</validator>
+        </param>
+
+        <param name="outputs" type="select" optional="true" multiple="true" label="Outputs">
+            <option value="distances">Distances (Table)</option>
+            <option value="clustering_newick">Clustering (Tree)</option>
+            <option value="clustering_tsv">Clustering (Table)</option>
+            <option value="qc_fail_gfa">Failed clusters (GFA)</option>
+            <option value="qc_fail_yaml">Failed clusters (yaml)</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="clustering_newick" format="newick" from_work_dir="assemblies_dir/clustering/clustering.newick" label="${tool.name} on ${on_string}: Clustering tree">
+            <filter>outputs and "clustering_newick" in outputs</filter>
+        </data>
+        <data name="clustering_tsv" format="tabular" from_work_dir="assemblies_dir/clustering/clustering.tsv" label="${tool.name} on ${on_string}: Clustering table">
+            <filter>outputs and "clustering_tsv" in outputs</filter>
+        </data>
+        <data name="clustering_yaml" format="yaml" from_work_dir="assemblies_dir/clustering/clustering.yaml" label="${tool.name} on ${on_string}: Clustering yaml"/>
+        <data name="distances" format="tabular" from_work_dir="assemblies_dir/clustering/pairwise_distances.phylip" label="${tool.name} on ${on_string}: Pairwise distances">
+            <filter>outputs and "distances" in outputs</filter>
+        </data>
+
+        <!-- Per-cluster files: the command moves each cluster's 1_untrimmed.gfa/.yaml directly into qc_pass/ and qc_fail/, where they are discovered by extension -->
+        <collection name="qc_pass" type="list" label="${tool.name} on ${on_string}: clustering QC pass">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.gfa$" format="gfa1" directory="assemblies_dir/clustering/qc_pass/" recurse="false"/>
+        </collection>
+        <collection name="qc_fail" type="list" label="${tool.name} on ${on_string}: clustering QC fail">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.gfa$" format="gfa1" directory="assemblies_dir/clustering/qc_fail/" recurse="false"/>
+            <filter>outputs and "qc_fail_gfa" in outputs</filter>
+        </collection>
+        <collection name="qc_pass_yaml" type="list" label="${tool.name} on ${on_string}: clustering QC pass (yaml)">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.yaml$" format="yaml" directory="assemblies_dir/clustering/qc_pass/" recurse="false"/>
+        </collection>
+        <collection name="qc_fail_yaml" type="list" label="${tool.name} on ${on_string}: clustering QC fail (yaml)">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.yaml$" format="yaml" directory="assemblies_dir/clustering/qc_fail/" recurse="false"/>
+            <filter>outputs and "qc_fail_yaml" in outputs</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="6">
+            <param name="autocycler_dir">
+                <collection type="list">
+                    <element name="input_assemblies" value="input_assemblies.gfa" ftype="gfa1"/>
+                </collection>
+            </param>
+            <param name="cutoff" value="0.2"/>
+            <param name="max_contigs" value="25"/>
+            <param name="manual" value="1,3"/>
+            <param name="outputs" value="clustering_newick,clustering_tsv,distances"/>
+            <output name="clustering_newick">
+                <assert_contents>
+                    <has_size value="1519" delta="200"/>
+                    <has_text text=");"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="clustering_tsv">
+                <assert_contents>
+                    <has_size value="1739" delta="100"/>
+                    <has_n_lines n="20"/>
+                    <has_text text="1__flye_01.fasta__contig_1__8538_bp"/>
+                    <has_text text="2__flye_01.fasta__contig_2__9319_bp"/>
+                </assert_contents>
+            </output>
+            <output name="clustering_yaml">
+                <assert_contents>
+                    <has_size value="260" delta="100"/>
+                    <has_n_lines n="9"/>
+                    <has_text text="pass_cluster_count: 2"/>
+                    <has_text text="fail_cluster_count: 17"/>
+                </assert_contents>
+            </output>
+            <output name="distances">
+                <assert_contents>
+                    <has_size value="4609" delta="100"/>
+                    <has_n_lines n="20"/>
+                    <has_text text="flye_01.fasta contig_1"/>
+                    <has_text text="flye_04.fasta contig_6"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="8">
+            <param name="autocycler_dir">
+                <collection type="list">
+                    <element name="input_assemblies" value="input_assemblies.gfa" ftype="gfa1"/>
+                </collection>
+            </param>
+            <param name="cutoff" value="0.2"/>
+            <param name="max_contigs" value="25"/>
+            <param name="manual" value="2,4"/>
+            <param name="outputs" value="clustering_newick,clustering_tsv,distances,qc_fail_gfa,qc_fail_yaml"/>
+            <output name="clustering_newick">
+                <assert_contents>
+                    <has_size value="1516" delta="200"/>
+                    <has_text text=");"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="clustering_tsv">
+                <assert_contents>
+                    <has_size value="1739" delta="100"/>
+                    <has_n_lines n="20"/>
+                    <has_text text="1__flye_01.fasta__contig_1__8538_bp"/>
+                    <has_text text="2__flye_01.fasta__contig_2__9319_bp"/>
+                </assert_contents>
+            </output>
+            <output name="clustering_yaml">
+                <assert_contents>
+                    <has_size value="288" delta="100"/>
+                    <has_n_lines n="9"/>
+                    <has_text text="pass_cluster_count: 2"/>
+                    <has_text text="fail_cluster_count: 16"/>
+                </assert_contents>
+            </output>
+            <output name="distances">
+                <assert_contents>
+                    <has_size value="4609" delta="100"/>
+                    <has_n_lines n="20"/>
+                    <has_text text="flye_01.fasta contig_1"/>
+                    <has_text text="flye_04.fasta contig_6"/>
+                </assert_contents>
+            </output>
+            <output_collection name="qc_pass" type="list" count="2">
+                <element name="cluster_003" file="qc_pass/cluster_003/1_untrimmed.gfa" ftype="gfa1"/>
+                <element name="cluster_011" file="qc_pass/cluster_011/1_untrimmed.gfa" ftype="gfa1"/>
+            </output_collection>
+            <output_collection name="qc_fail" type="list" count="16">
+                <element name="cluster_002" file="qc_fail/cluster_002/1_untrimmed.gfa" ftype="gfa1"/>
+            </output_collection>
+            <output_collection name="qc_pass_yaml" type="list" count="2">
+                <element name="cluster_003" file="qc_pass/cluster_003/1_untrimmed.yaml" ftype="yaml"/>
+                <element name="cluster_011" file="qc_pass/cluster_011/1_untrimmed.yaml" ftype="yaml"/>
+            </output_collection>
+            <output_collection name="qc_fail_yaml" type="list" count="16"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**AutoCycler Cluster Tool**
+
+The autocycler cluster command groups input contigs into clusters. A cluster is a group of contigs which represent the same genomic sequence. It also decides which of these clusters should be included in the final assembly (QC-pass) and which should not (QC-fail).
+
+**Inputs**
+
+- GFA collection (must contain input_assemblies.gfa from the compress step)
+
+**Parameters**
+
+- Clustering cutoff: Distance threshold for hierarchical clustering (0-1, default: 0.2)
+- Minimum assemblies: Exclude clusters with fewer than this many assemblies (auto or manual)
+- Maximum contigs: Refuse to run if mean contigs per assembly exceeds this value (default: 25)
+- Manual clustering: Optionally specify exact cluster nodes from the tree
+
+**Outputs**
+
+- Collection of cluster files (one per cluster)
+- Summary file with clustering statistics
+
+**Notes**
+
+- The --max_contigs option exists to catch obviously bad input data. If the mean number of contigs per input assemblies exceeds this value (default of 25), Autocycler cluster will refuse to run and display an error message. For example, if you give Autocycler 10 input assemblies with a total of 1000 contigs, that is an average of 100 contigs per assembly, which almost certainly means that they are fragmented or contaminated and thus not appropriate for Autocycler.
+
+.. class:: infomark
+
+**More Information**
+
+- `Autocycler cluster <https://github.com/rrwick/Autocycler/wiki/Autocycler-cluster>`_
+
+**Citations**
+    ]]></help>
+    <expand macro="citations"/>
+</tool>