galaxyproject · rlibouba · Oct 16, 2023 · Oct 16, 2023
diff --git a/workflows/genome_annotation/repeat_element/.dockstore.yml b/workflows/genome_annotation/repeat_element/.dockstore.yml
@@ -0,0 +1,11 @@
+version: 1.2
+workflows:
+- name: main
+  subclass: Galaxy
+  publish: true
+  primaryDescriptorPath: /RepeatElement-Workflow.ga 
+  testParameterFiles:
+  - /RepeatElement-Workflow-tests.yml
+  authors:
+  - name: Romane Libouban
+    email: [email protected]
diff --git a/workflows/genome_annotation/repeat_element/CHANGELOG.md b/workflows/genome_annotation/repeat_element/CHANGELOG.md
@@ -0,0 +1,5 @@
+# Changelog
+
+## [0.1]
+
+Initial version of the repeat elements workflow for genomic sequencing data.
diff --git a/workflows/genome_annotation/repeat_element/README.md b/workflows/genome_annotation/repeat_element/README.md
@@ -0,0 +1,37 @@
+# Repeat elements workflow
+
+This workflow uses RepeatModeler, RepeatMasker and Red (REpeat Detector) to detect repeated elements.
+
+RepeatModeler is a software package for identifying and modeling de novo families of transposable elements (TEs). At the heart of RepeatModeler are three de novo repeat search programs (RECON, RepeatScout and LtrHarvest/Ltr_retriever) which use complementary computational methods to identify repeat element boundaries and family relationships from sequence data.
+
+RepeatMasker is a program that screens DNA sequences for *interspersed repeats* and *low-complexity* DNA sequences. The output of the program is a detailed annotation of the repeats present in the query sequence, as well as a modified version of the query sequence in which all annotated repeats have been masked.
+
+Red is an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale.
+
+## Input dataset for RepeatModeler
+- RepeatModeler requires a single input file, a genome in fasta format.
+
+## Output datasets for RepeatModeler
+- Two output files are generated:
+    - a fasta file containing the consensus sequences of the repeat families
+    - a Stockholm file containing the seed alignments
+
+## Input dataset for RepeatMasker
+- RepeatMasker requires the fasta file generated by RepeatModeler.
+
+## Output datasets for RepeatMasker
+- Five output files are generated:
+    - a fasta file with masked genome
+    - .gff3 file
+    - a table summarizing the repeated content of the sequence analyzed
+    - a file with statistics related to the repeated content of the sequence analyzed
+    - a summary of the mutation sites found and the order of grouping
+
+
+## Input dataset for Red
+- Red requires a single input file, a genome in fasta format.
+
+## Output datasets for Red
+- Two output files are generated:
+    - a fasta file with masked genome
+    - a bed file 
diff --git a/workflows/genome_annotation/repeat_element/RepeatElement-Workflow-tests.yml b/workflows/genome_annotation/repeat_element/RepeatElement-Workflow-tests.yml
@@ -0,0 +1,47 @@
+- doc: Test outline for RepeatMasking Workflow
+  job:
+    input:
+      class: File
+      location: https://zenodo.org/record/8364146/files/eco.fasta?download=1
+      filetype: fasta
+
+  outputs:
+    RepeatModeler consensus sequences:
+      location: https://zenodo.org/record/8364146/files/repeatmodeler_output_sequences.fasta?download=1
+      compare: sim_size
+      delta: 30000
+
+    RepeatModeler seeds alignments:
+      location: https://zenodo.org/record/8364146/files/repeatmodeler_output_seeds.stockholm?download=1
+      compare: sim_size
+      delta: 90000000
+
+    RepeatMasker masked genome:
+      location: https://zenodo.org/record/8364146/files/repeatmasker_output_masked_genome.fasta?download=1
+      compare: sim_size
+      delta: 30000
+    RepeatMasker output log:
+      location: https://zenodo.org/record/8364146/files/repeatmasker_output_log.tabular?download=1
+      compare: sim_size
+      delta: 30000
+    RepeatMasker repeat statistics:
+      location: https://zenodo.org/record/8364146/files/repeatmasker_output_table.txt?download=1
+      compare: sim_size
+      delta: 30000
+    RepeatMasker repeat catalog:
+      location: https://zenodo.org/record/8364146/files/repeatmasker_output_repeat_catalog.txt?download=1
+      compare: sim_size
+      delta: 30000
+    RepeatMasker repeat annotation:
+      location: https://zenodo.org/record/8364146/files/repeatmasker_output_gff.gff?download=1
+      compare: sim_size
+      delta: 30000
+
+    Red masked genome:
+      location: https://zenodo.org/records/10006487/files/red_masked_genome.fasta?download=1
+      compare: sim_size
+      delta: 90000000
+    Red bed:
+      location: https://zenodo.org/records/10006487/files/red_bed.bed?download=1
+      compare: sim_size
+      delta: 30000
diff --git a/workflows/genome_annotation/repeat_element/RepeatElement-Workflow.ga b/workflows/genome_annotation/repeat_element/RepeatElement-Workflow.ga
@@ -0,0 +1,235 @@
+{
+    "a_galaxy_workflow": "true",
+    "annotation": "This workflow takes as input a fasta sequence that will first be processed by RepeatModeler to identify and model families of transposable elements (TEs). The resulting fasta will be processed by RepeatMasker to analyze DNA sequences for interleaved repeats and so-called low-complexity sequences.\nIn a second step, the Red tool will be used to rapidly detect de novo repeats on a genomic scale.",
+    "creator": [
+        {
+            "class": "Person",
+            "email": "mailto:[email protected]",
+            "name": "Romane Libouban"
+        }
+    ],
+    "format-version": "0.1",
+    "license": "MIT",
+    "name": "RepeatElement",
+    "steps": {
+        "0": {
+            "annotation": "This workflow takes as input a fasta sequence which will first be processed by RepeatModeler to identify and model families of transposable elements (TEs). The resulting fasta will be processed by RepeatMasker to analyze DNA sequences for interleaved repeats and so-called low-complexity sequences. \nIn a second step, the Red tool will be used to rapidly detect de novo repeats on a genomic scale.",
+            "content_id": null,
+            "errors": null,
+            "id": 0,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "This workflow takes as input a fasta sequence which will first be processed by RepeatModeler to identify and model families of transposable elements (TEs). The resulting fasta will be processed by RepeatMasker to analyze DNA sequences for interleaved repeats and so-called low-complexity sequences. \nIn a second step, the Red tool will be used to rapidly detect de novo repeats on a genomic scale.",
+                    "name": "input"
+                }
+            ],
+            "label": "input",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "left": 0,
+                "top": 0
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"fasta.gz\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "78d4e14f-0e1f-406c-9c52-2322f4c07e29",
+            "when": null,
+            "workflow_outputs": []
+        },
+        "1": {
+            "annotation": "RepeatModeler is a software package for identifying and modeling de novo families of transposable elements (TEs).",
+            "content_id": "toolshed.g2.bx.psu.edu/repos/csbl/repeatmodeler/repeatmodeler/2.0.4+galaxy1",
+            "errors": null,
+            "id": 1,
+            "input_connections": {
+                "input_file": {
+                    "id": 0,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [],
+            "label": "RepeatModeler",
+            "name": "RepeatModeler",
+            "outputs": [
+                {
+                    "name": "sequences",
+                    "type": "fasta"
+                },
+                {
+                    "name": "seeds",
+                    "type": "stockholm"
+                }
+            ],
+            "position": {
+                "left": 314,
+                "top": 101
+            },
+            "post_job_actions": {},
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/csbl/repeatmodeler/repeatmodeler/2.0.4+galaxy1",
+            "tool_shed_repository": {
+                "changeset_revision": "8661b2607b7e",
+                "name": "repeatmodeler",
+                "owner": "csbl",
+                "tool_shed": "toolshed.g2.bx.psu.edu"
+            },
+            "tool_state": "{\"__input_ext\": \"input\", \"chromInfo\": \"/shared/ifbstor1/galaxy/mutable-config/tool-data/shared/ucsc/chrom/?.len\", \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": "2.0.4+galaxy1",
+            "type": "tool",
+            "uuid": "a2e87b22-00e1-4482-a35b-dcf1b0c751f5",
+            "when": null,
+            "workflow_outputs": [
+                {
+                    "label": "RepeatModeler consensus sequences",
+                    "output_name": "sequences",
+                    "uuid": "b5d89c66-f0a3-423f-94d6-a175c95b1f84"
+                },
+                {
+                    "label": "RepeatModeler seeds alignments",
+                    "output_name": "seeds",
+                    "uuid": "0e45ee1b-54c6-4b00-9960-3b14a2e14a21"
+                }
+            ]
+        },
+        "2": {
+            "annotation": "De novo detection of repeats on a genomic scale",
+            "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/red/red/2018.09.10+galaxy1",
+            "errors": null,
+            "id": 2,
+            "input_connections": {
+                "input": {
+                    "id": 0,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [],
+            "label": "Red",
+            "name": "Red",
+            "outputs": [
+                {
+                    "name": "masked",
+                    "type": "fasta"
+                },
+                {
+                    "name": "bed",
+                    "type": "bed"
+                }
+            ],
+            "position": {
+                "left": 137.99005087408705,
+                "top": 500.9208121357971
+            },
+            "post_job_actions": {},
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/red/red/2018.09.10+galaxy1",
+            "tool_shed_repository": {
+                "changeset_revision": "18a46265455b",
+                "name": "red",
+                "owner": "iuc",
+                "tool_shed": "toolshed.g2.bx.psu.edu"
+            },
+            "tool_state": "{\"input\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": "2018.09.10+galaxy1",
+            "type": "tool",
+            "uuid": "9a2d46b4-fe3d-4719-a301-d6139296a7b1",
+            "when": null,
+            "workflow_outputs": [
+                {
+                    "label": "Red masked genome",
+                    "output_name": "masked",
+                    "uuid": "061dc513-b5c1-4d9c-b1a0-5638de23b33c"
+                },
+                {
+                    "label": "Red bed",
+                    "output_name": "bed",
+                    "uuid": "12d17ab9-5fb0-4d42-bd3e-accd493b01f2"
+                }
+            ]
+        },
+        "3": {
+            "annotation": "RepeatMasker is a program that analyzes DNA sequences for interleaved repeats and *low-complexity* DNA sequences.",
+            "content_id": "toolshed.g2.bx.psu.edu/repos/bgruening/repeat_masker/repeatmasker_wrapper/4.1.5+galaxy0",
+            "errors": null,
+            "id": 3,
+            "input_connections": {
+                "input_fasta": {
+                    "id": 1,
+                    "output_name": "sequences"
+                }
+            },
+            "inputs": [],
+            "label": "RepeatMasker ",
+            "name": "RepeatMasker",
+            "outputs": [
+                {
+                    "name": "output_masked_genome",
+                    "type": "fasta"
+                },
+                {
+                    "name": "output_log",
+                    "type": "tabular"
+                },
+                {
+                    "name": "output_table",
+                    "type": "txt"
+                },
+                {
+                    "name": "output_repeat_catalog",
+                    "type": "txt"
+                },
+                {
+                    "name": "output_gff",
+                    "type": "gff"
+                }
+            ],
+            "position": {
+                "left": 630,
+                "top": 117
+            },
+            "post_job_actions": {},
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/bgruening/repeat_masker/repeatmasker_wrapper/4.1.5+galaxy0",
+            "tool_shed_repository": {
+                "changeset_revision": "ba6d2c32f797",
+                "name": "repeat_masker",
+                "owner": "bgruening",
+                "tool_shed": "toolshed.g2.bx.psu.edu"
+            },
+            "tool_state": "{\"advanced\": {\"is_only\": false, \"is_clip\": false, \"no_is\": false, \"rodspec\": false, \"primspec\": false, \"nolow\": false, \"noint\": false, \"norna\": false, \"alu\": false, \"div\": false, \"search_speed\": \"\", \"frag\": \"40000\", \"gc\": null, \"gccalc\": false, \"nocut\": false, \"xout\": false, \"keep_alignments\": false, \"invert_alignments\": false, \"poly\": false}, \"excln\": true, \"gff\": true, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}, \"repeat_source\": {\"source_type\": \"dfam\", \"__current_case__\": 0, \"species_source\": {\"species_from_list\": \"no\", \"__current_case__\": 1, \"species_name\": \"\"}}, \"xsmall\": false, \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": "4.1.5+galaxy0",
+            "type": "tool",
+            "uuid": "14e86987-b10d-4f95-b96c-a5d2f5757a30",
+            "when": null,
+            "workflow_outputs": [
+                {
+                    "label": "RepeatMasker masked genome",
+                    "output_name": "output_masked_genome",
+                    "uuid": "695aac03-85cd-4a88-86e2-6c723a7df60c"
+                },
+                {
+                    "label": "RepeatMasker repeat catalog",
+                    "output_name": "output_repeat_catalog",
+                    "uuid": "b2661274-9f04-4510-bc7d-cdb9307cb562"
+                },
+                {
+                    "label": "RepeatMasker repeat annotation",
+                    "output_name": "output_gff",
+                    "uuid": "b77b3b6a-236f-4ca8-b007-c2be20d0be34"
+                },
+                {
+                    "label": "RepeatMasker repeat statistics",
+                    "output_name": "output_table",
+                    "uuid": "9786846d-2105-4e6b-b2b3-b639d025a550"
+                },
+                {
+                    "label": "RepeatMasker output log",
+                    "output_name": "output_log",
+                    "uuid": "657a9247-2204-4a8e-b322-512858bfe68c"
+                }
+            ]
+        }
+    },
+    "tags": [],
+    "uuid": "075a5667-8ad4-4a7c-872e-7f13b0e9e024",
+    "version": 6
+}