sanger-tol · DLBPointon · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,23 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.4.8] - Ancient Hippaforalkus (H8) - [2025-05-XX]
+
+Our 15th release for sanger-tol/treeval.
+
+### Enhancements & Fixes
+- Update to documentation.
+  - Correct `RAPID-TOL` to `RAPID_TOL`
+- Better defaults for various parameters if not needed by the user.
+- Remove the `kmer_profile` arguments from the yaml. This wasn't in use by the pipeline.
+- Replace the above with the param `kmer_length`.
+- Added an example `params-file` [here](./assets/local_testing/params.json).
+
+### Parameters
+| Old Parameter | New Parameter |
+| ------------- | ------------- |
+| -             | --kmer_length |
-| -             | --kmer_length |
+|               | --kmer_length |
-| -             | --kmer_length |
+|               | --kmer_length |
+
 ## [1.4.7] - Ancient Hippaforalkus (H7) - [2025-04-16]
 
 Our 14th release for sanger-tol/treeval.
@@ -28,8 +45,8 @@ Our 13th release for sanger-tol/treeval.
 | Module                                 | Old Version            | New Versions                       |
 | -------------------------------------- | ---------------------- | ---------------------------------- |
 | GET_LARGEST_SCAFFOLD (coreutils)       | 9.1                    | REMOVED                            |
-| busco/busco                            | 5.7.1                  | 6.0.0                              |   
-| bwamem2/index (bwa-mem2)               | 2.2.1                  | 2.3+htslib=1.22.1+samtools=1.22.1  |   
+| busco/busco                            | 5.7.1                  | 6.0.0                              |
+| bwamem2/index (bwa-mem2)               | 2.2.1                  | 2.3+htslib=1.22.1+samtools=1.22.1  |
 | cat/cat (pigz)                         | 2.3.4                  | 2.8                                |
 | cooler/cload (cooler)                  | 0.9.2                  | 0.10.4                             |
 | cooler/zoomify (cooler)                | 0.9.2                  | 0.10.4+numpy=1.26.4                |

diff --git a/assets/TreeValTinyFullSangerTest.yaml b/assets/TreeValTinyFullSangerTest.yaml
@@ -17,10 +17,6 @@ hic_data:
     - /nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-1000.cram
     - /nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-2000.cram
   hic_aligner: bwamem2
-kmer_profile:
-  # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
-  kmer_length: 31
-  profile: /nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData
 alignment:
   genesets:
     - /nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/gene_alignment_data/fungi/csv_data/LaetiporusSulphureus.gfLaeSulp1-data.csv

diff --git a/assets/TreeValTinyFullTest.yaml b/assets/TreeValTinyFullTest.yaml
@@ -17,10 +17,6 @@ hic_data:
     - https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-1000.cram
     - https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-2000.cram
   hic_aligner: bwamem2
-kmer_profile:
-  # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
-  kmer_length: 31
-  profile: https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData/
 alignment:
   genesets:
     - https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData/gene_alignment_data/fungi/csv_data/LaetiporusSulphureus.gfLaeSulp1-data.csv

diff --git a/assets/local_testing/grTriPseu1.yaml b/assets/local_testing/grTriPseu1.yaml
@@ -16,17 +16,9 @@ hic_data:
   hic_cram:
     - /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/assets/local_testing/grTriPseu1_hic.fofn
   hic_aligner: minimap2
-kmer_profile:
-  # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
-  kmer_length: 31
-  profile: /lustre/scratch122/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/
 alignment:
   genesets:
     - /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/gene_alignment_data/fungi/csv_data/LaetiporusSulphureus.gfLaeSulp1-data.csv
-self_comp:
-  motif_len: 0
-intron:
-  size: "50k"
 telomere:
   teloseq: TTAGGG
 synteny:

diff --git a/assets/local_testing/nxOscDF5033-BGA.yaml b/assets/local_testing/nxOscDF5033-BGA.yaml
@@ -13,11 +13,6 @@ assem_reads:
 alignment:
   genesets:
     - /lustre/scratch123/tol/resources/treeval/gene_alignment_data/nematode/csv_data/OscheiusTipulae.ASM1342590v1-data.csv
-self_comp:
-  motif_len: 0
-  mummer_chunk: 10
-intron:
-  size: "50k"
 telomere:
   teloseq: TTAGGG
 synteny:

diff --git a/assets/local_testing/nxOscSUBSET.yaml b/assets/local_testing/nxOscSUBSET.yaml
@@ -16,15 +16,9 @@ hic_data:
   hic_cram:
     - /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/genomic_data/hic-arima2/SUBSET-1000.cram
     - /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/genomic_data/hic-arima2/SUBSET-2000.cram
-kmer_profile:
-  # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
-  kmer_length: 31
-  profile: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/genomic_data/nxOscSpes1/pacbio/
 alignment:
   genesets:
     - /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/gene_set/nematode/csv_data/CaenorhabditisElegans.WBcel235-data.csv
-self_comp:
-  motif_len: 0
 intron:
   size: "50k"
 telomere:

diff --git a/assets/local_testing/params.json b/assets/local_testing/params.json
@@ -0,0 +1,7 @@
+{
+    "input": "./assets/local_testing/grTriPseu1.yaml",
+    "outdir": "TESTING",
+    "kmer_length": 31,
+    "mode": "RAPID_TOL",
+    "steps": "telo_finder"
+}
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -169,16 +169,7 @@
         }
     },
     "required": [
-        "busco",
-        "synteny",
-        "telomere",
-        "intron",
-        "self_comp",
-        "alignment",
-        "kmer_profile",
-        "hic_data",
         "assem_reads",
-        "map_order",
         "reference_file",
         "assembly"
     ]

diff --git a/assets/treeval_v2_input_proposal/params.json b/assets/treeval_v2_input_proposal/params.json
@@ -0,0 +1,13 @@
+{
+    "assembly_name": "grTriPseu1",
+    "assembly_version": "1",
+    "scientific_name": "",
+    "defined_class": "fungi",
+    "tracking_id": "RC-1010",
+    "outdir": "TESTING",
+    "mode": "RAPID_TOL",
+    "steps": "telo_finder",
+    "telomere_motif": "TTAGGG",
+    "busco_lineages": "fungi_odb10",
+    "samplesheet": "../treeval_v2_input_proposal/samplesheet.csv"
+}
diff --git a/assets/treeval_v2_input_proposal/production.config b/assets/treeval_v2_input_proposal/production.config
@@ -0,0 +1,30 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for production runs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines the static parameter settings used for DTOL production runs.
+
+    Use as follows:
+        nextflow run sanger-tol/treeval -profile test,singularity
+
+    On LSF / tol farm:
+        bsub -Is -tty -e error -o out -n 2 -q oversubscribed -M4000 -R'select[mem>4000] rusage[mem=4000] span[hosts=1]' 'nextflow run main.nf -profile test,singularity,sanger'
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = "Production profile"
+    config_profile_description = "Static settings for DTOL production"
+
+    // Not normally in use
+    assembly_level  = "scaffold"
+    project_id      = "DTOL"
+
+    // Used params
+    kmer_length     = 31
+    map_order       = "length"
+    split_telomere  = true
+    run_hires       = false
+    run_ultra       = false
+}
diff --git a/assets/treeval_v2_input_proposal/samplesheet.csv b/assets/treeval_v2_input_proposal/samplesheet.csv
@@ -0,0 +1,7 @@
+assembly,datatype,readtype,readfile
+grTriPseu1,ASSEMBLY,PRIMARY,/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa
+grTriPseu1,LONGREAD,PACBIO,/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/genomic_data/pacbio/seqkitPacbio50000.fasta.gz
+grTriPseu1,HIC,ARIMA2,/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-1000.cram
+grTriPseu1,HIC,ARIMA2,/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-2000.cram
+grTriPseu1,ALIGNMENTS,GENESET,/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/gene_alignment_data/fungi/csv_data/LaetiporusSulphureus.gfLaeSulp1-data.csv
+grTriPseu1,ALIGNMENTS,SYNTENY,/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/synteny/fungi/LaetiporusSulphureus.fasta
diff --git a/conf/modules.config b/conf/modules.config
@@ -510,7 +510,7 @@ process {
     }
 
     withName: ".*:KMER:FASTK_FASTK" {
-        ext.args        = "-k31 -t -P."
+        ext.args        = { "-k${params.kmer_length} -t -P." }
     }
 
     withName: MERQURYFK_MERQURYFK {

diff --git a/docs/output.md b/docs/output.md
@@ -33,8 +33,8 @@ This workflow generates a .genome file which describes the base pair length of e
 <details markdown="1">
 <summary>Output files</summary>
 
-- `treeval_upload/`
-  - `my.genome`: Genome description file of the reference genome.
+- `hic_files/`
+  - `{sample}.sizes`: Description file of the reference genome.
 
 </details>
 
@@ -47,13 +47,13 @@ Read Coverage uses genome sequence reads (HiFi, CLR, ONT or Illumina) reads to g
 <details markdown="1">
 <summary>Output files</summary>
 
-- `treeval_upload/`
-  - `coverage.bw`: Coverage of aligned reads across the reference genome in bigwig format.
-  - `coverage_log.bw`: A log corrected coverage file which aims to smooth out the above track.
+- `hic_files/`
+  - `{sample}_coverage_normal.bigWig`: Coverage of aligned reads across the reference genome in bigwig format.
+
 - `treeval_upload/punchlists/`
-  - `maxdepth.bigbed`: Max read depth punchlist in bigBed format.
-  - `zerodepth.bigbed`: Zero read depth punchlist in bigBed format.
-  - `halfcoverage.bigbed`: Half read depth punchlist in bigBed format.
+  - `maxdepth.bed`: Max read depth punchlist in bed format.
+  - `zerodepth.bed`: Zero read depth punchlist in bed format.
+  - `halfcoverage.bed`: Half read depth punchlist in bed format.
 
 </details>
 
@@ -67,10 +67,10 @@ The gap-finder subworkflow generates a bed file containing the genomic locations
 <summary>Output files</summary>
 
 - `treeval_upload/`
-  - `*.bed.gz`: A bgzipped file containing gap locations
-  - `*.bed.gz.tbi`: A tabix index file for the above file.
+  - `gap_{sample}.bed.gz`: A bgzipped file containing gap locations
+  - `*.bed.gz.csi`: An index file for the above file in Coordinate-Sorted Index format.
 - `hic_files/`
-  - `*.bed`: The raw bed file needed for ingestion into Pretext
+  - `{sample}_gap.bed`: The raw bed file needed for ingestion into Pretext
 
 </details>
 
@@ -84,7 +84,7 @@ This uses [WindowMasker](https://github.com/goeckslab/WindowMasker) to mark pote
 <summary>Output files</summary>
 
 - `hic_files/`
-  - `*_repeat_density.bw`: Intersected read windows aligned to the reference genome in bigwig format.
+  - `{sample}_repeat_density.bigWig`: Intersected read windows aligned to the reference genome in bigwig format.
 
 </details>
 
@@ -102,6 +102,7 @@ The hic-mapping subworkflow takes a set of HiC read files in .cram format as inp
   - `*_pretext_hr.pretext`: High resolution pretext map.
   - `*_pretext_normal.pretext`: Standard resolution pretext map.
   - `*.mcool`: HiC map required for HiGlass
+  - `*_normalFullMap.png`: A pretext snapshot of the normal resolution pretext ONLY. Making snapshots of the other pretexts becomes very computationally expensive for no real gain in quality.
 
 </details>
 
@@ -117,10 +118,12 @@ Optionally, you can now also use the `--split_telomere` flag to generate split t
 <summary>Output files</summary>
 
 - `treeval_upload/`
-  - `*.bed.gz`: A bgzipped file containing telomere sequence locations
-  - `*.bed.gz.tbi`: A tabix index file for the above file.
+  - `telo_*.bed.gz`: A bgzipped file containing telomere sequence locations
+  - `telo_*.bed.gz.csi`: A csi index file for the above file.
 - `hic_files/`
-  - `*.bed`: The raw .bed file needed for ingestion into Pretext
+  - `*_telomere.bed`: The raw .bed file needed for ingestion into Pretext
+  - `*_5P_telomere.bed`: Containing the forward strand associated telomere sites.
+  - `*_3P_telomere.bed`: Containing the reverse strand associated telomere sites.
 
 </details>
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -51,6 +51,8 @@ nextflow run main.nf -profile test_github,singularity
 <details markdown="1">
   <summary>Details</summary>
 
+These data are not required for RAPID or RAPID_TOL modes.
+
 #### Step 1 -- Preparing Synteny data
 
 For synteny you should provide the full genomic fasta file, of any high quality genome you want to be compared against.
@@ -200,7 +202,7 @@ This is all useful for the pipeline which generates job ids based on the org col
 
 Now let's use what we know to fill out the yaml.
 
-The yaml is a file that we need in order to tell the pipeline where everything is, an example can be found [here](https://raw.githubusercontent.com/sanger-tol/treeval/dev/assets/local_testing/nxOscDF5033.yaml).
+The yaml is a file that we need in order to tell the pipeline where everything is, an example can be found [here](https://raw.githubusercontent.com/sanger-tol/treeval/main/assets/local_testing/nxOscSUBSET.yaml).
 
 ```yaml
 alignment:
@@ -293,7 +295,7 @@ samtools bam2fq {prefix}.bam | seqtk seq -a - | gzip - > {prefix}.fasta.gz
 
 Note: This will require you to install bigwigToBedGraph from the ucsc package. Instructions on downloading this can be found at [EXAMPLE #3](https://genome.ucsc.edu/goldenPath/help/bigWig.html#:~:text=Alternatively%2C%20bigWig%20files%20can%20be,to%20the%20Genome%20Browser%20server.)
 
-The PreText files generated by the pipeline _are_ automatically ingested into the pretext files. However, you may want to ingest from other versions of pretextgraph or have your own pre-generated file you want to ingest. For this you must use the following code:
+The pretext accessory files generated by the pipeline _are_ automatically ingested into the pretext files. However, you may want to ingest from other versions of pretextgraph or have your own pre-generated file you want to ingest. For this you must use the following code:
 
 ```
 cd {outdir}/hic_files
@@ -307,6 +309,8 @@ cat {telomere.bedgraph} | awk -v OFS="\t" '{$4 = 1000; print}'|PretextGraph -i {
 cat {gap.bedgraph} | awk -v OFS="\t" '{$4= 1000; print}'| PretextGraph -i { your.pretext } -n "gap"
 ```
 
+BigWigs must first be converted to bedGraph; the resulting 4-column bedGraph files are then streamed into PretextGraph.
+
 </details>
 
 ## Full samplesheet
@@ -315,39 +319,36 @@ YAML is "Yet Another Markdown Language", it is a human-readable format that we u
 
 ### YAML contents
 
-The following is an example YAML file we have used during production: [nxOscDF5033.yaml](../assets/local_testing/nxOscDF5033.yaml) and is shown below. This contains some annotations we believe to be helpful, information on the alignment, synteny, longread and hic data.
+The following is an example YAML file we have used during production: [nxOscSUBSET.yaml](https://raw.githubusercontent.com/sanger-tol/treeval/main/assets/local_testing/nxOscSUBSET.yaml) and is shown below. This contains some annotations we believe to be helpful, information on the alignment, synteny, longread and hic data.
+
+Please be aware that the YAML is shaped by how processes have evolved in Tree of Life, Sanger and will adapt primarily as those processes change.
 
 - `assembly`
-  - `assem_level`: scaffold or contig level assembly (not used).
-  - `assem_version`: Used to complete sample_id.
-  - `sample_id`: ToLID of the sample.
-  - `latin_name`: Latin identification of species
-  - `defined_class`: Clade name (as used to group synteny sequences and to complete alignment/data_dir).
-  - `project_id`: Project id for the ticket (not used)
+  - `assem_level` < OPTIONAL, use "" in this case >: scaffold or contig level assembly (not used).
+  - `assem_version` < OPTIONAL, used to name output >: Used to complete sample_id.
+  - `sample_id` < REQUIRED, used to name output >: ToLID of the sample.
+  - `latin_name` < OPTIONAL, use "" in this case >: Latin identification of species
+  - `defined_class` < REQUIRED for synteny and gene_alignment runs >: Clade name (as used to group synteny sequences and to complete alignment/data_dir).
+  - `project_id` < OPTIONAL, use "" in this case >: Project id for the ticket (not used)
 - `reference_file`: Sample .fa file.
-- `assem_reads`
+- `assem_reads` < REQUIRED >:
   - `read_type`: { hifi | clr | ont | illumina } To be used in future update.
   - `read_data`:
     - List of paths (ending with `/`) to folder containing fasta.gz files.
   - `supplementary_data`: Will be required in future development.
-- `hic_data`:
+- `hic_data` < REQUIRED >:
   - `hic_cram`: path (ending with `/`) to folder containing cram files.
   - `hic_aligner`: choice between `bwamem2` and `minimap2`
-- `alignment`
+- `alignment` < Only for `FULL` or `JBROWSE` runs >:
   - `genesets`:
     - List of Gene alignment data .csv file paths.
-- `kmer_profile`:
-  - `kmer_length`: length of kmer to be used in plotting, normally 31
-  - `dir`: directory containing old plot to be regenerated if applicable
-- `self_comp`
-  - `motif_len`: Length of motif to be used in self complementary sequence finding
-- `synteny`
+- `synteny` < Only for `FULL`, `JBROWSE` or `FULL_COMBINED` runs >:
   - List of paths to syntenic genomes grouped by clade.
-- `intron:`
+- `intron` < Only for `FULL` runs >:
   - `size`: base pair size of introns default is 50k
-- `telomere`:
+- `telomere` < OPTIONAL with --steps telo_finder >:
   - `teloseq`: Telomeric motif
-- `busco`
+- `busco` < OPTIONAL with --steps busco >:
   - `lineages_path`: path to folder above lineages folder
   - `lineage`: Example is `nematode_odb10`
 
@@ -416,13 +417,13 @@ find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}"
 - `BUSCO_ANALYSIS`
   - Uses BUSCO to identify ancestral elements. Also use to identify ancestral Lepidopteran genes (merian units).
 - `KMER`
-  - Generating kmer graphs of the assembly.
+  - Generating kmer data and graphs of the assembly.
 
 </details>
 
 ## Running the pipeline
 
-The typical command for running the pipeline is as follows (if you require the RAPID workflow you can append `--mode RAPID` to the command):
+The typical and minimal command for running the pipeline is as follows (if you require the RAPID workflow you can append `--mode RAPID` to the command):
 
 ```console
 nextflow run sanger-tol/treeval --input assets/treeval.yaml --outdir <OUTDIR> -profile singularity,sanger
@@ -443,11 +444,11 @@ work                # Directory containing the nextflow working files
 
 The TreeVal pipeline now contains a command line option for `--mode` which replaces the now deprecated `-entry` parameter.
 
-This enum param expects only one of ["FULL", "RAPID", "RAPID-TOL", "JBROWSE", "FULL_COMBINED"].
+This enum param expects only one of ["FULL", "RAPID", "RAPID_TOL", "JBROWSE", "FULL_COMBINED"].
 
-FULL will run all subworkflows shows in all_steps_list.
+FULL will run all subworkflows shown below in the all_steps_list.
 
-RAPID and RAPID-TOL will run all subworkflows in rapid_include_list. Although there is no obvious difference, RAPID-TOL includes a check later in the pipeline to stop the generation of Juicer files which are no longer in use at Sanger. The logic is:
+RAPID and RAPID_TOL will run all subworkflows in rapid_include_list. Although there is no obvious difference, RAPID_TOL includes a check later in the pipeline to stop the generation of Juicer files which are no longer in use at Sanger. The logic is:
 
 ```
 if workflow != RAPID_TOL and param.juicer == false, then run juicer subsetting
@@ -457,7 +458,7 @@ This qualifies as: if (false && false) { run juicer } or if workflow is RAPID_TO
 
 JBROWSE generates the data which can be ingested by JBROWSE. Useful when RAPID has been used in a previous run and you now need "the rest" of the data. This runs all subworkflows noted in jbrowse_include_list.
 
-FULL_COMBINED runs all subworkflows in combined_include_list, which includes all steps except selfcomp and gene_alignment.
+FULL_COMBINED (intended for runs where the primary and n\*haplotype have been merged into one file) runs all subworkflows in combined_include_list, which includes all steps except selfcomp and gene_alignment.
 
 ```
 all_steps_list          = ["insilico_digest", "gene_alignment", "repeat_density", "gap_finder", "selfcomp", "synteny", "read_coverage", "telo_finder", "busco", "kmer", "hic_mapping", "NONE"]