instadeepai · JemmaLDaniel · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
diff --git a/.gitignore b/.gitignore
@@ -12,16 +12,30 @@ docs_public
 *.csv
 *.parquet
 *.ipc
+*.mztab
+*.fasta
+*.mgf
 *.pkl
 *.json
 *.yaml
+*.pdf
+*.png
+
+*.ipynb
 
 examples/winnow-general-model
 examples/winnow-ms-datasets
 examples/output
 
+# Sample data files
+examples/example_data/*.ipc
+examples/example_data/*.csv
+examples/example_data/*.parquet
+
 build/
 
+.cursorrules
+
 # Coverage reports
 htmlcov/
 .coverage

diff --git a/Makefile b/Makefile
@@ -86,12 +86,16 @@ install-all:
 ## Development commands														 	#
 #################################################################################
 
-.PHONY: tests test-docker bash set-gcp-credentials set-ceph-credentials
+.PHONY: tests clean-coverage test-docker bash build-package clean-build clean-workspace test-build clean-all-build test-cli-isolated test-cli-config set-gcp-credentials set-ceph-credentials
 
 ## Run all tests
 tests:
 	$(PYTEST)
 
+## Clean coverage reports
+clean-coverage:
+	rm -rf htmlcov/ .coverage coverage.xml pytest.xml
+
 ## Run all tests in the Docker Image
 test-docker:
 	docker run $(DOCKER_RUN_FLAGS) $(DOCKER_IMAGE) $(PYTEST)
@@ -100,6 +104,17 @@ test-docker:
 bash:
 	docker run -it $(DOCKER_RUN_FLAGS) $(DOCKER_IMAGE) /bin/bash
 
+## Build the winnow-fdr package (creates wheel and sdist in dist/)
+build-package:
+	uv build
+
+## Clean all build artifacts (dist/, build/, *.egg-info/)
+clean-build:
+	rm -rf dist/ build/ *.egg-info/ winnow_fdr.egg-info/
+
+## Build the package and then clean up (safe test build; cleanup is sequenced after the build, so it also holds under `make -j`)
+test-build: build-package ; $(MAKE) clean-build
+
 ## Set the GCP credentials
 set-gcp-credentials:
 	uv run python scripts/set_gcp_credentials.py
@@ -108,3 +123,28 @@ set-gcp-credentials:
 ## Set the Ceph credentials
 set-ceph-credentials:
 	uv run python scripts/set_ceph_credentials.py
+
+#################################################################################
+## Sample data and CLI commands													#
+#################################################################################
+
+.PHONY: sample-data train-sample predict-sample clean clean-all
+
+## Generate sample data files for testing
+sample-data:
+	uv run python scripts/generate_sample_data.py
+
+## Run winnow train with sample data (uses defaults from config)
+train-sample:
+	winnow train
+
+## Run winnow predict with sample data (uses locally trained model from models/new_model)
+predict-sample:
+	winnow predict calibrator.pretrained_model_name_or_path=models/new_model
+
+## Clean output directories (does not delete sample data)
+clean:
+	rm -rf models/ results/
+
+## Clean outputs and regenerate sample data
+clean-all: clean sample-data
diff --git a/README.md b/README.md
@@ -42,9 +42,9 @@
     <a href="https://instadeepai.github.io/winnow/"><strong>Explore the docs »</strong></a>
     <br />
     <br />
-    <a href="https://github.com/instadeepai/winnow/issues/new?labels=bug&template=bug_report.md">Report Bug</a>
+    <a href="https://github.com/instadeepai/winnow/issues/new?labels=bug&template=bug_report.md">Report bug</a>
     &middot;
-    <a href="https://github.com/instadeepai/winnow/issues/new?labels=enhancement&template=feature_request.md">Request Feature</a>
+    <a href="https://github.com/instadeepai/winnow/issues/new?labels=enhancement&template=feature_request.md">Request feature</a>
   </p>
 </div>
 
@@ -55,16 +55,13 @@
   <summary>Table of Contents</summary>
   <ol>
     <li>
-      <a href="#about-the-project">About The Project</a>
+      <a href="#about-the-project">About the project</a>
     </li>
     <li>
       <a href="#installation">Installation</a>
     </li>
-    <li><a href="#usage">Usage</a>
-      <ul>
-        <li><a href="#CLI">CLI</a></li>
-        <li><a href="#Package">Package</a></li>
-      </ul>
+    <li>
+      <a href="#usage">Usage</a>
     </li>
     <li><a href="#contributing">Contributing</a></li>
   </ol>
@@ -77,7 +74,7 @@
 </div>
 
 <!-- ABOUT THE PROJECT -->
-## About The Project
+## About the project
 
 <!-- [![Product Name Screen Shot][product-screenshot]](https://example.com) -->
 In bottom-up proteomics workflows, peptide sequencing—matching an MS2 spectrum to a peptide—is just the first step. The resulting peptide-spectrum matches (PSMs) often contain many incorrect identifications, which can negatively impact downstream tasks like protein assembly.
@@ -87,7 +84,7 @@ To mitigate this, intermediate steps are introduced to:
 1. Assign confidence scores to PSMs that better correlate with correctness.
 2. Estimate and control the false discovery rate (FDR) by filtering identifications based on confidence scores.
 
-For database search-based peptide sequencing, PSM rescoring and target-decoy competition (TDC) are standard approaches, supported by an extensive ecosystem of tools. However, *de novo* peptide sequencing lacks standardized methods for these tasks.
+For database search-based peptide sequencing, PSM rescoring and target-decoy competition (TDC) are standard approaches, supported by an extensive ecosystem of tools. However, *de novo* peptide sequencing lacks standardised methods for these tasks.
 
 `winnow` aims to fill this gap by implementing the calibrate-estimate framework for FDR estimation. Unlike TDC, this approach is directly applicable to *de novo* sequencing models. Additionally, its calibration step naturally incorporates common confidence rescoring workflows as part of FDR estimation.
 
@@ -109,7 +106,38 @@ uv pip install winnow-fdr
 ```
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
 
+<!-- QUICK START -->
+## Quick start
+
+Get started with `winnow` in minutes using the included sample data:
+
+```bash
+# Generate sample data (if not already present)
+make sample-data
+
+# Train a calibrator on the sample data
+make train-sample
+
+# Run prediction with the trained model
+make predict-sample
+```
+
+The sample data is automatically configured in `configs/train.yaml` and `configs/predict.yaml`, so you can also use the commands directly:
+
+```bash
+# Train with sample data
+winnow train
+
+# Predict with pretrained HuggingFace model
+winnow predict
+
+# Predict with your locally trained model
+winnow predict calibrator.pretrained_model_name_or_path=models/new_model
+```
+
+**Note:** The sample data is minimal (20 spectra) and intended for testing only. For production use, replace with your own datasets.
 
+<p align="right">(<a href="#readme-top">back to top</a>)</p>
 
 <!-- USAGE EXAMPLES -->
 ## Usage
@@ -126,13 +154,26 @@ Installing `winnow` provides the `winnow` command with two sub-commands:
 1. `winnow train` – Performs confidence calibration on a dataset of annotated PSMs, outputting the fitted model checkpoint.
 2. `winnow predict` – Performs confidence calibration using a fitted model checkpoint (defaults to a pretrained general model from HuggingFace), estimates and controls FDR using the calibrated confidence scores.
 
-By default, `winnow predict` uses a pretrained general model (`InstaDeepAI/winnow-general-model`) hosted on HuggingFace Hub, allowing you to get started immediately without training. You can also specify custom HuggingFace models or use locally trained models.
+By default, `winnow predict` uses a pretrained general model ([`InstaDeepAI/winnow-general-model`](https://huggingface.co/InstaDeepAI/winnow-general-model)) hosted on HuggingFace Hub, allowing you to get started immediately without training. You can also specify custom HuggingFace models or use locally trained models.
+
+Winnow uses [Hydra](https://hydra.cc/) for flexible, hierarchical configuration management. All parameters can be configured via YAML files or overridden on the command line:
+
+```bash
+# Quick start with defaults
+winnow predict
 
-Refer to the documentation for details on command-line arguments and usage examples.
+# Override specific parameters
+winnow predict fdr_control.fdr_threshold=0.01
+
+# Specify different data source and dataset paths
+winnow predict data_loader=mztab dataset.spectrum_path_or_directory=data/spectra.parquet dataset.predictions_path=data/preds.mztab
+```
+
+Refer to the [CLI Guide](cli.md) and [Configuration Guide](configuration.md) for details on usage and configuration options.
 
 ### Package
 
-The `winnow` package is organized into three sub-modules:
+The `winnow` package is organised into three sub-modules:
 
 1. `winnow.datasets` – Handles data loading and saving, including the `CalibrationDataset` class for mapping peptide sequencing output formats.
 2. `winnow.calibration` – Implements confidence calibration. Key components include:
@@ -152,10 +193,11 @@ For an example, check out the [example notebook](https://github.com/instadeepai/
 Contributions are what make the open-source community such an amazing place to learn, inspire and create, and we welcome your support! Any contributions you make are **greatly appreciated**.
 
 If you have ideas for enhancements, you can:
+
 - Fork the repository and submit a pull request.
 - Open an issue and tag it with "enhancement".
 
-### Contribution Process
+### Contribution process
 
 1. Fork the repository.
 2. Create a feature branch (`git checkout -b feature-amazing-feature`).
@@ -166,7 +208,7 @@ Don't forget to give the project a star! Thanks again! :star:
 
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
 
-### BibTeX entry and citation info
+### BibTeX entry and citation information
 
 If you use `winnow` in your research, please cite the following preprint:
 
@@ -175,7 +217,11 @@ If you use `winnow` in your research, please cite the following preprint:
       title={De novo peptide sequencing rescoring and FDR estimation with Winnow},
       author={Amandla Mabona and Jemma Daniel and Henrik Servais Janssen Knudsen and Rachel Catzel
       and Kevin Michael Eloff and Erwin M. Schoof and Nicolas Lopez Carranza and Timothy P. Jenkins
       and Jeroen Van Goey and Konstantinos Kalogeropoulos},
       year={2025},
       eprint={2509.24952},
       archivePrefix={arXiv},

diff --git a/docs/api/calibration.md b/docs/api/calibration.md
@@ -96,7 +96,7 @@ class CustomFeature(CalibrationFeatures):
 - **Column Specification**: Define output column names
 - **Dataset Integration**: Direct access to CalibrationDataset for computation
 
-## Built-in Features
+## Built-in features
 
 ### MassErrorFeature
 
@@ -159,7 +159,7 @@ feature = RetentionTimeFeature(hidden_dim=10, train_fraction=0.1)
 
 **Purpose**: Incorporates chromatographic information for confidence calibration.
 
-## Handling Missing Features
+## Handling missing features
 
 Prosit-dependent features (PrositFeatures, ChimericFeatures, RetentionTimeFeature) may not be computable for all peptides due to limitations like:
 
@@ -170,7 +170,7 @@ Prosit-dependent features (PrositFeatures, ChimericFeatures, RetentionTimeFeatur
 
 Winnow provides two strategies for handling such cases:
 
-### Learn Strategy (Default, `learn_from_missing=True`)
+### Learn strategy (Default, `learn_from_missing=True`)
 
 **Recommended for most use cases.**
 
@@ -179,7 +179,7 @@ Winnow provides two strategies for handling such cases:
 - Uses all available data, maximising recall
 - More robust across diverse datasets
 
-### Filter Strategy (`learn_from_missing=False`)
+### Filter strategy (`learn_from_missing=False`)
 
 **Use when you want strict data quality requirements.**
 
@@ -228,14 +228,14 @@ rt_feat = RetentionTimeFeature(hidden_dim=10, train_fraction=0.1, learn_from_mis
 
 ## Workflow
 
-### Training Workflow
+### Training workflow
 
 1. **Create Calibrator**: Initialise `ProbabilityCalibrator`
 2. **Add Features**: Use `add_feature()` to include desired calibration features
 3. **Fit Model**: Call `fit()` with labelled `CalibrationDataset`
 4. **Save Model**: Use `save()` to persist trained calibrator
 
-### Prediction Workflow
+### Prediction workflow
 
 1. **Load Calibrator**: Use `load()` to restore trained model from a HuggingFace repository or a local directory
    ```python
@@ -251,7 +251,7 @@ rt_feat = RetentionTimeFeature(hidden_dim=10, train_fraction=0.1, learn_from_mis
 2. **Predict**: Call `predict()` with unlabelled `CalibrationDataset`
 3. **Access Results**: Calibrated scores stored in dataset's "calibrated_confidence" column
 
-## Feature Dependencies
+## Feature dependencies
 
 The system automatically handles feature dependencies:
 

diff --git a/docs/api/datasets.md b/docs/api/datasets.md
@@ -37,12 +37,12 @@ dataset.save(Path("output_directory"))
 
 **Key Features:**
 
-- **Multiple Format Support**: Load data using specialized loaders for different file formats
+- **Multiple Format Support**: Load data using specialised loaders for different file formats
 - **Data Integration**: Combines spectral data with prediction metadata
 - **Filtering**: Removes invalid tokens and unsupported modifications
 - **Evaluation**: Computes correctness labels when ground truth available
 
-### Data Loaders
+### Data loaders
 
 The datasets module provides several data loaders that implement the `DatasetLoader` protocol:
 

diff --git a/docs/api/fdr.md b/docs/api/fdr.md
@@ -2,7 +2,7 @@
 
 The `winnow.fdr` module implements false discovery rate (FDR) estimation and control methods for *de novo* peptide sequencing using both database-grounded and non-parametric approaches.
 
-## Base Interface
+## Base interface
 
 ### FDRControl
 
@@ -108,9 +108,9 @@ dataset_with_q_values = fdr_control.add_psm_q_value(dataset, "confidence")
     - **Q-value**: `compute_q_value(score)` - Minimum FDR for significance
 - **No Ground Truth Required**: Works with confidence scores alone
 
-## Additional Features
+## Additional features
 
-### PSM-Specific FDR
+### PSM-specific FDR
 
 Both methods support PSM-specific FDR estimation:
 
@@ -140,7 +140,7 @@ dataset_with_q_values = fdr_control.add_psm_q_value(
 psm_q_values = dataset_with_q_values["psm_q_value"]
 ```
 
-### Confidence Curves
+### Confidence curves
 
 Generate FDR vs confidence curves for analysis:
 
@@ -159,7 +159,7 @@ plt.xlabel("FDR Threshold")
 plt.ylabel("Confidence Cutoff")
 ```
 
-### Dataset Filtering
+### Dataset filtering
 
 Filter PSM datasets at target FDR levels:
 
@@ -175,7 +175,7 @@ filtered_psms = fdr_control.filter_entries(
 print(f"Retained {len(filtered_psms)} PSMs at 1% FDR")
 ```
 
-### FDR Estimation Method Selection
+### FDR estimation method selection
 
 **Use DatabaseGroundedFDRControl when:**