diff --git a/README.md b/README.md
index fcadadb2..106ff27b 100644
--- a/README.md
+++ b/README.md
@@ -27,16 +27,24 @@ This repository is a work in progress, and still in early development. This repo
 
 ## Installation
 
-To install any component of `gtars`, you must have the rust toolchain installed. You can install it by [following the instructions](https://www.rust-lang.org/tools/install).
+To install `gtars`, you must first [install the rust toolchain](https://www.rust-lang.org/tools/install).
 
 ### Command-line interface
+
 You may build the cli binary locally by navigating to `gtars-cli` and using `cargo build --release`. This will create a binary in `target/release/gtars` at the top level of the workspace. You can then add this to your path, or run it directly.
 
 Alternatively, you can run `cargo install --path gtars-cli` from the top level of the workspace. This will install the binary to your cargo bin directory (usually `~/.cargo/bin`).
 
+We feature-gate binary dependencies to maximize compatibility and minimize install size. You can specify features during installation like so:
+
+```
+cargo install --path gtars-cli --features "uniwig tokenizers"
+```
+
 Finally, you can download precompiled binaries from the [releases page](https://github.com/databio/gtars/releases).
 
 ### Python bindings
+
 You can install the Python bindings via pip. First, ensure you have a recent version of pip installed. Then run:
 
 ```bash
@@ -50,6 +58,11 @@ from gtars import __version__
 print(__version__)
 ```
 
+### Dev Python bindings
+
+To build and install the development version of the Python bindings from source, see [gtars-python/README.md](gtars-python/README.md).
+
 ## Usage
 
 `gtars` provides several useful tools. There are 3 ways to use `gtars`.
diff --git a/gtars-core/src/models/region_set.rs b/gtars-core/src/models/region_set.rs
index d7b5ab07..9dfc59e3 100644
--- a/gtars-core/src/models/region_set.rs
+++ b/gtars-core/src/models/region_set.rs
@@ -22,7 +22,9 @@ use bigtools::{BedEntry, BigBedWrite};
 use crate::models::Region;
 #[cfg(feature = "http")]
 use crate::utils::get_dynamic_reader_from_url;
-use crate::utils::{get_chrom_sizes, get_dynamic_reader};
+use crate::utils::get_dynamic_reader;
+#[cfg(feature = "bigbed")]
+use crate::utils::get_chrom_sizes;
 
 ///
 /// RegionSet struct, the representation of the interval region set file,
diff --git a/gtars-core/src/utils.rs b/gtars-core/src/utils.rs
index ed26d524..2a56701c 100644
--- a/gtars-core/src/utils.rs
+++ b/gtars-core/src/utils.rs
@@ -2,14 +2,19 @@ use std::collections::HashMap;
 use std::ffi::OsStr;
 use std::fs::File;
 use std::io::prelude::*;
-use std::io::{BufRead, BufReader, Cursor};
+use std::io::{BufRead, BufReader};
+#[cfg(feature = "http")]
+use std::io::Cursor;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 
 use anyhow::{Context, Result};
-use flate2::read::{GzDecoder, MultiGzDecoder};
+#[cfg(feature = "http")]
+use flate2::read::GzDecoder;
+use flate2::read::MultiGzDecoder;
 #[cfg(feature = "http")]
 use reqwest::blocking::Client;
+#[cfg(feature = "http")]
 use std::error::Error;
 
 use crate::models::region::Region;
diff --git a/gtars-python/README.md b/gtars-python/README.md
new file mode 100644
index 00000000..aff2af7f
--- /dev/null
+++ b/gtars-python/README.md
@@ -0,0 +1,45 @@
+# gtars
+
+This is a Python package that wraps the `gtars` crate so you can call gtars code from Python.
+
+Documentation for Python bindings is hosted at: https://docs.bedbase.org/gtars/
+
+## Brief instructions
+
+To install the development version, you'll have to build it locally.
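+If you don't have a local checkout yet, clone the repository first and work from the `gtars-python` directory (a sketch; the URL matches the releases link in the top-level README):
+
+```console
+git clone https://github.com/databio/gtars
+cd gtars/gtars-python
+```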
+The easiest way to build and install is with pip, which reads the build configuration in `pyproject.toml`:
+
+```
+python -m pip install -e .
+```
+
+Or, you can have more control over it in two steps using `maturin` directly.
+Build Python bindings like this:
+
+```console
+cd gtars-python
+python_version=$(python --version | awk '{print $2}' | cut -d '.' -f1-2 )
+maturin build --interpreter $python_version --release
+```
+
+
+Then install the local wheel that was just built:
+
+```console
+gtars_version=`grep '^version =' Cargo.toml | cut -d '"' -f 2`
+python_version_nodot=$(python --version | awk '{print $2}' | cut -d '.' -f1-2 | tr -d '.')
+ls -l ../target/wheels/gtars*
+wheel_path=$(find ../target/wheels/gtars-${gtars_version}-cp${python_version_nodot}-cp${python_version_nodot}-manylinux*.whl)
+echo $wheel_path
+pip install --force-reinstall ${wheel_path}
+```
+
+
+## Importing into Python
+
+Once installed, you can import and use the package in Python. For example:
+
+```
+from gtars import refget
+sc = refget.digest_fasta("../tests/data/fasta/base.fa")
+sc2 = refget.load_fasta("../tests/data/fasta/base.fa")
+```
diff --git a/gtars-python/py_src/gtars/refget/__init__.pyi b/gtars-python/py_src/gtars/refget/__init__.pyi
index 3aa41f12..c6536955 100644
--- a/gtars-python/py_src/gtars/refget/__init__.pyi
+++ b/gtars-python/py_src/gtars/refget/__init__.pyi
@@ -1,3 +1,18 @@
+"""Type stubs and documentation for the gtars.refget module.
+
+This file serves two purposes:
+
+1. **Type Hints**: Provides type annotations for IDE autocomplete and static
+   type checking tools like mypy.
+
+2. **Documentation**: Contains Google-style docstrings that mkdocstrings uses
+   to generate the API reference documentation website.
+
+Note: The actual implementation is in Rust (gtars-python/src/refget/mod.rs)
+and compiled via PyO3. This stub file provides the Python interface definition
+and structured documentation that tools can parse properly.
+"""
+
 from typing import Union, Optional, List, Type
 from enum import Enum
 from os import PathLike
@@ -93,74 +108,422 @@ class StorageMode(Enum):
     Encoded: int
 
 class GlobalRefgetStore:
-    """
-    A global store for refget sequences, allowing import, retrieval, and storage operations.
+    """A global store for GA4GH refget sequences with lazy-loading support.
+
+    GlobalRefgetStore provides content-addressable storage for reference genome
+    sequences following the GA4GH refget specification. Supports both local and
+    remote stores with on-demand sequence loading.
+
+    Attributes:
+        cache_path: Local directory path where the store is located or cached.
+            None for in-memory stores.
+        remote_url: Remote URL of the store if loaded remotely, None otherwise.
+
+    Examples:
+        Create a new store and import sequences::
+
+            from gtars.refget import GlobalRefgetStore, StorageMode
+            store = GlobalRefgetStore(StorageMode.Encoded)
+            store.import_fasta("genome.fa")
+
+        Load an existing local store::
+
+            store = GlobalRefgetStore.load_local("/data/hg38")
+            seq = store.get_substring("chr1_digest", 0, 1000)
+
+        Load a remote store with caching::
+
+            store = GlobalRefgetStore.load_remote(
+                "/local/cache",
+                "https://example.com/hg38"
+            )
     """
-    def __init__(self, mode: StorageMode) -> None: ...
-    def import_fasta(self, file_path: Union[str, PathLike]) -> None:
+
+    cache_path: Optional[str]
+    remote_url: Optional[str]
+
+    def __init__(self, mode: StorageMode) -> None:
+        """Create a new empty GlobalRefgetStore.
+ + Args: + mode: Storage mode - StorageMode.Raw (uncompressed) or + StorageMode.Encoded (bit-packed, space-efficient). + + Example:: + + store = GlobalRefgetStore(StorageMode.Encoded) """ - Import a fasta into the GlobalRefgetStore + ... + + @classmethod + def load_local(cls, cache_path: Union[str, PathLike]) -> "GlobalRefgetStore": + """Load a local RefgetStore from a directory. + + Loads metadata from the local store immediately; sequence data is loaded + on-demand when first accessed. This is efficient for large genomes where + you may only need specific sequences. + + Args: + cache_path: Local directory containing the refget store (must have + index.json and sequences.farg files). + + Returns: + GlobalRefgetStore with metadata loaded, sequences lazy-loaded. + + Raises: + IOError: If the store directory or index files cannot be read. + + Example:: + + store = GlobalRefgetStore.load_local("/data/hg38_store") + seq = store.get_substring("chr1_digest", 0, 1000) """ ... - def get_sequence_by_id(self, digest: str) -> Optional[SequenceRecord]: + + @classmethod + def load_remote( + cls, cache_path: Union[str, PathLike], remote_url: str + ) -> "GlobalRefgetStore": + """Load a remote RefgetStore with local caching. + + Fetches metadata (index.json, sequences.farg) from a remote URL immediately. + Sequence data (.seq files) are downloaded on-demand when first accessed and + cached locally. This is ideal for working with large remote genomes where + you only need specific sequences. + + Args: + cache_path: Local directory to cache downloaded metadata and sequences. + Created if it doesn't exist. + remote_url: Base URL of the remote refget store (e.g., + "https://example.com/hg38" or "s3://bucket/hg38"). + + Returns: + GlobalRefgetStore with metadata loaded, sequences fetched on-demand. + + Raises: + IOError: If remote metadata cannot be fetched or cache cannot be written. + + Example:: + + store = GlobalRefgetStore.load_remote( + "/data/cache/hg38", + "https://refget-server.com/hg38" + ) + # First access fetches from remote and caches + seq = store.get_substring("chr1_digest", 0, 1000) + # Second access uses cache + seq2 = store.get_substring("chr1_digest", 1000, 2000) """ - Retrieves a sequence record by its SHA512t24u or MD5 digest. + ... + + def import_fasta(self, file_path: Union[str, PathLike]) -> None: + """Import sequences from a FASTA file into the store. + + Reads all sequences from a FASTA file and adds them to the store. + Computes GA4GH digests and creates a sequence collection. + + Args: + file_path: Path to the FASTA file. + + Raises: + IOError: If the file cannot be read or parsed. + + Example:: + + store = GlobalRefgetStore(StorageMode.Encoded) + store.import_fasta("genome.fa") """ ... + + def get_sequence_by_id(self, digest: str) -> Optional[SequenceRecord]: + """Retrieve a sequence record by its digest (SHA-512/24u or MD5). + + Searches for a sequence by its GA4GH SHA-512/24u digest. If not found + and the input looks like an MD5 digest (32 hex characters), tries MD5 lookup. + + Args: + digest: Sequence digest (SHA-512/24u base64url or MD5 hex string). + + Returns: + The sequence record if found, None otherwise. + + Example:: + + record = store.get_sequence_by_id("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2") + if record: + print(f"Found: {record.metadata.name}") + """ + ... 
+ def get_sequence_by_collection_and_name( self, collection_digest: str, sequence_name: str ) -> Optional[SequenceRecord]: - """ - Retrieve a SequenceRecord from the store by its collection digest and name - """ + """Retrieve a sequence by collection digest and sequence name. + Looks up a sequence within a specific collection using its name + (e.g., "chr1", "chrM"). This is useful when you know the genome assembly + (collection) and chromosome name. + + Args: + collection_digest: The collection's SHA-512/24u digest. + sequence_name: Name of the sequence within that collection. + + Returns: + The sequence record if found, None otherwise. + + Example:: + + record = store.get_sequence_by_collection_and_name( + "uC_UorBNf3YUu1YIDainBhI94CedlNeH", + "chr1" + ) + """ ... + def get_substring(self, seq_digest: str, start: int, end: int) -> Optional[str]: - """ - Retrieves a substring from an encoded sequence by its SHA512t24u digest. + """Extract a substring from a sequence. + + Retrieves a specific region from a sequence using 0-based, half-open + coordinates [start, end). Automatically loads sequence data if not + already cached (for lazy-loaded stores). + Args: - seq_digest - str - the path to import from - start - int - The start index of the substring (inclusive) - end - int - The end index of the substring (exclusive) + seq_digest: Sequence digest (SHA-512/24u). + start: Start position (0-based, inclusive). + end: End position (0-based, exclusive). + Returns: - substring - str - returns substring if found, None if not found + The substring sequence if found, None otherwise. + + Example:: + + # Get first 1000 bases of chr1 + seq = store.get_substring("chr1_digest", 0, 1000) + print(f"First 50bp: {seq[:50]}") + """ + ... + + def list_sequences(self) -> List[SequenceMetadata]: + """List all sequence metadata in the store. + Returns: + List of metadata for all sequences in the store. """ ... + + def list_collections(self) -> List[SequenceCollection]: + """List all sequence collections in the store. + + Returns: + List of all sequence collections. + """ + ... + def write_store_to_directory( self, root_path: Union[str, PathLike], seqdata_path_template: str ) -> None: + """Write the store to a directory on disk. + + Persists the store with all sequences and metadata to disk using the + RefgetStore directory format. + + Args: + root_path: Directory path to write the store to. + seqdata_path_template: Path template for sequence files (e.g., + "sequences/%s2/%s.seq" where %s2 = first 2 chars of digest, + %s = full digest). + + Example:: + + store.write_store_to_directory( + "/data/my_store", + "sequences/%s2/%s.seq" + ) """ - Write a GlobalRefgetStore object to a directory + ... + + def get_seqs_bed_file( + self, + collection_digest: str, + bed_file_path: Union[str, PathLike], + output_fasta_path: Union[str, PathLike], + ) -> None: + """Extract sequences for BED regions and write to FASTA. + + Args: + collection_digest: Collection digest to look up sequence names. + bed_file_path: Path to BED file with regions. + output_fasta_path: Path to write output FASTA file. """ ... - @classmethod - def load_from_directory( - cls: Type["GlobalRefgetStore"], root_path: Union[str, PathLike] - ) -> "GlobalRefgetStore": + + def get_seqs_bed_file_to_vec( + self, collection_digest: str, bed_file_path: Union[str, PathLike] + ) -> List[RetrievedSequence]: + """Extract sequences for BED regions and return as list. + + Args: + collection_digest: Collection digest to look up sequence names. 
+ bed_file_path: Path to BED file with regions. + + Returns: + List of retrieved sequence segments. """ - Load a GlobalRefgetStore from a directory path + ... + + def export_fasta( + self, + collection_digest: str, + output_path: Union[str, PathLike], + sequence_names: Optional[List[str]] = None, + line_width: Optional[int] = None, + ) -> None: + """Export sequences from a collection to a FASTA file. + + Args: + collection_digest: Collection to export from. + output_path: Path to write FASTA file. + sequence_names: Optional list of sequence names to export. If None, + exports all sequences in the collection. + line_width: Optional line width for wrapping sequences. If None, + uses default of 80. """ + ... + def export_fasta_by_digests( + self, + digests: List[str], + output_path: Union[str, PathLike], + line_width: Optional[int] = None, + ) -> None: + """Export sequences by their digests to a FASTA file. + + Args: + digests: List of sequence digests to export. + output_path: Path to write FASTA file. + line_width: Optional line width for wrapping sequences. If None, + uses default of 80. + """ ... + def __str__(self) -> str: ... def __repr__(self) -> str: ... def sha512t24u_digest(readable: Union[str, bytes]) -> str: - """ - Computes the GA4GH SHA512t24u digest for a given string or bytes. + """Compute the GA4GH SHA-512/24u digest for a sequence. + + This function computes the GA4GH refget standard digest (truncated SHA-512, + base64url encoded) for a given sequence string or bytes. + + Args: + readable: Input sequence as str or bytes. + + Returns: + The SHA-512/24u digest (32 character base64url string). + + Raises: + TypeError: If input is not str or bytes. + + Example:: + from gtars.refget import sha512t24u_digest + digest = sha512t24u_digest("ACGT") + print(digest) + # Output: 'aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2' """ ... def md5_digest(readable: Union[str, bytes]) -> str: - """ - Computes the MD5 digest for a given string or bytes. + """Compute the MD5 digest for a sequence. + + This function computes the MD5 hash for a given sequence string or bytes. + MD5 is supported for backward compatibility with legacy systems. + + Args: + readable: Input sequence as str or bytes. + + Returns: + The MD5 digest (32 character hexadecimal string). + + Raises: + TypeError: If input is not str or bytes. + + Example:: + from gtars.refget import md5_digest + digest = md5_digest("ACGT") + print(digest) + # Output: 'f1f8f4bf413b16ad135722aa4591043e' """ ... def digest_fasta(fasta: Union[str, PathLike]) -> SequenceCollection: + """Digest all sequences in a FASTA file and compute collection-level digests. + + This function reads a FASTA file and computes GA4GH-compliant digests for + each sequence, as well as collection-level digests (Level 1 and Level 2) + following the GA4GH refget specification. + + Args: + fasta: Path to FASTA file (str or PathLike). + + Returns: + Collection containing all sequences with their metadata and computed digests. + + Raises: + IOError: If the FASTA file cannot be read or parsed. + + Example:: + from gtars.refget import digest_fasta + collection = digest_fasta("genome.fa") + print(f"Collection digest: {collection.digest}") + print(f"Number of sequences: {len(collection)}") + """ + ... + +def compute_fai(fasta: Union[str, PathLike]) -> List["FaiRecord"]: + """Compute FASTA index (FAI) metadata for all sequences in a FASTA file. 
+
+    This function computes the FAI index metadata (offset, line_bases, line_bytes)
+    for each sequence in a FASTA file, compatible with samtools faidx format.
+    Only works with uncompressed FASTA files.
+
+    Args:
+        fasta: Path to FASTA file (str or PathLike). Must be uncompressed.
+
+    Returns:
+        List of FAI records, one per sequence, containing name, length,
+        and FAI metadata (offset, line_bases, line_bytes).
+
+    Raises:
+        IOError: If the FASTA file cannot be read or is compressed.
+
+    Example::
+        from gtars.refget import compute_fai
+        fai_records = compute_fai("genome.fa")
+        for record in fai_records:
+            print(f"{record.name}: {record.length} bp")
     """
-    Digests a FASTA file and returns a SequenceCollection object.
+    ...
+
+def load_fasta(fasta: Union[str, PathLike]) -> SequenceCollection:
+    """Load a FASTA file with sequence data into a SequenceCollection.
+
+    This function reads a FASTA file and loads all sequences with their data
+    into memory. Unlike digest_fasta(), this includes the actual sequence data,
+    not just metadata.
+
+    Args:
+        fasta: Path to FASTA file (str or PathLike).
+
+    Returns:
+        Collection containing all sequences with their metadata and sequence data loaded.
+
+    Raises:
+        IOError: If the FASTA file cannot be read or parsed.
+
+    Example::
+        from gtars.refget import load_fasta
+        collection = load_fasta("genome.fa")
+        first_seq = collection[0]
+        print(f"Sequence: {first_seq.sequence[:50]}...")
     """
     ...
diff --git a/gtars-python/src/refget/mod.rs b/gtars-python/src/refget/mod.rs
index 28d3746e..27de93cb 100644
--- a/gtars-python/src/refget/mod.rs
+++ b/gtars-python/src/refget/mod.rs
@@ -8,13 +8,18 @@ use pyo3::types::{PyBytes, PyString, PyType};
 
 use gtars_refget::alphabet::AlphabetType;
 use gtars_refget::collection::{
-    SeqColDigestLvl1, SequenceCollection, SequenceMetadata, SequenceRecord,
+    FaiMetadata, SeqColDigestLvl1, SequenceCollection, SequenceMetadata, SequenceRecord,
 };
 use gtars_refget::digest::{md5, sha512t24u};
+use gtars_refget::fasta::FaiRecord;
 use gtars_refget::store::GlobalRefgetStore;
 use gtars_refget::store::StorageMode;
 // use gtars::refget::store::RetrievedSequence; // This is the Rust-native struct
 
+/// Compute the GA4GH SHA-512/24u digest for a sequence.
+///
+/// Accepts either a string or bytes and returns the truncated SHA-512 digest
+/// as a 32-character base64url string.
 #[pyfunction]
 pub fn sha512t24u_digest(readable: &Bound<'_, PyAny>) -> PyResult<String> {
     if let Ok(s) = readable.cast::<PyString>() {
@@ -26,6 +31,10 @@ pub fn md5_digest(readable: &Bound<'_, PyAny>) -> PyResult<String> {
     }
 }
 
+/// Compute the MD5 digest for a sequence.
+///
+/// Accepts either a string or bytes and returns the MD5 hash as a 32-character
+/// hexadecimal string. Supported for backward compatibility with legacy systems.
 #[pyfunction]
 pub fn md5_digest(readable: &Bound<'_, PyAny>) -> PyResult<String> {
     if let Ok(s) = readable.cast::<PyString>() {
@@ -37,8 +46,10 @@
     }
 }
 
-// This can take either a PosixPath or a string
-// The `&Bound<'_, PyAny>` references any Python object, bound to the Python runtime.
+/// Read a FASTA file and compute GA4GH digests for all sequences.
+///
+/// Returns a SequenceCollection with computed sequence-level and collection-level
+/// digests following the GA4GH refget specification.
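+///
+/// A minimal Python usage sketch (mirrors the stub docstring; the path is
+/// illustrative):
+///
+/// ```python
+/// from gtars.refget import digest_fasta
+/// collection = digest_fasta("genome.fa")
+/// print(collection.digest)
+/// ```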
 #[pyfunction]
 pub fn digest_fasta(fasta: &Bound<'_, PyAny>) -> PyResult<PySequenceCollection> {
     let fasta = fasta.to_string();
@@ -51,6 +62,38 @@
     }
 }
 
+/// Compute FASTA index (FAI) metadata for all sequences in a FASTA file.
+///
+/// Returns a list of FAI records compatible with samtools faidx format.
+/// Only works with uncompressed FASTA files.
+#[pyfunction]
+pub fn compute_fai(fasta: &Bound<'_, PyAny>) -> PyResult<Vec<PyFaiRecord>> {
+    let fasta = fasta.to_string();
+    match gtars_refget::fasta::compute_fai(&fasta) {
+        Ok(fai_records) => Ok(fai_records.into_iter().map(PyFaiRecord::from).collect()),
+        Err(e) => Err(PyErr::new::<pyo3::exceptions::PyIOError, _>(format!(
+            "Error computing FAI: {}",
+            e
+        ))),
+    }
+}
+
+/// Load a FASTA file with sequence data into a SequenceCollection.
+///
+/// Unlike digest_fasta(), this loads the actual sequence data into memory,
+/// not just metadata and digests.
+#[pyfunction]
+pub fn load_fasta(fasta: &Bound<'_, PyAny>) -> PyResult<PySequenceCollection> {
+    let fasta = fasta.to_string();
+    match gtars_refget::fasta::load_fasta(&fasta) {
+        Ok(sequence_collection) => Ok(PySequenceCollection::from(sequence_collection)),
+        Err(e) => Err(PyErr::new::<pyo3::exceptions::PyIOError, _>(format!(
+            "Error loading FASTA file: {}",
+            e
+        ))),
+    }
+}
+
 #[pyclass(name = "AlphabetType")]
 #[derive(Clone)]
 pub enum PyAlphabetType {
@@ -75,6 +118,30 @@ pub struct PySequenceMetadata {
     pub md5: String,
     #[pyo3(get, set)]
     pub alphabet: PyAlphabetType,
+    #[pyo3(get, set)]
+    pub fai: Option<PyFaiMetadata>,
+}
+
+#[pyclass(name = "FaiMetadata")]
+#[derive(Clone)]
+pub struct PyFaiMetadata {
+    #[pyo3(get, set)]
+    pub offset: u64,
+    #[pyo3(get, set)]
+    pub line_bases: u32,
+    #[pyo3(get, set)]
+    pub line_bytes: u32,
+}
+
+#[pyclass(name = "FaiRecord")]
+#[derive(Clone)]
+pub struct PyFaiRecord {
+    #[pyo3(get, set)]
+    pub name: String,
+    #[pyo3(get, set)]
+    pub length: usize,
+    #[pyo3(get, set)]
+    pub fai: Option<PyFaiMetadata>,
 }
 
 #[pyclass(name = "SequenceRecord")]
@@ -83,7 +150,7 @@ pub struct PySequenceRecord {
     #[pyo3(get, set)]
     pub metadata: PySequenceMetadata,
     #[pyo3(get, set)]
-    pub data: Option<Vec<u8>>,
+    pub sequence: Option<Vec<u8>>,
 }
 
 #[pyclass(name = "SeqColDigestLvl1")]
@@ -108,8 +175,6 @@ pub struct PySequenceCollection {
     pub lvl1: PySeqColDigestLvl1,
     #[pyo3(get, set)]
     pub file_path: Option<String>,
-    #[pyo3(get, set)]
-    pub has_data: bool,
 }
 
 #[pyclass(name = "RetrievedSequence")]
@@ -182,7 +247,10 @@ impl PyAlphabetType {
 #[pymethods]
 impl PySequenceMetadata {
     fn __repr__(&self) -> String {
-        format!("<SequenceMetadata for {}>", self.name)
+        format!(
+            "SequenceMetadata(name='{}', length={}, sha512t24u='{}', md5='{}', alphabet={})",
+            self.name, self.length, self.sha512t24u, self.md5, self.alphabet.__str__()
+        )
     }
 
     fn __str__(&self) -> PyResult<String> {
@@ -193,21 +261,111 @@
     }
 }
 
+#[pymethods]
+impl PyFaiMetadata {
+    fn __repr__(&self) -> String {
+        format!("<FaiMetadata offset={} line_bases={} line_bytes={}>",
+            self.offset, self.line_bases, self.line_bytes)
+    }
+
+    fn __str__(&self) -> String {
+        format!(
+            "FaiMetadata:\n offset: {}\n line_bases: {}\n line_bytes: {}",
+            self.offset, self.line_bases, self.line_bytes
+        )
+    }
+}
+
+#[pymethods]
+impl PyFaiRecord {
+    fn __repr__(&self) -> String {
+        format!("<FaiRecord name={} length={}>", self.name, self.length)
+    }
+
+    fn __str__(&self) -> String {
+        let fai_str = if let Some(ref fai) = self.fai {
+            format!("\n FAI offset: {}\n FAI line_bases: {}\n FAI line_bytes: {}",
+                fai.offset, fai.line_bases, fai.line_bytes)
+        } else {
+            "\n FAI: None (gzipped file)".to_string()
+        };
+        format!("FaiRecord:\n name: {}\n length: {}{}",
+            self.name, self.length, fai_str)
+    }
+}
+
 #[pymethods]
 impl PySequenceRecord {
     fn __repr__(&self) -> String {
-        format!("<SequenceRecord for {}>", self.metadata.name)
+        format!(
+            "SequenceRecord(name='{}', length={}, sha512t24u='{}', has_data={})",
+            self.metadata.name,
+            self.metadata.length,
+            self.metadata.sha512t24u,
+            self.sequence.is_some()
+        )
     }
 
     fn __str__(&self) -> String {
         format!("SequenceRecord for {}", self.metadata.name)
     }
+
+    /// Decode the sequence data to a string.
+    ///
+    /// This method decodes the sequence data stored in this record. It handles
+    /// both raw (uncompressed UTF-8) and encoded (bit-packed) data automatically
+    /// based on the alphabet type.
+    ///
+    /// Returns:
+    ///     Optional[str]: The decoded sequence string if data is loaded, None otherwise.
+    ///
+    /// Example:
+    ///     >>> record = store.get_sequence_by_collection_and_name(digest, "chr1")
+    ///     >>> sequence = record.decode()
+    ///     >>> if sequence:
+    ///     ...     print(f"First 50 bases: {sequence[:50]}")
    pub fn decode(&self) -> Option<String> {
+        // Convert PySequenceRecord to Rust SequenceRecord
+        let metadata = SequenceMetadata {
+            name: self.metadata.name.clone(),
+            length: self.metadata.length,
+            sha512t24u: self.metadata.sha512t24u.clone(),
+            md5: self.metadata.md5.clone(),
+            alphabet: match self.metadata.alphabet {
+                PyAlphabetType::Dna2bit => AlphabetType::Dna2bit,
+                PyAlphabetType::Dna3bit => AlphabetType::Dna3bit,
+                PyAlphabetType::DnaIupac => AlphabetType::DnaIupac,
+                PyAlphabetType::Protein => AlphabetType::Protein,
+                PyAlphabetType::Ascii => AlphabetType::Ascii,
+                PyAlphabetType::Unknown => AlphabetType::Unknown,
+            },
+            fai: self.metadata.fai.as_ref().map(|fai| FaiMetadata {
+                offset: fai.offset,
+                line_bases: fai.line_bases,
+                line_bytes: fai.line_bytes,
+            }),
+        };
+
+        let rust_record = match &self.sequence {
+            None => SequenceRecord::Stub(metadata),
+            Some(seq) => SequenceRecord::Full {
+                metadata,
+                sequence: seq.clone(),
+            },
+        };
+
+        // Call the Rust decode method
+        rust_record.decode()
+    }
 }
 
 #[pymethods]
 impl PySeqColDigestLvl1 {
     fn __repr__(&self) -> String {
-        "<SeqColDigestLvl1>".to_string()
+        format!(
+            "SeqColDigestLvl1(sequences='{}', names='{}', lengths='{}')",
+            self.sequences_digest, self.names_digest, self.lengths_digest
+        )
     }
 
     fn __str__(&self) -> String {
@@ -222,8 +380,9 @@
 impl PySequenceCollection {
     fn __repr__(&self) -> String {
         format!(
-            "<SequenceCollection with {} sequences>",
-            self.sequences.len()
+            "SequenceCollection(n_sequences={}, digest='{}')",
+            self.sequences.len(),
+            self.digest
         )
     }
 
@@ -253,6 +412,103 @@
         ))
     }
+
+    /// Write the collection to a FASTA file.
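+    ///
+    /// Note: every record must have sequence data loaded (a Full record, e.g.
+    /// produced by load_fasta()); records without data make this raise IOError.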
+    ///
+    /// Args:
+    ///     file_path (str): Path to the output FASTA file
+    ///     line_width (int, optional): Number of bases per line (default: 70)
+    ///
+    /// Raises:
+    ///     IOError: If any sequence doesn't have data loaded
+    ///
+    /// Example:
+    ///     >>> collection = load_fasta("genome.fa")
+    ///     >>> collection.write_fasta("output.fa")
+    ///     >>> collection.write_fasta("output.fa", line_width=60)
+    fn write_fasta(&self, file_path: &str, line_width: Option<usize>) -> PyResult<()> {
+        // Convert Python sequences back to Rust SequenceCollection
+        let rust_collection = SequenceCollection {
+            sequences: self.sequences.iter().map(|py_rec| {
+                let metadata = SequenceMetadata {
+                    name: py_rec.metadata.name.clone(),
+                    length: py_rec.metadata.length,
+                    sha512t24u: py_rec.metadata.sha512t24u.clone(),
+                    md5: py_rec.metadata.md5.clone(),
+                    alphabet: match &py_rec.metadata.alphabet {
+                        PyAlphabetType::Dna2bit => AlphabetType::Dna2bit,
+                        PyAlphabetType::Dna3bit => AlphabetType::Dna3bit,
+                        PyAlphabetType::DnaIupac => AlphabetType::DnaIupac,
+                        PyAlphabetType::Protein => AlphabetType::Protein,
+                        PyAlphabetType::Ascii => AlphabetType::Ascii,
+                        PyAlphabetType::Unknown => AlphabetType::Unknown,
+                    },
+                    fai: py_rec.metadata.fai.as_ref().map(|f| FaiMetadata {
+                        offset: f.offset,
+                        line_bases: f.line_bases,
+                        line_bytes: f.line_bytes,
+                    }),
+                };
+
+                match &py_rec.sequence {
+                    None => SequenceRecord::Stub(metadata),
+                    Some(seq) => SequenceRecord::Full {
+                        metadata,
+                        sequence: seq.clone(),
+                    },
+                }
+            }).collect(),
+            digest: self.digest.clone(),
+            lvl1: SeqColDigestLvl1 {
+                sequences_digest: self.lvl1.sequences_digest.clone(),
+                names_digest: self.lvl1.names_digest.clone(),
+                lengths_digest: self.lvl1.lengths_digest.clone(),
+            },
+            file_path: None,
+        };
+
+        rust_collection
+            .write_fasta(file_path, line_width)
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!(
+                "Failed to write FASTA: {}",
+                e
+            )))
+    }
+
+    /// Iterate over sequences in the collection.
+    ///
+    /// Allows Pythonic iteration: `for seq in collection:`
+    ///
+    /// Yields:
+    ///     SequenceRecord: Each sequence record in the collection.
+    ///
+    /// Example:
+    ///     >>> collection = digest_fasta("genome.fa")
+    ///     >>> for seq in collection:
+    ///     ...     print(f"{seq.metadata.name}: {seq.metadata.length} bp")
+    fn __iter__(slf: PyRef<'_, Self>) -> PyResult<Py<PySequenceCollectionIterator>> {
+        let py = slf.py();
+        Py::new(py, PySequenceCollectionIterator {
+            iter: slf.sequences.clone().into_iter(),
+        })
+    }
 }
 
+/// Iterator for SequenceCollection - simple wrapper around Vec iterator.
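+/// Created by PySequenceCollection::__iter__; each __next__ call hands one
+/// cloned PySequenceRecord back to Python.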
+#[pyclass]
+pub struct PySequenceCollectionIterator {
+    iter: std::vec::IntoIter<PySequenceRecord>,
+}
+
+#[pymethods]
+impl PySequenceCollectionIterator {
+    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    fn __next__(mut slf: PyRefMut<'_, Self>) -> Option<PySequenceRecord> {
+        slf.iter.next()
+    }
+}
 
 // Conversion from Rust AlphabetType to Python PyAlphabetType
@@ -269,6 +525,28 @@
     }
 }
 
+// Conversion from Rust FaiMetadata to Python PyFaiMetadata
+impl From<FaiMetadata> for PyFaiMetadata {
+    fn from(value: FaiMetadata) -> Self {
+        PyFaiMetadata {
+            offset: value.offset,
+            line_bases: value.line_bases,
+            line_bytes: value.line_bytes,
+        }
+    }
+}
+
+// Conversion from Rust FaiRecord to Python PyFaiRecord
+impl From<FaiRecord> for PyFaiRecord {
+    fn from(value: FaiRecord) -> Self {
+        PyFaiRecord {
+            name: value.name,
+            length: value.length,
+            fai: value.fai.map(PyFaiMetadata::from),
+        }
+    }
+}
+
 // Conversion from Rust SequenceMetadata to Python PySequenceMetadata
 impl From<SequenceMetadata> for PySequenceMetadata {
     fn from(value: SequenceMetadata) -> Self {
@@ -278,6 +556,7 @@
             sha512t24u: value.sha512t24u,
             md5: value.md5,
             alphabet: PyAlphabetType::from(value.alphabet),
+            fai: value.fai.map(PyFaiMetadata::from),
         }
     }
 }
@@ -285,9 +564,15 @@
 // Conversion from Rust SequenceRecord to Python PySequenceRecord
 impl From<SequenceRecord> for PySequenceRecord {
     fn from(value: SequenceRecord) -> Self {
-        PySequenceRecord {
-            metadata: PySequenceMetadata::from(value.metadata),
-            data: value.data,
+        match value {
+            SequenceRecord::Stub(metadata) => PySequenceRecord {
+                metadata: PySequenceMetadata::from(metadata),
+                sequence: None,
+            },
+            SequenceRecord::Full { metadata, sequence } => PySequenceRecord {
+                metadata: PySequenceMetadata::from(metadata),
+                sequence: Some(sequence),
+            },
         }
     }
 }
@@ -315,7 +600,6 @@
             digest: value.digest,
             lvl1: PySeqColDigestLvl1::from(value.lvl1),
             file_path: value.file_path.map(|p| p.to_string_lossy().to_string()),
-            has_data: value.has_data,
         }
     }
 }
@@ -352,20 +636,137 @@ pub struct PyGlobalRefgetStore {
 
 #[pymethods]
 impl PyGlobalRefgetStore {
-    #[new]
-    fn new(mode: PyStorageMode) -> Self {
+    /// Create a disk-backed RefgetStore.
+    ///
+    /// Sequences are written to disk immediately and loaded on-demand (lazy loading).
+    /// Only metadata is kept in memory.
+    ///
+    /// Args:
+    ///     cache_path (str or Path): Directory for storing sequences and metadata
+    ///
+    /// Returns:
+    ///     GlobalRefgetStore: A configured disk-backed store
+    ///
+    /// Example:
+    ///     >>> from gtars.refget import GlobalRefgetStore
+    ///     >>> store = GlobalRefgetStore.on_disk("/data/store")
+    ///     >>> store.add_sequence_collection_from_fasta("genome.fa")
+    #[classmethod]
+    fn on_disk(_cls: &Bound<'_, PyType>, cache_path: &Bound<'_, PyAny>) -> PyResult<Self> {
+        let cache_path = cache_path.to_string();
+        let store = GlobalRefgetStore::on_disk(cache_path).map_err(|e| {
+            PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error with disk-backed store: {}", e))
+        })?;
+        Ok(Self { inner: store })
+    }
+
+    /// Create an in-memory RefgetStore.
+    ///
+    /// All sequences kept in RAM for fast access.
+    /// Defaults to Encoded storage mode (2-bit packing for space efficiency).
+    /// Use set_mode() to change storage mode after creation.
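+    /// (Encoded mode packs DNA at 2 bits per base, roughly a quarter of the raw
+    /// byte size; see AlphabetType::bits_per_symbol and SequenceMetadata::disk_size.)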
+    ///
+    /// Returns:
+    ///     GlobalRefgetStore: A new in-memory store
+    ///
+    /// Example:
+    ///     >>> from gtars.refget import GlobalRefgetStore
+    ///     >>> store = GlobalRefgetStore.in_memory()
+    ///     >>> store.add_sequence_collection_from_fasta("genome.fa")
+    #[classmethod]
+    fn in_memory(_cls: &Bound<'_, PyType>) -> Self {
         Self {
-            inner: GlobalRefgetStore::new(mode.into()),
+            inner: GlobalRefgetStore::in_memory(),
         }
     }
 
-    fn import_fasta(&mut self, file_path: &str) -> PyResult<()> {
-        self.inner.import_fasta(file_path).map_err(|e| {
+    /// Change the storage mode, re-encoding/decoding existing sequences as needed.
+    ///
+    /// When switching from Raw to Encoded:
+    /// - All Full sequences in memory are encoded (2-bit packed)
+    ///
+    /// When switching from Encoded to Raw:
+    /// - All Full sequences in memory are decoded back to raw bytes
+    ///
+    /// Args:
+    ///     mode: The storage mode to switch to (StorageMode.Raw or StorageMode.Encoded)
+    ///
+    /// Example:
+    ///     >>> from gtars.refget import GlobalRefgetStore, StorageMode
+    ///     >>> store = GlobalRefgetStore.in_memory()
+    ///     >>> store.set_mode(StorageMode.Raw)
+    fn set_mode(&mut self, mode: PyStorageMode) {
+        self.inner.set_mode(mode.into());
+    }
+
+    /// Enable 2-bit encoding for space efficiency.
+    /// Re-encodes any existing Raw sequences in memory.
+    ///
+    /// Example:
+    ///     >>> store = GlobalRefgetStore.in_memory()
+    ///     >>> store.disable_encoding()  # Switch to Raw
+    ///     >>> store.enable_encoding()   # Back to Encoded
+    fn enable_encoding(&mut self) {
+        self.inner.enable_encoding();
+    }
+
+    /// Disable encoding, use raw byte storage.
+    /// Decodes any existing Encoded sequences in memory.
+    ///
+    /// Example:
+    ///     >>> store = GlobalRefgetStore.in_memory()
+    ///     >>> store.disable_encoding()  # Switch to Raw mode
+    fn disable_encoding(&mut self) {
+        self.inner.disable_encoding();
+    }
+
+    /// Add a sequence collection from a FASTA file.
+    ///
+    /// Reads a FASTA file, digests the sequences, creates a SequenceCollection,
+    /// and adds it to the store along with all its sequences.
+    ///
+    /// Args:
+    ///     file_path (str or Path): Path to the FASTA file to import.
+    ///     force (bool, optional): If True, overwrite existing collections/sequences.
+    ///         If False (default), skip duplicates.
+    ///
+    /// Raises:
+    ///     IOError: If the file cannot be read or processed.
+    ///
+    /// Example:
+    ///     >>> store = GlobalRefgetStore.in_memory()
+    ///     >>> store.add_sequence_collection_from_fasta("genome.fa")  # skip duplicates
+    ///     >>> store.add_sequence_collection_from_fasta("genome.fa", force=True)  # overwrite
+    #[pyo3(signature = (file_path, force=false))]
+    fn add_sequence_collection_from_fasta(&mut self, file_path: &Bound<'_, PyAny>, force: bool) -> PyResult<()> {
+        let file_path = file_path.to_string();
+        let result = if force {
+            self.inner.add_sequence_collection_from_fasta_force(file_path)
+        } else {
+            self.inner.add_sequence_collection_from_fasta(file_path)
+        };
+        result.map_err(|e| {
             PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error importing FASTA: {}", e))
         })
     }
 
-    fn get_sequence_by_id(&self, digest: &str) -> PyResult<Option<PySequenceRecord>> {
+    /// Retrieve a sequence record by its digest (SHA-512/24u or MD5).
+    ///
+    /// Searches for a sequence by its GA4GH SHA-512/24u digest. If not found
+    /// and the input looks like an MD5 digest (32 hex characters), tries MD5 lookup.
+    ///
+    /// Args:
+    ///     digest: Sequence digest (SHA-512/24u base64url or MD5 hex string).
+    ///
+    /// Returns:
+    ///     Optional[SequenceRecord]: The sequence record if found, None otherwise.
+    ///
+    /// Example:
+    ///     >>> record = store.get_sequence_by_id("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2")
+    ///     >>> if record:
+    ///     ...     print(f"Found: {record.metadata.name}")
+    fn get_sequence_by_id(&mut self, digest: &str) -> PyResult<Option<PySequenceRecord>> {
         // Try as SHA512t24u first (32 bytes)
         let result = self
             .inner
@@ -383,8 +784,26 @@
         Ok(result)
     }
 
+    /// Retrieve a sequence by collection digest and sequence name.
+    ///
+    /// Looks up a sequence within a specific collection using its name
+    /// (e.g., "chr1", "chrM"). This is useful when you know the genome assembly
+    /// (collection) and chromosome name.
+    ///
+    /// Args:
+    ///     collection_digest: The collection's SHA-512/24u digest.
+    ///     sequence_name: Name of the sequence within that collection.
+    ///
+    /// Returns:
+    ///     Optional[SequenceRecord]: The sequence record if found, None otherwise.
+    ///
+    /// Example:
+    ///     >>> record = store.get_sequence_by_collection_and_name(
+    ///     ...     "uC_UorBNf3YUu1YIDainBhI94CedlNeH",
+    ///     ...     "chr1"
+    ///     ... )
     fn get_sequence_by_collection_and_name(
-        &self,
+        &mut self,
         collection_digest: &str,
         sequence_name: &str,
     ) -> Option<PySequenceRecord> {
@@ -393,82 +812,477 @@
             .map(|record| PySequenceRecord::from(record.clone()))
     }
 
-    fn get_substring(&self, seq_digest: &str, start: usize, end: usize) -> Option<String> {
+    /// Extract a substring from a sequence.
+    ///
+    /// Retrieves a specific region from a sequence using 0-based, half-open
+    /// coordinates [start, end). Automatically loads sequence data if not
+    /// already cached (for lazy-loaded stores).
+    ///
+    /// Args:
+    ///     seq_digest: Sequence digest (SHA-512/24u).
+    ///     start: Start position (0-based, inclusive).
+    ///     end: End position (0-based, exclusive).
+    ///
+    /// Returns:
+    ///     Optional[str]: The substring sequence if found, None otherwise.
+    ///
+    /// Example:
+    ///     >>> # Get first 1000 bases of chr1
+    ///     >>> seq = store.get_substring("chr1_digest", 0, 1000)
+    ///     >>> print(f"First 50bp: {seq[:50]}")
+    fn get_substring(&mut self, seq_digest: &str, start: usize, end: usize) -> Option<String> {
         self.inner.get_substring(seq_digest, start, end)
     }
 
-    fn write_store_to_directory(
+    #[getter]
+    fn cache_path(&self) -> Option<String> {
+        self.inner.local_path().map(|p| p.display().to_string())
+    }
+
+    #[getter]
+    fn remote_url(&self) -> Option<String> {
+        self.inner.remote_source().map(|s| s.to_string())
+    }
+
+    #[getter]
+    fn storage_mode(&self) -> PyStorageMode {
+        self.inner.storage_mode().into()
+    }
+
+    /// Returns a list of sequence metadata for all sequences in the store.
+    ///
+    /// This is a lightweight operation that returns only metadata (name, length, digests)
+    /// without loading sequence data. Use `sequence_records()` if you need full records.
+    ///
+    /// Returns:
+    ///     list[SequenceMetadata]: List of sequence metadata objects.
+    ///
+    /// Example:
+    ///     >>> for metadata in store.sequence_metadata():
+    ///     ...     print(f"{metadata.name}: {metadata.length} bp")
+    fn sequence_metadata(&self) -> Vec<PySequenceMetadata> {
+        self.inner
+            .sequence_metadata()
+            .map(|meta| PySequenceMetadata::from(meta.clone()))
+            .collect()
+    }
+
+    /// Returns a list of complete sequence records from the store.
+    ///
+    /// This returns full `SequenceRecord` objects including both metadata and sequence data.
+    /// Use `sequence_metadata()` if you only need metadata (more efficient).
+    ///
+    /// Returns:
+    ///     list[SequenceRecord]: List of sequence record objects.
+    ///
+    /// Example:
+    ///     >>> for record in store.sequence_records():
+    ///     ...     print(f"{record.metadata.name}: {record.decode()}")
+    fn sequence_records(&self) -> Vec<PySequenceRecord> {
+        self.inner
+            .sequence_records()
+            .map(|rec| PySequenceRecord::from(rec.clone()))
+            .collect()
+    }
+
+    fn collections(&self) -> Vec<PySequenceCollection> {
+        self.inner
+            .collections()
+            .map(|col| PySequenceCollection::from(col.clone()))
+            .collect()
+    }
+
+    /// Returns statistics about the store.
+    ///
+    /// Returns:
+    ///     dict: Dictionary with keys 'n_sequences', 'n_collections', 'storage_mode'
+    ///
+    /// Example:
+    ///     >>> stats = store.stats()
+    ///     >>> print(f"Store has {stats['n_sequences']} sequences in {stats['n_collections']} collections")
+    ///     >>> print(f"Storage mode: {stats['storage_mode']}")
+    fn stats(&self) -> std::collections::HashMap<String, String> {
+        let (n_sequences, n_collections, mode_str) = self.inner.stats();
+        let mut stats = std::collections::HashMap::new();
+        stats.insert("n_sequences".to_string(), n_sequences.to_string());
+        stats.insert("n_collections".to_string(), n_collections.to_string());
+        stats.insert("storage_mode".to_string(), mode_str.to_string());
+        stats
+    }
+
+    /// Write the store using its configured paths.
+    ///
+    /// Convenience method for disk-backed stores.
+    /// Uses the store's own local_path and seqdata_path_template.
+    fn write(&self) -> PyResult<()> {
+        self.inner.write().map_err(|e| {
+            PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error writing store: {}", e))
+        })
+    }
+
+    #[pyo3(signature = (root_path, seqdata_path_template=None))]
+    fn write_store_to_dir(
         &self,
-        root_path: &str,
-        seqdata_path_template: &str,
+        root_path: &Bound<'_, PyAny>,
+        seqdata_path_template: Option<&str>,
     ) -> PyResult<()> {
+        let root_path = root_path.to_string();
         self.inner
-            .write_store_to_directory(root_path, seqdata_path_template)
+            .write_store_to_dir(root_path, seqdata_path_template)
            .map_err(|e| {
                 PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error writing store: {}", e))
             })
     }
 
+    /// Load a local RefgetStore from a directory.
+    ///
+    /// Loads metadata from the local store immediately; sequence data is loaded
+    /// on-demand when first accessed. This is efficient for large genomes where
+    /// you may only need specific sequences.
+    ///
+    /// Args:
+    ///     cache_path (str or Path): Local directory containing the refget store (must have
+    ///         index.json and sequences.farg files).
+    ///
+    /// Returns:
+    ///     GlobalRefgetStore: Store with metadata loaded, sequences lazy-loaded.
+    ///
+    /// Raises:
+    ///     IOError: If the store directory or index files cannot be read.
+    ///
+    /// Example:
+    ///     >>> from gtars.refget import GlobalRefgetStore
+    ///     >>> store = GlobalRefgetStore.load_local("/data/hg38_store")
+    ///     >>> seq = store.get_substring("chr1_digest", 0, 1000)
     #[classmethod]
-    fn load_from_directory(_cls: &Bound<'_, PyType>, root_path: &str) -> PyResult<Self> {
-        let store = GlobalRefgetStore::load_from_directory(root_path).map_err(|e| {
-            PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error loading store: {}", e))
+    fn load_local(_cls: &Bound<'_, PyType>, cache_path: &Bound<'_, PyAny>) -> PyResult<Self> {
+        let cache_path = cache_path.to_string();
+        let store = GlobalRefgetStore::load_local(cache_path).map_err(|e| {
+            PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error loading local store: {}", e))
         })?;
         Ok(Self { inner: store })
     }
 
-    fn get_seqs_bed_file(
-        &self,
+    /// Load a remote RefgetStore with local caching.
+    ///
+    /// Fetches metadata (index.json, sequences.farg) from a remote URL immediately.
+    /// Sequence data (.seq files) are downloaded on-demand when first accessed and
+    /// cached locally. This is ideal for working with large remote genomes where
+    /// you only need specific sequences.
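+    /// Remote fetching is backed by the lightweight `ureq` HTTP client that this
+    /// change adds to gtars-refget's Cargo.toml.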
+    ///
+    /// Args:
+    ///     cache_path (str or Path): Local directory to cache downloaded metadata and sequences.
+    ///         Created if it doesn't exist.
+    ///     remote_url (str): Base URL of the remote refget store (e.g.,
+    ///         "https://example.com/hg38" or "s3://bucket/hg38").
+    ///     cache_to_disk (bool, optional): If True (default), cache sequence data to disk.
+    ///         If False, keep only in memory.
+    ///
+    /// Returns:
+    ///     GlobalRefgetStore: Store with metadata loaded, sequences fetched on-demand.
+    ///
+    /// Raises:
+    ///     IOError: If remote metadata cannot be fetched or cache cannot be written.
+    ///
+    /// Example:
+    ///     >>> from gtars.refget import GlobalRefgetStore
+    ///     >>> # With disk caching (default)
+    ///     >>> store = GlobalRefgetStore.load_remote(
+    ///     ...     "/data/cache/hg38",
+    ///     ...     "https://refget-server.com/hg38"
+    ///     ... )
+    ///     >>> # Memory-only mode (no sequence data caching to disk)
+    ///     >>> store = GlobalRefgetStore.load_remote(
+    ///     ...     "/tmp/cache",
+    ///     ...     "https://refget-server.com/hg38",
+    ///     ...     cache_to_disk=False
+    ///     ... )
+    #[classmethod]
+    #[pyo3(signature = (cache_path, remote_url, cache_to_disk=true))]
+    fn load_remote(_cls: &Bound<'_, PyType>, cache_path: &Bound<'_, PyAny>, remote_url: &Bound<'_, PyAny>, cache_to_disk: bool) -> PyResult<Self> {
+        let cache_path = cache_path.to_string();
+        let remote_url = remote_url.to_string();
+        let store = GlobalRefgetStore::load_remote(cache_path, remote_url, cache_to_disk).map_err(|e| {
+            PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error loading remote store: {}", e))
+        })?;
+        Ok(Self { inner: store })
+    }
+
+    /// Export sequences from BED file regions to a FASTA file.
+    ///
+    /// Reads a BED file defining genomic regions and exports the sequences
+    /// for those regions to a FASTA file.
+    ///
+    /// Args:
+    ///     collection_digest: The collection's SHA-512/24u digest.
+    ///     bed_file_path: Path to BED file defining regions.
+    ///     output_file_path: Path to write the output FASTA file.
+    ///
+    /// Raises:
+    ///     IOError: If files cannot be read/written or sequences not found.
+    ///
+    /// Example:
+    ///     >>> store.export_fasta_from_regions(
+    ///     ...     "uC_UorBNf3YUu1YIDainBhI94CedlNeH",
+    ///     ...     "regions.bed",
+    ///     ...     "output.fa"
+    ///     ... )
+    fn export_fasta_from_regions(
+        &mut self,
         collection_digest: &str,
-        bed_file_path: &str,
-        output_file_path: &str,
+        bed_file_path: &Bound<'_, PyAny>,
+        output_file_path: &Bound<'_, PyAny>,
     ) -> PyResult<()> {
-        // Rust function expects K: AsRef<[u8]>, &str works directly
+        let bed_file_path = bed_file_path.to_string();
+        let output_file_path = output_file_path.to_string();
         self.inner
-            .get_seqs_bed_file(collection_digest, bed_file_path, output_file_path)
+            .export_fasta_from_regions(collection_digest, &bed_file_path, &output_file_path)
             .map_err(|e| {
                 PyErr::new::<pyo3::exceptions::PyIOError, _>(format!(
-                    "Error retrieving sequences and writing to file: {}",
+                    "Error exporting FASTA from regions: {}",
                     e
                 ))
             })
     }
 
-    fn get_seqs_bed_file_to_vec(
-        &self,
+    /// Get substrings for BED file regions as a list.
+    ///
+    /// Reads a BED file and returns a list of sequences for each region.
+    /// This is a convenience wrapper around the iterator for cases where
+    /// you want all results in memory at once.
+    ///
+    /// Note: For large BED files, consider using the Rust API's iterator
+    /// directly to avoid loading all results into memory.
+    ///
+    /// Args:
+    ///     collection_digest: The collection's SHA-512/24u digest.
+    ///     bed_file_path: Path to BED file defining regions.
+    ///
+    /// Returns:
+    ///     list[RetrievedSequence]: List of sequences with region metadata.
+    ///
+    /// Raises:
+    ///     IOError: If files cannot be read or sequences not found.
+    ///
+    /// Example:
+    ///     >>> sequences = store.substrings_from_regions(
+    ///     ...     "uC_UorBNf3YUu1YIDainBhI94CedlNeH",
+    ///     ...     "regions.bed"
+    ///     ... )
+    ///     >>> for seq in sequences:
+    ///     ...     print(f"{seq.chrom_name}:{seq.start}-{seq.end}")
+    fn substrings_from_regions(
+        &mut self,
         collection_digest: &str,
-        bed_file_path: &str,
+        bed_file_path: &Bound<'_, PyAny>,
     ) -> PyResult<Vec<PyRetrievedSequence>> {
-        // Corrected: use `let ... = ...?;` to bind the result
-        let rust_results = self
+        let bed_file_path = bed_file_path.to_string();
+        // Get iterator and collect results
+        let iter = self
             .inner
-            .get_seqs_bed_file_to_vec(collection_digest, bed_file_path)
+            .substrings_from_regions(collection_digest, &bed_file_path)
             .map_err(|e| {
                 PyErr::new::<pyo3::exceptions::PyIOError, _>(format!(
-                    "Error retrieving sequences to list: {}",
+                    "Error getting substrings from regions: {}",
                     e
                 ))
             })?;
 
-        // Now `rust_results` is available
-        let py_results: Vec<PyRetrievedSequence> = rust_results
-            .into_iter()
+        // Collect results, filtering out errors
+        let py_results: Vec<PyRetrievedSequence> = iter
+            .filter_map(Result::ok)
             .map(PyRetrievedSequence::from)
             .collect();
 
         Ok(py_results)
     }
 
+    /// Export sequences from a collection to a FASTA file.
+    ///
+    /// This method exports sequences from a specific sequence collection (genome assembly)
+    /// using the collection digest. You can optionally filter which sequences to export
+    /// by providing sequence names.
+    ///
+    /// **Use this method when:** You have a collection digest and want to export sequences
+    /// from that specific collection, optionally filtering by chromosome/sequence names.
+    ///
+    /// **Contrast with export_fasta_by_digests:** That method exports sequences by their
+    /// individual sequence digests (not collection digest) and bypasses collection
+    /// information entirely.
+    ///
+    /// Args:
+    ///     collection_digest: The collection's SHA-512/24u digest (identifies the genome
+    ///         assembly/sequence collection).
+    ///     output_path: Path to write the output FASTA file.
+    ///     sequence_names: Optional list of sequence names to export (e.g., ["chr1", "chr2"]).
+    ///         If None, exports all sequences in the collection.
+    ///     line_width: Number of bases per line in output FASTA (default: 80).
+    ///
+    /// Raises:
+    ///     IOError: If file cannot be written or collection/sequences not found.
+    ///
+    /// Example:
+    ///     >>> # Export all sequences from a collection
+    ///     >>> store.export_fasta("uC_UorBNf3YUu1YIDainBhI94CedlNeH", "output.fa", None, None)
+    ///     >>> # Export only chr1 and chr2
+    ///     >>> store.export_fasta("uC_UorBNf3YUu1YIDainBhI94CedlNeH", "output.fa", ["chr1", "chr2"], None)
+    fn export_fasta(
+        &mut self,
+        collection_digest: &str,
+        output_path: &Bound<'_, PyAny>,
+        sequence_names: Option<Vec<String>>,
+        line_width: Option<usize>,
+    ) -> PyResult<()> {
+        let output_path = output_path.to_string();
+        let sequence_names_refs = sequence_names.as_ref().map(|names| {
+            names.iter().map(|s| s.as_str()).collect::<Vec<&str>>()
+        });
+        self.inner
+            .export_fasta(collection_digest, &output_path, sequence_names_refs, line_width)
+            .map_err(|e| {
+                PyErr::new::<pyo3::exceptions::PyIOError, _>(format!(
+                    "Error exporting FASTA: {}",
+                    e
+                ))
+            })
+    }
+
+    /// Export sequences by their individual sequence digests to a FASTA file.
+    ///
+    /// This method exports sequences directly by their SHA-512/24u sequence digests,
+    /// bypassing collection information.
+    /// Useful when you have specific sequence identifiers and want to retrieve
+    /// them regardless of which collection(s) they belong to.
+    ///
+    /// **Use this method when:** You have a list of sequence digests (SHA-512/24u)
+    /// and want to export those specific sequences from the global sequence store.
+    ///
+    /// **Contrast with export_fasta:** That method uses a collection digest and
+    /// optionally filters by sequence names within that collection.
+    ///
+    /// Args:
+    ///     seq_digests: List of SHA-512/24u sequence digests (not collection digests)
+    ///         to export. Each digest identifies an individual sequence.
+    ///     output_path: Path to write the output FASTA file.
+    ///     line_width: Number of bases per line in output FASTA (default: 80).
+    ///
+    /// Raises:
+    ///     IOError: If file cannot be written or sequences not found.
+    ///
+    /// Example:
+    ///     >>> # Export specific sequences by their digests
+    ///     >>> seq_digests = [
+    ///     ...     "aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2",
+    ///     ...     "bXE123dAxcJAqme6QYQ7EZ07-fiw8Kw2"
+    ///     ... ]
+    ///     >>> store.export_fasta_by_digests(seq_digests, "output.fa", None)
+    fn export_fasta_by_digests(
+        &mut self,
+        seq_digests: Vec<String>,
+        output_path: &Bound<'_, PyAny>,
+        line_width: Option<usize>,
+    ) -> PyResult<()> {
+        let output_path = output_path.to_string();
+        let digests_refs: Vec<&str> = seq_digests.iter().map(|s| s.as_str()).collect();
+        self.inner
+            .export_fasta_by_digests(digests_refs, &output_path, line_width)
+            .map_err(|e| {
+                PyErr::new::<pyo3::exceptions::PyIOError, _>(format!(
+                    "Error exporting FASTA by digests: {}",
+                    e
+                ))
+            })
+    }
+
     fn __str__(&self) -> String {
         format!("{}", self.inner)
     }
 
     fn __repr__(&self) -> String {
+        let (n_sequences, n_collections, mode_str) = self.inner.stats();
+
+        let location = if let Some(remote) = self.inner.remote_source() {
+            let cache = self.inner.local_path()
+                .map(|p| p.display().to_string())
+                .unwrap_or_else(|| "None".to_string());
+            format!("cache='{}', remote='{}'", cache, remote)
+        } else if let Some(path) = self.inner.local_path() {
+            format!("cache='{}'", path.display())
+        } else {
+            "memory-only".to_string()
+        };
+
         format!(
-            "<GlobalRefgetStore with {} sequences>",
-            self.inner.list_sequence_digests().len()
+            "GlobalRefgetStore(n_sequences={}, n_collections={}, mode={}, {})",
+            n_sequences, n_collections, mode_str, location
         )
     }
+
+    /// Return the number of sequences in the store.
+    ///
+    /// Allows using `len(store)` in Python to get the total number of sequences
+    /// across all collections.
+    ///
+    /// Returns:
+    ///     int: Total number of sequences in the store.
+    ///
+    /// Example:
+    ///     >>> store = GlobalRefgetStore.in_memory()
+    ///     >>> store.add_sequence_collection_from_fasta("genome.fa")
+    ///     >>> print(f"Store contains {len(store)} sequences")
+    fn __len__(&self) -> usize {
+        self.inner.sequence_digests().count()
+    }
+
+    /// Iterate over all sequences in the store.
+    ///
+    /// Allows using `for sequence in store:` in Python to iterate over all
+    /// sequence metadata in the store.
+    ///
+    /// Yields:
+    ///     SequenceMetadata: Metadata for each sequence in the store.
+    ///
+    /// Example:
+    ///     >>> store = GlobalRefgetStore.in_memory()
+    ///     >>> store.add_sequence_collection_from_fasta("genome.fa")
+    ///     >>> for seq_meta in store:
+    ///     ...     print(f"{seq_meta.name}: {seq_meta.length} bp")
+    fn __iter__(slf: PyRef<'_, Self>) -> PyResult<PyRefgetStoreIterator> {
+        let sequences = slf.inner.sequence_metadata()
+            .map(|meta| PySequenceMetadata::from(meta.clone()))
+            .collect();
+        Ok(PyRefgetStoreIterator {
+            sequences,
+            index: 0,
+        })
+    }
 }
 
+/// Iterator for GlobalRefgetStore that yields SequenceMetadata.
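+/// Items are snapshotted into a Vec when iteration starts, so sequences added
+/// to the store afterwards are not seen by an existing iterator.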
+#[pyclass]
+pub struct PyRefgetStoreIterator {
+    sequences: Vec<PySequenceMetadata>,
+    index: usize,
+}
+
+#[pymethods]
+impl PyRefgetStoreIterator {
+    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    fn __next__(mut slf: PyRefMut<'_, Self>) -> Option<PySequenceMetadata> {
+        if slf.index < slf.sequences.len() {
+            let item = slf.sequences[slf.index].clone();
+            slf.index += 1;
+            Some(item)
+        } else {
+            None
+        }
+    }
+}
 
 // This represents the Python module to be created
@@ -477,8 +1291,12 @@ pub fn refget(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(sha512t24u_digest, m)?)?;
     m.add_function(wrap_pyfunction!(md5_digest, m)?)?;
     m.add_function(wrap_pyfunction!(digest_fasta, m)?)?;
+    m.add_function(wrap_pyfunction!(compute_fai, m)?)?;
+    m.add_function(wrap_pyfunction!(load_fasta, m)?)?;
     m.add_class::<PyStorageMode>()?;
     m.add_class::<PyAlphabetType>()?;
+    m.add_class::<PyFaiMetadata>()?;
+    m.add_class::<PyFaiRecord>()?;
     m.add_class::<PySequenceMetadata>()?;
     m.add_class::<PySequenceRecord>()?;
     m.add_class::<PySequenceCollection>()?;
diff --git a/gtars-python/tests/test_refget.py b/gtars-python/tests/test_refget.py
index 2e26e54d..aa0b3cf7 100644
--- a/gtars-python/tests/test_refget.py
+++ b/gtars-python/tests/test_refget.py
@@ -3,6 +3,7 @@
     GlobalRefgetStore,
     StorageMode,
     digest_fasta,
+    load_fasta,
     sha512t24u_digest,
     md5_digest,
     RetrievedSequence,
@@ -25,7 +26,6 @@ def test_digest_fasta_basic(self):
         assert hasattr(result, "digest")
         assert hasattr(result, "lvl1")
         assert hasattr(result, "file_path")
-        assert hasattr(result, "has_data")
 
         # Test that sequences is a list
         assert isinstance(result.sequences, list)
@@ -79,9 +79,11 @@ def test_global_refget_store_basic(self):
         store_encoded = GlobalRefgetStore(StorageMode.Encoded)
 
         # Test string representations
-        # print(store_raw.__repr__)
-        assert repr(store_raw).startswith("<GlobalRefgetStore")
+        assert repr(store_raw).startswith("GlobalRefgetStore(")
+        assert repr(store_encoded).startswith("GlobalRefgetStore(")
@@ -161,7 +163,7 @@ def test_store_collection_operations(self):
         fasta_path = "../../gtars/tests/data/fasta/base.fa"
 
         # Import sequences and get sequence by collection and name
-        store.import_fasta(fasta_path)
+        store.add_sequence_collection_from_fasta(fasta_path)
 
         # Get sequence from default collection
         result = digest_fasta(fasta_path)
@@ -175,8 +177,8 @@
 
     def test_get_seqs_from_bed_file_bindings(self):
         """
-        Test both get_seqs_bed_file (writes to file)
-        and get_seqs_bed_file_to_vec (returns list) bindings.
+        Test both export_fasta_from_regions (writes to file)
+        and substrings_from_regions (returns list) bindings.
""" with tempfile.TemporaryDirectory() as temp_dir_str: @@ -189,7 +191,7 @@ def test_get_seqs_from_bed_file_bindings(self): store = GlobalRefgetStore(StorageMode.Encoded) - imported_collection = store.import_fasta(temp_fasta_path) + imported_collection = store.add_sequence_collection_from_fasta(temp_fasta_path) result = digest_fasta(temp_fasta_path) collection_digest = result.digest @@ -212,7 +214,7 @@ def test_get_seqs_from_bed_file_bindings(self): f.write(bed_content) temp_output_fa_path = os.path.join(temp_dir, "output.fa") - store.get_seqs_bed_file( + store.export_fasta_from_regions( collection_digest, temp_bed_path, temp_output_fa_path ) @@ -228,9 +230,9 @@ def test_get_seqs_from_bed_file_bindings(self): assert ( output_fa_content.strip() == expected_fa_content.strip() ), "Output FASTA file content mismatch" - print("✓ get_seqs_bed_file binding test passed.") + print("✓ export_fasta_from_regions binding test passed.") - vec_result = store.get_seqs_bed_file_to_vec( + vec_result = store.substrings_from_regions( collection_digest, temp_bed_path ) @@ -249,4 +251,103 @@ def test_get_seqs_from_bed_file_bindings(self): assert vec_result[i].start == expected_vec[i].start assert vec_result[i].end == expected_vec[i].end - print("✓ get_seqs_bed_file_to_vec binding test passed.") + print("✓ substrings_from_regions binding test passed.") + + def test_decode_with_no_data(self): + """Test that decode() returns None when sequence data is not loaded""" + fasta_path = "../../gtars/tests/data/fasta/base.fa" + + # digest_fasta should not load sequence data + result = digest_fasta(fasta_path) + + for seq_record in result.sequences: + assert seq_record.data is None, "digest_fasta should not load sequence data" + assert seq_record.decode() is None, "decode() should return None when data is None" + + def test_decode_with_loaded_data(self): + """Test that decode() returns correct sequences when data is loaded""" + fasta_path = "../../gtars/tests/data/fasta/base.fa" + + # load_fasta should load sequence data + result = load_fasta(fasta_path) + + # Expected sequences from base.fa + expected_sequences = [ + ("chrX", "TTGGGGAA"), + ("chr1", "GGAA"), + ("chr2", "GCGC"), + ] + + assert len(result.sequences) == len(expected_sequences) + + for seq_record, (expected_name, expected_seq) in zip(result.sequences, expected_sequences): + assert seq_record.metadata.name == expected_name + assert seq_record.data is not None, "load_fasta should load sequence data" + + decoded = seq_record.decode() + assert decoded is not None, "decode() should return Some when data is present" + assert decoded == expected_seq, f"Decoded sequence for {expected_name} should match expected" + + def test_decode_with_store_sequences(self): + """Test decode() with sequences retrieved from a store""" + store = GlobalRefgetStore(StorageMode.Encoded) + fasta_path = "../../gtars/tests/data/fasta/base.fa" + store.add_sequence_collection_from_fasta(fasta_path) + + # Get sequence by ID + sha512 = "iYtREV555dUFKg2_agSJW6suquUyPpMw" + seq = store.get_sequence_by_id(sha512) + + assert seq is not None + assert seq.data is not None, "Store should have loaded sequence data" + + decoded = seq.decode() + assert decoded is not None + assert decoded == "TTGGGGAA", "Should correctly decode sequence from encoded store" + + def test_decode_raw_vs_encoded_storage(self): + """Test that decode() works with both Raw and Encoded storage modes""" + fasta_path = "../../gtars/tests/data/fasta/base.fa" + + # Test with Raw storage mode + store_raw = 
GlobalRefgetStore(StorageMode.Raw) + store_raw.add_sequence_collection_from_fasta(fasta_path) + sha512 = "iYtREV555dUFKg2_agSJW6suquUyPpMw" + seq_raw = store_raw.get_sequence_by_id(sha512) + decoded_raw = seq_raw.decode() + + # Test with Encoded storage mode + store_encoded = GlobalRefgetStore(StorageMode.Encoded) + store_encoded.add_sequence_collection_from_fasta(fasta_path) + seq_encoded = store_encoded.get_sequence_by_id(sha512) + decoded_encoded = seq_encoded.decode() + + # Both should produce the same decoded sequence + assert decoded_raw == decoded_encoded + assert decoded_raw == "TTGGGGAA" + + def test_load_fasta_function(self): + """Test the new load_fasta() function""" + fasta_path = "../../gtars/tests/data/fasta/base.fa" + + # Test that load_fasta returns a SequenceCollection with data + result = load_fasta(fasta_path) + + # Should have same structure as digest_fasta + assert hasattr(result, "sequences") + assert hasattr(result, "digest") + assert hasattr(result, "lvl1") + assert hasattr(result, "file_path") + + # But sequences should have data loaded + assert len(result.sequences) == 3 + for seq_record in result.sequences: + assert seq_record.data is not None, "load_fasta should load sequence data" + assert seq_record.decode() is not None, "Should be able to decode loaded data" + + # Verify digests match digest_fasta + digest_result = digest_fasta(fasta_path) + assert result.digest == digest_result.digest + assert result.lvl1.sequences_digest == digest_result.lvl1.sequences_digest + assert result.lvl1.names_digest == digest_result.lvl1.names_digest + assert result.lvl1.lengths_digest == digest_result.lvl1.lengths_digest diff --git a/gtars-r/src/rust/src/refget.rs b/gtars-r/src/rust/src/refget.rs index e598510a..07c56888 100644 --- a/gtars-r/src/rust/src/refget.rs +++ b/gtars-r/src/rust/src/refget.rs @@ -287,8 +287,7 @@ fn sequence_collection_to_list(collection: SequenceCollection) -> List { file_path = collection .file_path .map(|p| p.to_string_lossy().to_string()) - .unwrap_or_default(), - has_data = collection.has_data + .unwrap_or_default() ) } diff --git a/gtars-refget/Cargo.toml b/gtars-refget/Cargo.toml index 9c687039..f1e75aa6 100644 --- a/gtars-refget/Cargo.toml +++ b/gtars-refget/Cargo.toml @@ -13,6 +13,7 @@ md-5 = "0.10.5" base64-url = "2.0.0" memmap2 = "0.5" # for memory-mapped file I/O chrono = { version = "0.4", features = ["serde"] } # For timestamps +ureq = "2.10" # Lightweight HTTP for remote fetching # shared workspace dependencies anyhow = { workspace = true } diff --git a/gtars-refget/src/alphabet.rs b/gtars-refget/src/alphabet.rs index f6c66ed9..9558473e 100644 --- a/gtars-refget/src/alphabet.rs +++ b/gtars-refget/src/alphabet.rs @@ -101,6 +101,20 @@ impl FromStr for AlphabetType { } } +impl AlphabetType { + /// Returns the number of bits used per symbol for this alphabet type + pub fn bits_per_symbol(&self) -> usize { + match self { + AlphabetType::Dna2bit => 2, + AlphabetType::Dna3bit => 3, + AlphabetType::DnaIupac => 4, + AlphabetType::Protein => 5, + AlphabetType::Ascii => 8, + AlphabetType::Unknown => 8, // Default to 8 bits for unknown + } + } +} + /// A lookup table that maps ASCII characters representing DNA bases /// (T, C, A, G, and their lowercase counterparts) to their 2-bit encoding. 
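+/// (Each entry is two bits wide, matching `bits_per_symbol()` for
+/// `AlphabetType::Dna2bit` defined above.)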
/// This constant is used for efficient conversion of DNA sequences into a
diff --git a/gtars-refget/src/collection.rs b/gtars-refget/src/collection.rs
index 94d5a3f0..63d57910 100644
--- a/gtars-refget/src/collection.rs
+++ b/gtars-refget/src/collection.rs
@@ -26,10 +26,6 @@ pub struct SequenceCollection {
     /// Optional path to the source file from which this collection was loaded.
     /// Used for caching and reference purposes.
     pub file_path: Option<PathBuf>,
-
-    /// Flag indicating whether sequence data is actually loaded in memory.
-    /// When false, only metadata is available.
-    pub has_data: bool,
 }
 
 /// A struct representing the first level of digests for a refget sequence collection.
@@ -105,11 +101,20 @@ impl SeqColDigestLvl1 {
 }
 
 /// A representation of a single sequence that includes metadata and optionally data.
-/// Combines sequence metadata with optional raw/encoded data
+/// Combines sequence metadata with optional raw/encoded data.
+///
+/// This enum has two variants:
+/// - `Stub`: Contains only metadata, no sequence data loaded
+/// - `Full`: Contains both metadata and the actual sequence data
 #[derive(Clone, Debug)]
-pub struct SequenceRecord {
-    pub metadata: SequenceMetadata,
-    pub data: Option<Vec<u8>>,
+pub enum SequenceRecord {
+    /// A sequence record with only metadata, no sequence data
+    Stub(SequenceMetadata),
+    /// A sequence record with both metadata and sequence data
+    Full {
+        metadata: SequenceMetadata,
+        sequence: Vec<u8>,
+    },
 }
 
 use std::fs::{self, File};
@@ -121,22 +126,157 @@ pub struct SequenceMetadata {
     pub sha512t24u: String,
     pub md5: String,
     pub alphabet: AlphabetType,
+    pub fai: Option<FaiMetadata>,
+}
+
+/// FASTA index (FAI) metadata for a sequence.
+/// This data is only present when a sequence was loaded from a FASTA file.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct FaiMetadata {
+    pub offset: u64,     // byte offset to first base of sequence data
+    pub line_bases: u32, // number of bases per line
+    pub line_bytes: u32, // number of bytes per line (including newline chars)
+}
+
+impl SequenceMetadata {
+    /// Calculate the disk size in bytes for this sequence
+    ///
+    /// # Arguments
+    /// * `mode` - The storage mode (Raw or Encoded)
+    ///
+    /// # Returns
+    /// The number of bytes this sequence occupies on disk
+    ///
+    /// # Examples
+    /// ```ignore
+    /// // For a 1000bp DNA sequence in Encoded mode with Dna2bit alphabet:
+    /// // disk_size = (1000 * 2 bits).div_ceil(8) = 250 bytes
+    /// let size = metadata.disk_size(&StorageMode::Encoded);
+    /// ```
+    pub fn disk_size(&self, mode: &crate::store::StorageMode) -> usize {
+        match mode {
+            crate::store::StorageMode::Raw => self.length,
+            crate::store::StorageMode::Encoded => {
+                let bits_per_symbol = self.alphabet.bits_per_symbol();
+                let total_bits = self.length * bits_per_symbol;
+                total_bits.div_ceil(8)
+            }
+        }
+    }
 }
 
 impl SequenceRecord {
+    /// Get metadata regardless of variant
+    pub fn metadata(&self) -> &SequenceMetadata {
+        match self {
+            SequenceRecord::Stub(meta) => meta,
+            SequenceRecord::Full { metadata, .. } => metadata,
+        }
+    }
+
+    /// Get sequence data if present
+    pub fn sequence(&self) -> Option<&[u8]> {
+        match self {
+            SequenceRecord::Stub(_) => None,
+            SequenceRecord::Full { sequence, .. } => Some(sequence),
+        }
+    }
+
+    /// Check if data is loaded
+    pub fn has_data(&self) -> bool {
+        matches!(self, SequenceRecord::Full { .. })
+    }
+
+    /// Load data into a Stub record, or replace data in a Full record.
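+    ///
+    /// # Example
+    ///
+    /// A minimal sketch (the starting record here is hypothetical):
+    ///
+    /// ```rust,ignore
+    /// # use gtars_refget::collection::SequenceRecord;
+    /// # let stub: SequenceRecord = todo!();
+    /// // Upgrading a metadata-only Stub into a Full record:
+    /// let full = stub.with_data(b"ACGT".to_vec());
+    /// assert!(full.has_data());
+    /// assert_eq!(full.sequence(), Some(&b"ACGT"[..]));
+    /// ```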
+    pub fn with_data(self, sequence: Vec<u8>) -> Self {
+        let metadata = match self {
+            SequenceRecord::Stub(m) => m,
+            SequenceRecord::Full { metadata, .. } => metadata,
+        };
+        SequenceRecord::Full { metadata, sequence }
+    }
+
     /// Utility function to write a single sequence to a file
     pub fn to_file<P: AsRef<Path>>(&self, path: P) -> anyhow::Result<()> {
+        let data = match self {
+            SequenceRecord::Stub(_) => {
+                return Err(anyhow::anyhow!("Cannot write file: sequence data not loaded"));
+            }
+            SequenceRecord::Full { sequence, .. } => sequence,
+        };
+
         // Create parent directories if they don't exist
         if let Some(parent) = path.as_ref().parent() {
             fs::create_dir_all(parent)?;
         }
 
         let mut file = File::create(path)?;
-        if let Some(data) = &self.data {
-            file.write_all(data)?;
-        }
+        file.write_all(data)?;
         Ok(())
     }
+
+    /// Decodes the sequence data to a string.
+    ///
+    /// This method attempts to decode the sequence data stored in this record.
+    /// It handles both raw (uncompressed UTF-8) and encoded (bit-packed) data.
+    /// The decoding strategy depends on the alphabet type:
+    /// - For the ASCII alphabet: data is always stored raw, just convert to string
+    /// - For other alphabets: try raw UTF-8 first (when the byte length matches
+    ///   the sequence length), then fall back to bit-packed decoding
+    ///
+    /// # Returns
+    ///
+    /// * `Some(String)` - The decoded sequence if data is loaded
+    /// * `None` - If no data is loaded in this record
+    ///
+    /// # Example
+    ///
+    /// ```rust,no_run
+    /// # use gtars_refget::collection::SequenceRecord;
+    /// # let record: SequenceRecord = todo!();
+    /// let sequence = record.decode();
+    /// if let Some(seq) = sequence {
+    ///     println!("Sequence: {}", seq);
+    /// }
+    /// ```
+    pub fn decode(&self) -> Option<String> {
+        use crate::alphabet::lookup_alphabet;
+        use crate::encoder::decode_substring_from_bytes;
+
+        let (metadata, data) = match self {
+            SequenceRecord::Stub(_) => return None,
+            SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
+        };
+
+        // For ASCII alphabet (8 bits per symbol), the data is always stored raw
+        if metadata.alphabet == crate::alphabet::AlphabetType::Ascii {
+            return String::from_utf8(data.clone()).ok();
+        }
+
+        // Try to detect if data is raw or encoded
+        // Heuristic: for encoded data, the size should be approximately length * bits_per_symbol / 8
+        // For raw data, the size should be approximately equal to length
+        let alphabet = lookup_alphabet(&metadata.alphabet);
+
+        // If data size matches the expected length (not the encoded size), it's probably raw
+        if data.len() == metadata.length {
+            // Try to decode as UTF-8
+            if let Ok(raw_string) = String::from_utf8(data.clone()) {
+                // Data appears to be raw UTF-8
+                return Some(raw_string);
+            }
+        }
+
+        // Data is probably encoded (size matches expected encoded size), try to decode it
+        let decoded_bytes = decode_substring_from_bytes(
+            data,
+            0,
+            metadata.length,
+            alphabet
+        );
+
+        // Convert to string
+        String::from_utf8(decoded_bytes).ok()
+    }
 }
 
 impl SequenceCollection {
@@ -164,7 +304,7 @@ impl SequenceCollection {
     /// Create a SequenceCollection from a vector of SequenceRecords.
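+    ///
+    /// Level-1 and collection digests are recomputed from the records' metadata.
+    /// A minimal sketch, reusing records produced by `load_fasta`:
+    ///
+    /// ```rust,ignore
+    /// use gtars_refget::collection::SequenceCollection;
+    /// use gtars_refget::fasta::load_fasta;
+    ///
+    /// let records = load_fasta("../tests/data/fasta/base.fa")?.sequences;
+    /// let collection = SequenceCollection::from_records(records);
+    /// println!("collection digest: {}", collection.digest);
+    /// ```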
     pub fn from_records(records: Vec<SequenceRecord>) -> Self {
         // Compute lvl1 digests from the metadata
-        let metadata_refs: Vec<&SequenceMetadata> = records.iter().map(|r| &r.metadata).collect();
+        let metadata_refs: Vec<&SequenceMetadata> = records.iter().map(|r| r.metadata()).collect();
         let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs);
 
         // Compute collection digest from lvl1 digests
@@ -175,7 +315,6 @@ impl SequenceCollection {
             digest: collection_digest,
             lvl1,
             file_path: None,
-            has_data: true,
         }
     }
 
@@ -214,7 +353,7 @@ impl SequenceCollection {
 
         // Write the SequenceCollection to the FARG file
         if write_cache && !farg_file_path.exists() {
-            seqcol.to_farg()?;
+            seqcol.write_farg()?;
             println!("Farg file written to {:?}", farg_file_path);
         } else {
             println!(
@@ -225,24 +364,37 @@ impl SequenceCollection {
         Ok(seqcol)
     }
 
-    /// Write the SequenceCollection to a FASTA refget (FARG) file
-    /// * `file_path` - The path to the FARG file to be written.
-    pub fn to_farg_path<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
-        // Write the FARGI file
+    /// Write the SequenceCollection to a collection FARG file.
+    ///
+    /// Creates a FARG file with collection-level digest headers followed by
+    /// sequence metadata for all sequences in this collection.
+    ///
+    /// # Arguments
+    /// * `file_path` - The path to the FARG file to be written
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    ///
+    /// # Format
+    /// The file includes:
+    /// - Collection digest headers (##seqcol_digest, ##names_digest, etc.)
+    /// - Column header (#name, length, alphabet, sha512t24u, md5)
+    /// - One line per sequence with metadata
+    pub fn write_collection_farg<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
         let file_path = file_path.as_ref();
-        println!("Writing farg file: {:?}", file_path);
+        println!("Writing collection farg file: {:?}", file_path);
         let mut file = std::fs::File::create(file_path)?;
 
-        // Write header with digest metadata
+        // Write collection digest headers
         writeln!(file, "##seqcol_digest={}", self.digest)?;
         writeln!(file, "##names_digest={}", self.lvl1.names_digest)?;
         writeln!(file, "##sequences_digest={}", self.lvl1.sequences_digest)?;
         writeln!(file, "##lengths_digest={}", self.lvl1.lengths_digest)?;
         writeln!(file, "#name\tlength\talphabet\tsha512t24u\tmd5")?;
 
-        // Write sequence data
+        // Write sequence metadata
         for result_sr in &self.sequences {
-            let result = result_sr.metadata.clone();
+            let result = result_sr.metadata().clone();
             writeln!(
                 file,
                 "{}\t{}\t{}\t{}\t{}",
@@ -252,17 +404,65 @@ impl SequenceCollection {
         Ok(())
     }
 
-    /// Write the SeqColDigest to a FARG file, using the file path stored in the struct.
-    pub fn to_farg(&self) -> Result<()> {
+    /// Write the SequenceCollection to a FARG file, using the file path stored in the struct.
+    pub fn write_farg(&self) -> Result<()> {
         if let Some(ref file_path) = self.file_path {
             let farg_file_path = file_path.replace_exts_with("farg");
-            self.to_farg_path(farg_file_path)
+            self.write_collection_farg(farg_file_path)
         } else {
             Err(anyhow::anyhow!(
-                "No file path specified for FARG output. Use `to_farg_path` to specify a file path."
+                "No file path specified for FARG output. Use `write_collection_farg` to specify a file path."
             ))
         }
     }
+
+    /// Write the SequenceCollection to a FASTA file.
+    ///
+    /// Writes all sequences in this collection to a FASTA file with the specified line width.
+    /// Every sequence must have its data loaded (`Full` variant); otherwise an error is returned.
+    ///
+    /// # Arguments
+    /// * `file_path` - The path to the output FASTA file
+    /// * `line_width` - Number of bases per line (default: 70)
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    ///
+    /// # Errors
+    /// Returns an error if any sequence doesn't have data loaded (Stub variant)
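+    ///
+    /// # Example
+    ///
+    /// A round-trip sketch (the output path is illustrative):
+    ///
+    /// ```rust,ignore
+    /// use gtars_refget::fasta::load_fasta;
+    ///
+    /// // load_fasta produces Full records, so they can be written back out.
+    /// let seqcol = load_fasta("../tests/data/fasta/base.fa")?;
+    /// seqcol.write_fasta("roundtrip.fa", Some(60))?;
+    /// ```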
+    pub fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()> {
+        let line_width = line_width.unwrap_or(70);
+        let mut output_file = File::create(file_path)?;
+
+        for record in &self.sequences {
+            // Check if record has data
+            if !record.has_data() {
+                return Err(anyhow::anyhow!(
+                    "Cannot write FASTA: sequence '{}' does not have data loaded",
+                    record.metadata().name
+                ));
+            }
+
+            // Decode the sequence
+            let decoded_sequence = record.decode().ok_or_else(|| {
+                anyhow::anyhow!(
+                    "Failed to decode sequence '{}'",
+                    record.metadata().name
+                )
+            })?;
+
+            // Write FASTA header
+            writeln!(output_file, ">{}", record.metadata().name)?;
+
+            // Write sequence with line wrapping
+            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
+                output_file.write_all(chunk)?;
+                output_file.write_all(b"\n")?;
+            }
+        }
+
+        Ok(())
+    }
 }
 
 impl Display for SequenceCollection {
@@ -286,12 +486,265 @@ impl Display for SequenceRecord {
         write!(
             f,
             "SequenceRecord: {} (length: {}, alphabet: {}, ga4gh: {:02x?}, md5: {:02x?})",
-            &self.metadata.name,
-            &self.metadata.length,
-            &self.metadata.alphabet,
-            &self.metadata.sha512t24u,
-            &self.metadata.md5
+            &self.metadata().name,
+            &self.metadata().length,
+            &self.metadata().alphabet,
+            &self.metadata().sha512t24u,
+            &self.metadata().md5
         )?;
         Ok(())
     }
 }
+
+// Iterator implementations for SequenceCollection
+// Allows: for seq in &collection { ... }
+impl<'a> IntoIterator for &'a SequenceCollection {
+    type Item = &'a SequenceRecord;
+    type IntoIter = std::slice::Iter<'a, SequenceRecord>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.sequences.iter()
+    }
+}
+
+// Consuming iterator
+// Allows: for seq in collection { ... } (consumes the collection)
+impl IntoIterator for SequenceCollection {
+    type Item = SequenceRecord;
+    type IntoIter = std::vec::IntoIter<SequenceRecord>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.sequences.into_iter()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::fasta::{digest_fasta, load_fasta};
+    use crate::encoder::encode_sequence;
+
+    #[test]
+    fn test_decode_returns_none_when_no_data() {
+        // Test that decode() returns None when data is None
+        let seqcol = digest_fasta("../tests/data/fasta/base.fa")
+            .expect("Failed to load test FASTA file");
+
+        for seq_record in &seqcol.sequences {
+            assert!(!seq_record.has_data(), "digest_fasta should not load sequence data");
+            assert!(matches!(seq_record, SequenceRecord::Stub(_)), "digest_fasta should create Stub records");
+            assert_eq!(seq_record.decode(), None, "decode() should return None for Stub records");
+        }
+    }
+
+    #[test]
+    fn test_decode_with_loaded_data() {
+        // Test that decode() returns the correct sequence when data is loaded
+        let seqcol = load_fasta("../tests/data/fasta/base.fa")
+            .expect("Failed to load test FASTA file");
+
+        // Expected sequences from base.fa
+        let expected_sequences = vec![
+            ("chrX", "TTGGGGAA"),
+            ("chr1", "GGAA"),
+            ("chr2", "GCGC"),
+        ];
+
+        for (seq_record, (expected_name, expected_seq)) in seqcol.sequences.iter().zip(expected_sequences.iter()) {
+            assert_eq!(seq_record.metadata().name, *expected_name);
+            assert!(seq_record.has_data(), "load_fasta should load sequence data");
+            assert!(matches!(seq_record, SequenceRecord::Full { ..
}), "load_fasta should create Full records"); + + let decoded = seq_record.decode().expect("decode() should return Some when data is present"); + assert_eq!(decoded, *expected_seq, + "Decoded sequence for {} should match expected sequence", expected_name); + } + } + + #[test] + fn test_decode_handles_encoded_data() { + // Test that decode() correctly handles bit-packed encoded data + use crate::alphabet::{lookup_alphabet, AlphabetType}; + + let sequence = b"ACGT"; + let alphabet = lookup_alphabet(&AlphabetType::Dna2bit); + let encoded_data = encode_sequence(sequence, alphabet); + + let record = SequenceRecord::Full { + metadata: SequenceMetadata { + name: "test_seq".to_string(), + length: 4, + sha512t24u: "test_digest".to_string(), + md5: "test_md5".to_string(), + alphabet: AlphabetType::Dna2bit, + fai: None, + }, + sequence: encoded_data, + }; + + let decoded = record.decode().expect("Should decode encoded data"); + assert_eq!(decoded, "ACGT", "Should correctly decode bit-packed DNA sequence"); + } + + #[test] + fn test_decode_handles_raw_utf8_data() { + // Test that decode() handles raw UTF-8 data (not encoded) + let raw_sequence = b"ACGTACGT".to_vec(); + + let record = SequenceRecord::Full { + metadata: SequenceMetadata { + name: "test_seq".to_string(), + length: 8, + sha512t24u: "test_digest".to_string(), + md5: "test_md5".to_string(), + alphabet: AlphabetType::Ascii, + fai: None, + }, + sequence: raw_sequence, + }; + + let decoded = record.decode().expect("Should decode raw UTF-8 data"); + assert_eq!(decoded, "ACGTACGT", "Should correctly decode raw sequence data"); + } + + #[test] + fn test_decode_with_iupac_alphabet() { + // Test decode with IUPAC DNA alphabet (4-bit encoding) + use crate::alphabet::{lookup_alphabet, AlphabetType}; + + let sequence = b"ACGTRYMK"; + let alphabet = lookup_alphabet(&AlphabetType::DnaIupac); + let encoded_data = encode_sequence(sequence, alphabet); + + let record = SequenceRecord::Full { + metadata: SequenceMetadata { + name: "iupac_test".to_string(), + length: 8, + sha512t24u: "test_digest".to_string(), + md5: "test_md5".to_string(), + alphabet: AlphabetType::DnaIupac, + fai: None, + }, + sequence: encoded_data, + }; + + let decoded = record.decode().expect("Should decode IUPAC encoded data"); + assert_eq!(decoded, "ACGTRYMK", "Should correctly decode IUPAC DNA sequence"); + } + + #[test] + fn test_decode_with_protein_alphabet() { + // Test decode with protein alphabet (5-bit encoding) + use crate::alphabet::{lookup_alphabet, AlphabetType}; + + let sequence = b"ACDEFGHIKLMNPQRSTVWY"; + let alphabet = lookup_alphabet(&AlphabetType::Protein); + let encoded_data = encode_sequence(sequence, alphabet); + + let record = SequenceRecord::Full { + metadata: SequenceMetadata { + name: "protein_test".to_string(), + length: 20, + sha512t24u: "test_digest".to_string(), + md5: "test_md5".to_string(), + alphabet: AlphabetType::Protein, + fai: None, + }, + sequence: encoded_data, + }; + + let decoded = record.decode().expect("Should decode protein encoded data"); + assert_eq!(decoded, "ACDEFGHIKLMNPQRSTVWY", "Should correctly decode protein sequence"); + } + + #[test] + fn test_decode_empty_sequence() { + // Test decode with empty data + let record = SequenceRecord::Full { + metadata: SequenceMetadata { + name: "empty_seq".to_string(), + length: 0, + sha512t24u: "test_digest".to_string(), + md5: "test_md5".to_string(), + alphabet: AlphabetType::Dna2bit, + fai: None, + }, + sequence: Vec::new(), + }; + + let decoded = record.decode().expect("Should handle empty 
sequence"); + assert_eq!(decoded, "", "Empty sequence should decode to empty string"); + } + + #[test] + fn test_decode_detects_raw_vs_encoded() { + // Test that decode() correctly distinguishes between raw and encoded data + use crate::alphabet::{lookup_alphabet, AlphabetType}; + + // Create raw UTF-8 data that matches the sequence length (heuristic for raw data) + let raw_sequence = b"GGGGGGGG"; // 8 bytes for 8-symbol sequence + let record_raw = SequenceRecord::Full { + metadata: SequenceMetadata { + name: "raw_test".to_string(), + length: 8, + sha512t24u: "test_digest".to_string(), + md5: "test_md5".to_string(), + alphabet: AlphabetType::Dna2bit, + fai: None, + }, + sequence: raw_sequence.to_vec(), + }; + + let decoded_raw = record_raw.decode().expect("Should decode raw data"); + assert_eq!(decoded_raw, "GGGGGGGG"); + + // Create encoded data (2 bits per symbol, so 8 symbols = 2 bytes) + let alphabet = lookup_alphabet(&AlphabetType::Dna2bit); + let encoded_data = encode_sequence(b"GGGGGGGG", alphabet); + assert_eq!(encoded_data.len(), 2, "Encoded 2-bit DNA should be 2 bytes for 8 symbols"); + + let record_encoded = SequenceRecord::Full { + metadata: SequenceMetadata { + name: "encoded_test".to_string(), + length: 8, + sha512t24u: "test_digest".to_string(), + md5: "test_md5".to_string(), + alphabet: AlphabetType::Dna2bit, + fai: None, + }, + sequence: encoded_data, + }; + + let decoded_encoded = record_encoded.decode().expect("Should decode encoded data"); + assert_eq!(decoded_encoded, "GGGGGGGG"); + } + + #[test] + fn test_sequence_collection_iterator() { + // Test that SequenceCollection can be iterated over + let collection = load_fasta("../tests/data/fasta/base.fa") + .expect("Failed to load test FASTA file"); + + // Test borrowing iterator (&collection) + let mut count = 0; + for seq in &collection { + assert!(seq.metadata().length > 0, "Sequence should have length"); + count += 1; + } + assert_eq!(count, 3, "base.fa should have 3 sequences"); + + // Collection should still be usable after borrowing iteration + assert_eq!(collection.sequences.len(), 3); + + // Test consuming iterator (collection) + let names: Vec = collection + .into_iter() + .map(|seq| seq.metadata().name.clone()) + .collect(); + + assert_eq!(names.len(), 3); + assert!(names.contains(&"chrX".to_string())); + assert!(names.contains(&"chr1".to_string())); + assert!(names.contains(&"chr2".to_string())); + } +} diff --git a/gtars-refget/src/encoder.rs b/gtars-refget/src/encoder.rs index 94f6ddc3..ead77310 100644 --- a/gtars-refget/src/encoder.rs +++ b/gtars-refget/src/encoder.rs @@ -80,11 +80,16 @@ impl SequenceEncoder { /// is specified by the `bits_per_symbol` parameter. The function returns /// a bit-packed vector of bytes containing the encoded sequence. 
 ///
+/// **Bit Ordering: MSB-first (Most Significant Bit first)**
+///
+/// Example with 2-bit DNA encoding (A=00, C=01, G=10, T=11):
+/// - Sequence "ACGT" → byte 0x1B (00011011)
+/// - A (00) in bits 7-6, C (01) in bits 5-4, G (10) in bits 3-2, T (11) in bits 1-0
+///
 /// # Arguments
 ///
 /// * `sequence` - The sequence to encode
-/// * `encoding_array` - The encoding array to use
-/// * `bits_per_symbol` - The number of bits used to represent each symbol
+/// * `alphabet` - The alphabet defining the encoding
 ///
 /// # Returns
 ///
diff --git a/gtars-refget/src/fasta.rs b/gtars-refget/src/fasta.rs
index 91d27749..3ca5e1a3 100644
--- a/gtars-refget/src/fasta.rs
+++ b/gtars-refget/src/fasta.rs
@@ -1,16 +1,22 @@
-use anyhow::{Context, Result};
+use anyhow::Result;
 use std::io::BufRead;
 use std::path::Path;
 
 use super::alphabet::{AlphabetGuesser, AlphabetType};
-use super::collection::{SeqColDigestLvl1, SequenceCollection, SequenceMetadata, SequenceRecord};
-
-use gtars_core::utils::get_dynamic_reader;
+use super::collection::{FaiMetadata, SeqColDigestLvl1, SequenceCollection, SequenceMetadata, SequenceRecord};
 use md5::Md5;
-use seq_io::fasta::{Reader, Record};
 use sha2::{Digest, Sha512};
 
+/// A lightweight record containing only FAI (FASTA index) metadata for a sequence.
+/// Used by `compute_fai()` for fast FAI-only computation without digest overhead.
+#[derive(Clone, Debug)]
+pub struct FaiRecord {
+    pub name: String,
+    pub length: usize,
+    pub fai: Option<FaiMetadata>,
+}
+
 /// Processes a FASTA file to compute the digests of each sequence in the file.
 ///
 /// This function reads a FASTA file, computes the SHA-512 and MD5 digests for each sequence,
@@ -37,58 +43,520 @@ use sha2::{Digest, Sha512};
 ///
 ///
 pub fn digest_fasta<T: AsRef<Path>>(file_path: T) -> Result<SequenceCollection> {
+    use gtars_core::utils::get_dynamic_reader;
+
+    println!("Processing FASTA file: {}", file_path.as_ref().display());
+
+    // Detect if file is gzipped
+    let is_gzipped = file_path.as_ref().extension()
+        .and_then(|s| s.to_str())
+        .map(|s| s == "gz")
+        .unwrap_or(false);
+
+    // Skip FAI data for gzipped files (can't seek into compressed files)
+    let fai_enabled = !is_gzipped;
+
+    let mut byte_position: u64 = 0;
+
     let file_reader = get_dynamic_reader(file_path.as_ref())?;
-    let mut fasta_reader = Reader::new(file_reader);
+    let mut reader = std::io::BufReader::new(file_reader);
     let mut results = Vec::new();
-    println!("Processing FASTA file: {}", file_path.as_ref().display());
-    while let Some(record) = fasta_reader.next() {
-        // returns a RefRecord object
-        let record = record.with_context(|| {
-            format!(
-                "Failed to read FASTA record from file: {}",
-                file_path.as_ref().display()
-            )
-        })?;
-        let id = record.id().with_context(|| {
-            format!(
-                "FASTA record #{} is missing a sequence ID",
-                results.len() + 1
-            )
-        })?;
-        let mut sha512_hasher = Sha512::new();
-        let mut md5_hasher = Md5::new();
-        let mut length = 0;
-        let mut sequence = Vec::new();
-        let mut alphabet_guesser = AlphabetGuesser::new();
-        for seq_line in record.seq_lines() {
-            let seq_line = seq_line.to_ascii_uppercase();
-            sha512_hasher.update(&seq_line);
-            md5_hasher.update(&seq_line);
-            length += seq_line.len();
-            sequence.extend_from_slice(&seq_line);
-            alphabet_guesser.update(&seq_line);
+    let mut line = String::new();
+
+    let mut current_id: Option<String> = None;
+    let mut current_offset: u64 = 0;
+    let mut current_line_bases: Option<u32> = None;
+    let mut current_line_bytes: Option<u32> = None;
+    let mut sha512_hasher = Sha512::new();
+    let mut md5_hasher = Md5::new();
+    let mut length = 0;
+    let mut 
alphabet_guesser = AlphabetGuesser::new(); + + loop { + let bytes_read = reader.read_line(&mut line)?; + if bytes_read == 0 { + // EOF - finalize the last sequence if any + if let Some(id) = current_id.take() { + let sha512 = base64_url::encode(&sha512_hasher.finalize_reset()[0..24]); + let md5 = format!("{:x}", md5_hasher.finalize_reset()); + let alphabet = alphabet_guesser.guess(); + + let fai = if fai_enabled { + if let (Some(lb), Some(lby)) = (current_line_bases, current_line_bytes) { + Some(super::collection::FaiMetadata { + offset: current_offset, + line_bases: lb, + line_bytes: lby, + }) + } else { + None + } + } else { + None + }; + + let metadata = SequenceMetadata { + name: id.to_string(), + length, + sha512t24u: sha512, + md5, + alphabet, + fai, + }; + + results.push(SequenceRecord::Stub(metadata)); + } + break; } - let sha512 = base64_url::encode(&sha512_hasher.finalize_reset()[0..24]); - let md5 = format!("{:x}", md5_hasher.finalize_reset()); - let alphabet = alphabet_guesser.guess(); - - // Create SequenceRecord - let metadata = SequenceMetadata { - name: id.to_string(), - length, - sha512t24u: sha512, - md5, - alphabet, - }; - results.push(SequenceRecord { - metadata, - data: None, - }); + if line.starts_with('>') { + // Save previous sequence if any + if let Some(id) = current_id.take() { + let sha512 = base64_url::encode(&sha512_hasher.finalize_reset()[0..24]); + let md5 = format!("{:x}", md5_hasher.finalize_reset()); + let alphabet = alphabet_guesser.guess(); + + let fai = if fai_enabled { + if let (Some(lb), Some(lby)) = (current_line_bases, current_line_bytes) { + Some(super::collection::FaiMetadata { + offset: current_offset, + line_bases: lb, + line_bytes: lby, + }) + } else { + None + } + } else { + None + }; + + let metadata = SequenceMetadata { + name: id.to_string(), + length, + sha512t24u: sha512, + md5, + alphabet, + fai, + }; + + results.push(SequenceRecord::Stub(metadata)); + } + + // Start new sequence + let id = line[1..].trim().to_string(); + current_id = Some(id); + + // Track position for FAI + if fai_enabled { + byte_position += bytes_read as u64; + current_offset = byte_position; + } + current_line_bases = None; + current_line_bytes = None; + sha512_hasher = Sha512::new(); + md5_hasher = Md5::new(); + length = 0; + alphabet_guesser = AlphabetGuesser::new(); + } else if current_id.is_some() && !line.trim().is_empty() { + // Sequence line + let trimmed = line.trim_end(); + let line_len_bytes = bytes_read as u32; + let line_len_bases = trimmed.len() as u32; + + // Record line dimensions from first sequence line + if current_line_bases.is_none() { + current_line_bases = Some(line_len_bases); + current_line_bytes = Some(line_len_bytes); + } + + // Update digests and length + let seq_upper = trimmed.to_ascii_uppercase(); + sha512_hasher.update(seq_upper.as_bytes()); + md5_hasher.update(seq_upper.as_bytes()); + length += trimmed.len(); + alphabet_guesser.update(seq_upper.as_bytes()); + + // Track position for FAI + if fai_enabled { + byte_position += bytes_read as u64; + } + } else { + // Track position for empty lines or other content + if fai_enabled { + byte_position += bytes_read as u64; + } + } + + line.clear(); } // Compute lvl1 digests from the sequence records - let metadata_refs: Vec<&SequenceMetadata> = results.iter().map(|r| &r.metadata).collect(); + let metadata_refs: Vec<&SequenceMetadata> = results.iter().map(|r| r.metadata()).collect(); + let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs); + + // Compute collection digest from lvl1 
digests
+    let collection_digest = lvl1.to_digest();
+
+    Ok(SequenceCollection {
+        sequences: results,
+        digest: collection_digest,
+        lvl1,
+        file_path: Some(file_path.as_ref().to_path_buf()),
+    })
+}
+
+/// Computes FAI (FASTA index) metadata for sequences in a FASTA file without computing digests.
+///
+/// This is a lightweight alternative to `digest_fasta()` for cases where only FAI metadata
+/// is needed. It skips the expensive digest computation (SHA512, MD5) and alphabet detection,
+/// making it much faster for FAI-only indexing (similar to `samtools faidx`).
+///
+/// # Arguments
+///
+/// * `file_path` - Path to the FASTA file to index
+///
+/// # Returns
+///
+/// A vector of `FaiRecord` structs containing sequence name, length, and FAI metadata.
+/// For gzipped files, the `fai` field will be `None` since byte offsets are not meaningful
+/// for compressed data.
+///
+/// # Examples
+///
+/// ```
+/// use gtars_refget::fasta::compute_fai;
+///
+/// // Using a test file from the repository
+/// let fai_records = compute_fai("../tests/data/fasta/base.fa").expect("Failed to compute FAI");
+/// assert_eq!(fai_records.len(), 3);
+///
+/// for record in fai_records {
+///     println!("{}: {} bp", record.name, record.length);
+///     if let Some(fai) = record.fai {
+///         println!("  offset: {}, line_bases: {}, line_bytes: {}",
+///              fai.offset, fai.line_bases, fai.line_bytes);
+///     }
+/// }
+/// ```
+pub fn compute_fai<T: AsRef<Path>>(file_path: T) -> Result<Vec<FaiRecord>> {
+    use gtars_core::utils::get_dynamic_reader;
+
+    // Detect if file is gzipped
+    let is_gzipped = file_path.as_ref().extension()
+        .and_then(|s| s.to_str())
+        .map(|s| s == "gz")
+        .unwrap_or(false);
+
+    // Skip FAI data for gzipped files (can't seek into compressed files)
+    let fai_enabled = !is_gzipped;
+
+    let mut byte_position: u64 = 0;
+
+    let file_reader = get_dynamic_reader(file_path.as_ref())?;
+    let mut reader = std::io::BufReader::new(file_reader);
+    let mut results = Vec::new();
+    let mut line = String::new();
+
+    let mut current_id: Option<String> = None;
+    let mut current_offset: u64 = 0;
+    let mut current_line_bases: Option<u32> = None;
+    let mut current_line_bytes: Option<u32> = None;
+    let mut length = 0;
+
+    loop {
+        let bytes_read = reader.read_line(&mut line)?;
+        if bytes_read == 0 {
+            // EOF - finalize the last sequence if any
+            if let Some(id) = current_id.take() {
+                let fai = if fai_enabled {
+                    if let (Some(lb), Some(lby)) = (current_line_bases, current_line_bytes) {
+                        Some(FaiMetadata {
+                            offset: current_offset,
+                            line_bases: lb,
+                            line_bytes: lby,
+                        })
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                };
+
+                results.push(FaiRecord {
+                    name: id,
+                    length,
+                    fai,
+                });
+            }
+            break;
+        }
+
+        if line.starts_with('>') {
+            // Save previous sequence if any
+            if let Some(id) = current_id.take() {
+                let fai = if fai_enabled {
+                    if let (Some(lb), Some(lby)) = (current_line_bases, current_line_bytes) {
+                        Some(FaiMetadata {
+                            offset: current_offset,
+                            line_bases: lb,
+                            line_bytes: lby,
+                        })
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                };
+
+                results.push(FaiRecord {
+                    name: id,
+                    length,
+                    fai,
+                });
+            }
+
+            // Start new sequence
+            let id = line[1..].trim().to_string();
+            current_id = Some(id);
+
+            // Track position for FAI
+            if fai_enabled {
+                byte_position += bytes_read as u64;
+                current_offset = byte_position;
+            }
+            current_line_bases = None;
+            current_line_bytes = None;
+            length = 0;
+        } else if current_id.is_some() && !line.trim().is_empty() {
+            // Sequence line
+            let trimmed = line.trim_end();
+            let line_len_bytes = bytes_read as u32;
+            let line_len_bases = trimmed.len() as u32;
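+            // NOTE: as with `samtools faidx`, these dimensions are only
+            // meaningful when every sequence line of a record (except the
+            // last) has the same width; they are taken from the first line.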
+
+            // Record line dimensions from first sequence line
+            if current_line_bases.is_none() {
+                current_line_bases = Some(line_len_bases);
+                current_line_bytes = Some(line_len_bytes);
+            }
+
+            // Update length (no need for uppercase or hashing)
+            length += trimmed.len();
+
+            // Track position for FAI
+            if fai_enabled {
+                byte_position += bytes_read as u64;
+            }
+        } else {
+            // Track position for empty lines or other content
+            if fai_enabled {
+                byte_position += bytes_read as u64;
+            }
+        }
+
+        line.clear();
+    }
+
+    Ok(results)
+}
+
+/// Loads a FASTA file with sequence data in memory (not just metadata).
+///
+/// This function reads a FASTA file and returns a SequenceCollection where each
+/// SequenceRecord contains both metadata AND the actual sequence data. This is useful
+/// when you need direct access to sequences without going through a RefgetStore.
+///
+/// # Arguments
+///
+/// * `file_path` - A path to the FASTA file to be loaded.
+///
+/// # Returns
+///
+/// A SequenceCollection whose records are `Full` variants, each carrying:
+/// - complete `SequenceMetadata` (name, length, digests, alphabet)
+/// - FAI metadata when the input is not gzipped
+/// - the raw sequence bytes (uppercased), available via `sequence()`
+///
+/// # Errors
+///
+/// This function will return an error if:
+/// - The file cannot be opened or read
+/// - The FASTA format is invalid or corrupted
+/// - A sequence record is missing an ID
+///
+/// # Examples
+///
+/// ```
+/// use gtars_refget::fasta::load_fasta;
+///
+/// let seqcol = load_fasta("../tests/data/fasta/base.fa").expect("Failed to load FASTA");
+/// for seq_record in &seqcol.sequences {
+///     println!("Sequence: {}", seq_record.metadata().name);
+///     if let Some(data) = seq_record.sequence() {
+///         println!("  Data: {} bytes", data.len());
+///     }
+/// }
+/// ```
+pub fn load_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
+    use gtars_core::utils::get_dynamic_reader;
+
+    println!("Loading FASTA file with data: {}", file_path.as_ref().display());
+
+    // Detect if file is gzipped
+    let is_gzipped = file_path.as_ref().extension()
+        .and_then(|s| s.to_str())
+        .map(|s| s == "gz")
+        .unwrap_or(false);
+
+    // Skip FAI data for gzipped files (can't seek into compressed files)
+    let fai_enabled = !is_gzipped;
+
+    let mut byte_position: u64 = 0;
+
+    let file_reader = get_dynamic_reader(file_path.as_ref())?;
+    let mut reader = std::io::BufReader::new(file_reader);
+    let mut results = Vec::new();
+    let mut line = String::new();
+
+    let mut current_id: Option<String> = None;
+    let mut current_offset: u64 = 0;
+    let mut current_line_bases: Option<u32> = None;
+    let mut current_line_bytes: Option<u32> = None;
+    let mut sha512_hasher = Sha512::new();
+    let mut md5_hasher = Md5::new();
+    let mut length = 0;
+    let mut alphabet_guesser = AlphabetGuesser::new();
+    let mut sequence_data = Vec::new(); // Accumulate sequence data
+
+    loop {
+        let bytes_read = reader.read_line(&mut line)?;
+        if bytes_read == 0 {
+            // EOF - finalize the last sequence if any
+            if let Some(id) = current_id.take() {
+                let sha512 = base64_url::encode(&sha512_hasher.finalize_reset()[0..24]);
+                let md5 = format!("{:x}", md5_hasher.finalize_reset());
+                let alphabet = alphabet_guesser.guess();
+
+                let fai = if fai_enabled {
+                    if let (Some(lb), Some(lby)) = (current_line_bases, current_line_bytes) {
+                        Some(super::collection::FaiMetadata {
+                            offset: current_offset,
+                            line_bases: lb,
+                            line_bytes: lby,
+                        })
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                };
+
+                let metadata = SequenceMetadata {
+                    name: id.to_string(),
+                    length,
+                    sha512t24u: sha512,
+                    md5,
alphabet, + fai, + }; + + results.push(SequenceRecord::Full { + metadata, + sequence: sequence_data.clone(), + }); + } + break; + } + + if line.starts_with('>') { + // Save previous sequence if any + if let Some(id) = current_id.take() { + let sha512 = base64_url::encode(&sha512_hasher.finalize_reset()[0..24]); + let md5 = format!("{:x}", md5_hasher.finalize_reset()); + let alphabet = alphabet_guesser.guess(); + + let fai = if fai_enabled { + if let (Some(lb), Some(lby)) = (current_line_bases, current_line_bytes) { + Some(super::collection::FaiMetadata { + offset: current_offset, + line_bases: lb, + line_bytes: lby, + }) + } else { + None + } + } else { + None + }; + + let metadata = SequenceMetadata { + name: id.to_string(), + length, + sha512t24u: sha512, + md5, + alphabet, + fai, + }; + + results.push(SequenceRecord::Full { + metadata, + sequence: sequence_data.clone(), + }); + } + + // Start new sequence + let id = line[1..].trim().to_string(); + current_id = Some(id); + + // Track position for FAI + if fai_enabled { + byte_position += bytes_read as u64; + current_offset = byte_position; + } + current_line_bases = None; + current_line_bytes = None; + sha512_hasher = Sha512::new(); + md5_hasher = Md5::new(); + length = 0; + alphabet_guesser = AlphabetGuesser::new(); + sequence_data = Vec::new(); // Reset for new sequence + } else if current_id.is_some() && !line.trim().is_empty() { + // Sequence line + let trimmed = line.trim_end(); + let line_len_bytes = bytes_read as u32; + let line_len_bases = trimmed.len() as u32; + + // Record line dimensions from first sequence line + if current_line_bases.is_none() { + current_line_bases = Some(line_len_bases); + current_line_bytes = Some(line_len_bytes); + } + + // Update digests and length + let seq_upper = trimmed.to_ascii_uppercase(); + sha512_hasher.update(seq_upper.as_bytes()); + md5_hasher.update(seq_upper.as_bytes()); + length += trimmed.len(); + alphabet_guesser.update(seq_upper.as_bytes()); + + // Accumulate sequence data + sequence_data.extend_from_slice(seq_upper.as_bytes()); + + // Track position for FAI + if fai_enabled { + byte_position += bytes_read as u64; + } + } else { + // Track position for empty lines or other content + if fai_enabled { + byte_position += bytes_read as u64; + } + } + + line.clear(); + } + + // Compute lvl1 digests from the sequence records + let metadata_refs: Vec<&SequenceMetadata> = results.iter().map(|r| r.metadata()).collect(); let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs); // Compute collection digest from lvl1 digests @@ -99,7 +567,6 @@ pub fn digest_fasta>(file_path: T) -> Result digest: collection_digest, lvl1, file_path: Some(file_path.as_ref().to_path_buf()), - has_data: true, }) } @@ -124,15 +591,15 @@ pub fn read_fasta_refget_file>(file_path: T) -> Result seqcol_digest = value.to_string(), - "names_digest" => names_digest = value.to_string(), - "sequences_digest" => sequences_digest = value.to_string(), - "lengths_digest" => lengths_digest = value.to_string(), - _ => {} // Ignore unknown metadata keys + if let Some(stripped) = line.strip_prefix("##") { + if let Some((key, value)) = stripped.split_once('=') { + match key { + "seqcol_digest" => seqcol_digest = value.to_string(), + "names_digest" => names_digest = value.to_string(), + "sequences_digest" => sequences_digest = value.to_string(), + "lengths_digest" => lengths_digest = value.to_string(), + _ => {} // Ignore unknown metadata keys + } } } continue; @@ -155,12 +622,10 @@ pub fn read_fasta_refget_file>(file_path: T) -> 
Result>(file_path: T) -> Result = results.iter().map(|r| &r.metadata).collect(); + let metadata_vec: Vec<&SequenceMetadata> = results.iter().map(|r| r.metadata()).collect(); lvl1 = SeqColDigestLvl1::from_metadata(&metadata_vec); } else { lvl1 = SeqColDigestLvl1 { @@ -190,7 +655,6 @@ pub fn read_fasta_refget_file>(file_path: T) -> ResultchrX\n" + assert_eq!(fai0.line_bases, 8); + assert_eq!(fai0.line_bytes, 9); // Includes \n + + assert!(results[1].metadata().fai.is_some()); + let fai1 = results[1].metadata().fai.as_ref().unwrap(); + assert_eq!(fai1.offset, 21); // After ">chr1\n" + assert_eq!(fai1.line_bases, 4); + assert_eq!(fai1.line_bytes, 5); + + assert!(results[2].metadata().fai.is_some()); + let fai2 = results[2].metadata().fai.as_ref().unwrap(); + assert_eq!(fai2.offset, 32); // After ">chr2\n" + assert_eq!(fai2.line_bases, 4); + assert_eq!(fai2.line_bytes, 5); } #[test] @@ -234,13 +717,13 @@ mod tests { digest_fasta("../tests/data/fasta/base.fa.gz").expect("Can't open test fasta file"); let results = seqcol.sequences; println!("{:?}", results); - assert_eq!(results[0].metadata.length, 8); + assert_eq!(results[0].metadata().length, 8); assert_eq!( - results[0].metadata.sha512t24u, + results[0].metadata().sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw" ); - assert_eq!(results[0].metadata.md5, "5f63cfaa3ef61f88c9635fb9d18ec945"); - assert_eq!(results[0].metadata.alphabet, AlphabetType::Dna2bit); + assert_eq!(results[0].metadata().md5, "5f63cfaa3ef61f88c9635fb9d18ec945"); + assert_eq!(results[0].metadata().alphabet, AlphabetType::Dna2bit); } #[test] @@ -253,7 +736,7 @@ mod tests { fn digests_fa_to_farg() { let seqcol = SequenceCollection::from_path_no_cache("../tests/data/fasta/base.fa") .expect("Failed to create SequenceCollection from FASTA file"); - seqcol.to_farg().expect("Failed to write farg file"); + seqcol.write_farg().expect("Failed to write farg file"); let loaded_seqcol = read_fasta_refget_file("../tests/data/fasta/base.farg") .expect("Failed to read refget file"); @@ -261,11 +744,11 @@ mod tests { println!("Loaded SequenceCollection: {}", loaded_seqcol); // Test round-trip integrity for (original, read) in seqcol.sequences.iter().zip(loaded_seqcol.sequences.iter()) { - assert_eq!(original.metadata.name, read.metadata.name); - assert_eq!(original.metadata.length, read.metadata.length); - assert_eq!(original.metadata.sha512t24u, read.metadata.sha512t24u); - assert_eq!(original.metadata.md5, read.metadata.md5); - assert_eq!(original.metadata.alphabet, read.metadata.alphabet); + assert_eq!(original.metadata().name, read.metadata().name); + assert_eq!(original.metadata().length, read.metadata().length); + assert_eq!(original.metadata().sha512t24u, read.metadata().sha512t24u); + assert_eq!(original.metadata().md5, read.metadata().md5); + assert_eq!(original.metadata().alphabet, read.metadata().alphabet); } } @@ -289,4 +772,434 @@ mod tests { "cGRMZIb3AVgkcAfNv39RN7hnT5Chk7RX" ); } + + // Helper function to parse samtools FAI file + fn parse_fai_file(fai_path: &str) -> Vec<(String, u64, u64, u32, u32)> { + use std::fs::File; + use std::io::{BufRead, BufReader}; + + let file = File::open(fai_path).expect("Failed to open FAI file"); + let reader = BufReader::new(file); + let mut results = Vec::new(); + + for line in reader.lines() { + let line = line.expect("Failed to read line"); + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() >= 5 { + let name = parts[0].to_string(); + let length = parts[1].parse::().expect("Failed to parse length"); + let offset = 
parts[2].parse::().expect("Failed to parse offset"); + let line_bases = parts[3].parse::().expect("Failed to parse line_bases"); + let line_bytes = parts[4].parse::().expect("Failed to parse line_bytes"); + results.push((name, length, offset, line_bases, line_bytes)); + } + } + results + } + + #[test] + fn fai_wrapped_40() { + let seqcol = digest_fasta("../tests/data/fasta/wrapped_40.fa") + .expect("Failed to digest FASTA file"); + let fai_data = parse_fai_file("../tests/data/fasta/wrapped_40.fa.fai"); + + assert_eq!(seqcol.sequences.len(), fai_data.len()); + + for (seq, (name, length, offset, line_bases, line_bytes)) in + seqcol.sequences.iter().zip(fai_data.iter()) + { + assert_eq!(seq.metadata().name, *name); + assert_eq!(seq.metadata().length, *length as usize); + assert!(seq.metadata().fai.is_some(), "FAI data should be present"); + let fai = seq.metadata().fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn fai_wrapped_20() { + let seqcol = digest_fasta("../tests/data/fasta/wrapped_20.fa") + .expect("Failed to digest FASTA file"); + let fai_data = parse_fai_file("../tests/data/fasta/wrapped_20.fa.fai"); + + assert_eq!(seqcol.sequences.len(), fai_data.len()); + + for (seq, (name, length, offset, line_bases, line_bytes)) in + seqcol.sequences.iter().zip(fai_data.iter()) + { + assert_eq!(seq.metadata().name, *name); + assert_eq!(seq.metadata().length, *length as usize); + assert!(seq.metadata().fai.is_some(), "FAI data should be present"); + let fai = seq.metadata().fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn fai_unwrapped() { + let seqcol = digest_fasta("../tests/data/fasta/unwrapped.fa") + .expect("Failed to digest FASTA file"); + let fai_data = parse_fai_file("../tests/data/fasta/unwrapped.fa.fai"); + + assert_eq!(seqcol.sequences.len(), fai_data.len()); + + for (seq, (name, length, offset, line_bases, line_bytes)) in + seqcol.sequences.iter().zip(fai_data.iter()) + { + assert_eq!(seq.metadata().name, *name); + assert_eq!(seq.metadata().length, *length as usize); + assert!(seq.metadata().fai.is_some(), "FAI data should be present"); + let fai = seq.metadata().fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn fai_base_mixed_lengths() { + // base.fa has mixed sequence lengths (8bp, 4bp, 4bp) on single lines + let seqcol = digest_fasta("../tests/data/fasta/base.fa") + .expect("Failed to digest FASTA file"); + let fai_data = parse_fai_file("../tests/data/fasta/base.fa.fai"); + + assert_eq!(seqcol.sequences.len(), fai_data.len()); + assert_eq!(seqcol.sequences.len(), 3, "base.fa should have 3 sequences"); + + for (seq, (name, length, offset, line_bases, line_bytes)) in + seqcol.sequences.iter().zip(fai_data.iter()) + { + assert_eq!(seq.metadata().name, *name); + assert_eq!(seq.metadata().length, *length as usize); + assert!(seq.metadata().fai.is_some(), "FAI data should be present"); + let fai = 
seq.metadata().fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn fai_gzipped_returns_none() { + let seqcol = digest_fasta("../tests/data/fasta/base.fa.gz") + .expect("Failed to digest gzipped FASTA file"); + + // All sequences should have None for FAI data + for seq in seqcol.sequences.iter() { + assert!(seq.metadata().fai.is_none(), + "Gzipped files should not have FAI data for sequence {}", + seq.metadata().name); + } + } + + #[test] + fn fai_crlf_endings() { + let seqcol = digest_fasta("../tests/data/fasta/crlf_endings.fa") + .expect("Failed to digest FASTA file with CRLF endings"); + let fai_data = parse_fai_file("../tests/data/fasta/crlf_endings.fa.fai"); + + assert_eq!(seqcol.sequences.len(), fai_data.len()); + + for (seq, (name, length, offset, line_bases, line_bytes)) in + seqcol.sequences.iter().zip(fai_data.iter()) + { + assert_eq!(seq.metadata().name, *name); + assert_eq!(seq.metadata().length, *length as usize); + assert!(seq.metadata().fai.is_some(), "FAI data should be present"); + let fai = seq.metadata().fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {} (should be +2 for CRLF)", name); + // Verify CRLF: line_bytes should be line_bases + 2 + assert_eq!(fai.line_bytes, fai.line_bases + 2, + "CRLF files should have line_bytes = line_bases + 2 for {}", + name); + } + } + + #[test] + fn fai_same_sequence_different_wrapping() { + // Test that wrapped_40, wrapped_20, and unwrapped all contain identical sequences + // despite different line wrapping styles + let seqcol_40 = digest_fasta("../tests/data/fasta/wrapped_40.fa") + .expect("Failed to digest wrapped_40.fa"); + let seqcol_20 = digest_fasta("../tests/data/fasta/wrapped_20.fa") + .expect("Failed to digest wrapped_20.fa"); + let seqcol_unwrapped = digest_fasta("../tests/data/fasta/unwrapped.fa") + .expect("Failed to digest unwrapped.fa"); + + // All should have same number of sequences + assert_eq!(seqcol_40.sequences.len(), seqcol_20.sequences.len()); + assert_eq!(seqcol_40.sequences.len(), seqcol_unwrapped.sequences.len()); + + // Compare each sequence across all three files + for ((seq40, seq20), seq_unwrap) in seqcol_40.sequences.iter() + .zip(seqcol_20.sequences.iter()) + .zip(seqcol_unwrapped.sequences.iter()) + { + // Same name + assert_eq!(seq40.metadata().name, seq20.metadata().name); + assert_eq!(seq40.metadata().name, seq_unwrap.metadata().name); + + // Same length + assert_eq!(seq40.metadata().length, seq20.metadata().length, + "Length mismatch for {}", seq40.metadata().name); + assert_eq!(seq40.metadata().length, seq_unwrap.metadata().length, + "Length mismatch for {}", seq40.metadata().name); + + // Same digests (proves sequences are identical) + assert_eq!(seq40.metadata().sha512t24u, seq20.metadata().sha512t24u, + "SHA512 digest mismatch for {} - sequences not identical", + seq40.metadata().name); + assert_eq!(seq40.metadata().sha512t24u, seq_unwrap.metadata().sha512t24u, + "SHA512 digest mismatch for {} - sequences not identical", + seq40.metadata().name); + assert_eq!(seq40.metadata().md5, seq20.metadata().md5, + "MD5 digest mismatch for {} - sequences not identical", + seq40.metadata().name); + 
assert_eq!(seq40.metadata().md5, seq_unwrap.metadata().md5, + "MD5 digest mismatch for {} - sequences not identical", + seq40.metadata().name); + + // But different FAI data (different wrapping) + assert!(seq40.metadata().fai.is_some() && seq20.metadata().fai.is_some() && seq_unwrap.metadata().fai.is_some()); + let fai40 = seq40.metadata().fai.as_ref().unwrap(); + let fai20 = seq20.metadata().fai.as_ref().unwrap(); + let fai_unwrap = seq_unwrap.metadata().fai.as_ref().unwrap(); + + // Different line_bases due to different wrapping (for multi-line sequences) + // Note: short sequences like chr3 that fit on one line may have same line_bases + // across different wrapping styles, which is correct behavior + if seq40.metadata().length > 40 { + // Long enough to require wrapping in wrapped_40 + assert_ne!(fai40.line_bases, fai20.line_bases, + "wrapped_40 and wrapped_20 should have different line_bases for {}", + seq40.metadata().name); + } + if seq40.metadata().length > 20 { + // Long enough to require wrapping in wrapped_20 + assert_ne!(fai20.line_bases, fai_unwrap.line_bases, + "wrapped_20 and unwrapped should have different line_bases for {}", + seq40.metadata().name); + } + } + } + + // Tests for compute_fai() function + + #[test] + fn test_compute_fai_wrapped_40() { + let fai_records = compute_fai("../tests/data/fasta/wrapped_40.fa") + .expect("Failed to compute FAI"); + let fai_data = parse_fai_file("../tests/data/fasta/wrapped_40.fa.fai"); + + assert_eq!(fai_records.len(), fai_data.len()); + + for (record, (name, length, offset, line_bases, line_bytes)) in + fai_records.iter().zip(fai_data.iter()) + { + assert_eq!(record.name, *name); + assert_eq!(record.length, *length as usize); + assert!(record.fai.is_some(), "FAI data should be present"); + let fai = record.fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn test_compute_fai_wrapped_20() { + let fai_records = compute_fai("../tests/data/fasta/wrapped_20.fa") + .expect("Failed to compute FAI"); + let fai_data = parse_fai_file("../tests/data/fasta/wrapped_20.fa.fai"); + + assert_eq!(fai_records.len(), fai_data.len()); + + for (record, (name, length, offset, line_bases, line_bytes)) in + fai_records.iter().zip(fai_data.iter()) + { + assert_eq!(record.name, *name); + assert_eq!(record.length, *length as usize); + assert!(record.fai.is_some(), "FAI data should be present"); + let fai = record.fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn test_compute_fai_unwrapped() { + let fai_records = compute_fai("../tests/data/fasta/unwrapped.fa") + .expect("Failed to compute FAI"); + let fai_data = parse_fai_file("../tests/data/fasta/unwrapped.fa.fai"); + + assert_eq!(fai_records.len(), fai_data.len()); + + for (record, (name, length, offset, line_bases, line_bytes)) in + fai_records.iter().zip(fai_data.iter()) + { + assert_eq!(record.name, *name); + assert_eq!(record.length, *length as usize); + assert!(record.fai.is_some(), "FAI data should be present"); + let fai = record.fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line 
bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn test_compute_fai_base() { + let fai_records = compute_fai("../tests/data/fasta/base.fa") + .expect("Failed to compute FAI"); + let fai_data = parse_fai_file("../tests/data/fasta/base.fa.fai"); + + assert_eq!(fai_records.len(), fai_data.len()); + assert_eq!(fai_records.len(), 3, "base.fa should have 3 sequences"); + + for (record, (name, length, offset, line_bases, line_bytes)) in + fai_records.iter().zip(fai_data.iter()) + { + assert_eq!(record.name, *name); + assert_eq!(record.length, *length as usize); + assert!(record.fai.is_some(), "FAI data should be present"); + let fai = record.fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {}", name); + } + } + + #[test] + fn test_compute_fai_crlf() { + let fai_records = compute_fai("../tests/data/fasta/crlf_endings.fa") + .expect("Failed to compute FAI"); + let fai_data = parse_fai_file("../tests/data/fasta/crlf_endings.fa.fai"); + + assert_eq!(fai_records.len(), fai_data.len()); + + for (record, (name, length, offset, line_bases, line_bytes)) in + fai_records.iter().zip(fai_data.iter()) + { + assert_eq!(record.name, *name); + assert_eq!(record.length, *length as usize); + assert!(record.fai.is_some(), "FAI data should be present"); + let fai = record.fai.as_ref().unwrap(); + assert_eq!(fai.offset, *offset, "Offset mismatch for {}", name); + assert_eq!(fai.line_bases, *line_bases, "Line bases mismatch for {}", name); + assert_eq!(fai.line_bytes, *line_bytes, "Line bytes mismatch for {} (should be +2 for CRLF)", name); + // Verify CRLF: line_bytes should be line_bases + 2 + assert_eq!(fai.line_bytes, fai.line_bases + 2, + "CRLF files should have line_bytes = line_bases + 2 for {}", + name); + } + } + + #[test] + fn test_compute_fai_gzipped() { + let fai_records = compute_fai("../tests/data/fasta/base.fa.gz") + .expect("Failed to compute FAI for gzipped file"); + + // All sequences should have None for FAI data + for record in fai_records.iter() { + assert!(record.fai.is_none(), + "Gzipped files should not have FAI data for sequence {}", + record.name); + } + } + + #[test] + fn test_load_fasta_with_data() { + let seqcol = load_fasta("../tests/data/fasta/base.fa") + .expect("Failed to load FASTA with data"); + + assert_eq!(seqcol.sequences.len(), 3); + + // Check first sequence (chrX) + let seq0 = &seqcol.sequences[0]; + assert_eq!(seq0.metadata().name, "chrX"); + assert_eq!(seq0.metadata().length, 8); + assert!(seq0.has_data(), "Sequence should have data"); + + if let Some(data) = seq0.sequence() { + assert_eq!(data.len(), 8); + assert_eq!(std::str::from_utf8(data).unwrap(), "TTGGGGAA"); + } + + // Check second sequence (chr1) + let seq1 = &seqcol.sequences[1]; + assert_eq!(seq1.metadata().name, "chr1"); + assert_eq!(seq1.metadata().length, 4); + assert!(seq1.has_data(), "Sequence should have data"); + + if let Some(data) = seq1.sequence() { + assert_eq!(data.len(), 4); + assert_eq!(std::str::from_utf8(data).unwrap(), "GGAA"); + } + + // Check third sequence (chr2) + let seq2 = &seqcol.sequences[2]; + assert_eq!(seq2.metadata().name, "chr2"); + assert_eq!(seq2.metadata().length, 4); + assert!(seq2.has_data(), "Sequence should have data"); + + if let Some(data) = seq2.sequence() { + assert_eq!(data.len(), 4); + 
assert_eq!(std::str::from_utf8(data).unwrap(), "GCGC");
+        }
+
+        // Verify digests match digest_fasta() output
+        let digest_seqcol = digest_fasta("../tests/data/fasta/base.fa")
+            .expect("Failed to digest FASTA");
+
+        for (loaded, digested) in seqcol.sequences.iter().zip(digest_seqcol.sequences.iter()) {
+            assert_eq!(loaded.metadata().name, digested.metadata().name);
+            assert_eq!(loaded.metadata().length, digested.metadata().length);
+            assert_eq!(loaded.metadata().sha512t24u, digested.metadata().sha512t24u);
+            assert_eq!(loaded.metadata().md5, digested.metadata().md5);
+            assert_eq!(loaded.metadata().alphabet, digested.metadata().alphabet);
+        }
+    }
+
+    #[test]
+    fn test_load_fasta_wrapped() {
+        let seqcol = load_fasta("../tests/data/fasta/wrapped_40.fa")
+            .expect("Failed to load wrapped FASTA");
+
+        // Verify all sequences have data
+        for seq in &seqcol.sequences {
+            assert!(seq.has_data(), "Sequence {} should have data", seq.metadata().name);
+
+            if let Some(data) = seq.sequence() {
+                assert_eq!(data.len(), seq.metadata().length,
+                    "Data length mismatch for {}", seq.metadata().name);
+
+                // Verify data is uppercase
+                let data_str = std::str::from_utf8(data).unwrap();
+                assert_eq!(data_str, data_str.to_uppercase(),
+                    "Sequence data should be uppercased");
+            }
+        }
+    }
+
+    #[test]
+    fn test_load_fasta_gzipped() {
+        let seqcol = load_fasta("../tests/data/fasta/base.fa.gz")
+            .expect("Failed to load gzipped FASTA");
+
+        // Should still load data from gzipped files
+        assert_eq!(seqcol.sequences.len(), 3);
+        assert!(seqcol.sequences[0].has_data(), "Should have sequence data");
+
+        for seq in &seqcol.sequences {
+            assert!(seq.has_data(), "Gzipped file should still have sequence data");
+            assert!(seq.metadata().fai.is_none(), "Gzipped file should not have FAI metadata");
+        }
+    }
 }
diff --git a/gtars-refget/src/lib.rs b/gtars-refget/src/lib.rs
index 3d0e0245..d6448083 100644
--- a/gtars-refget/src/lib.rs
+++ b/gtars-refget/src/lib.rs
@@ -94,16 +94,17 @@ mod tests {
         // Create a new sequence store, and add sequences to the store
         println!("Adding sequences from FASTA file...");
         let start = Instant::now();
-        let mut store = GlobalRefgetStore::new(StorageMode::Encoded);
-        store.import_fasta(&fasta_path).unwrap();
+        let mut store = GlobalRefgetStore::in_memory();
+        store.add_sequence_collection_from_fasta(&fasta_path).unwrap();
         let duration = start.elapsed();
         println!("⏱️ Time taken to load: {:.2?}", duration);
-        let mut store2 = GlobalRefgetStore::new(StorageMode::Raw);
-        store2.import_fasta(&fasta_path).unwrap();
+        let mut store2 = GlobalRefgetStore::in_memory();
+        store2.disable_encoding(); // Switch to Raw mode
+        store2.add_sequence_collection_from_fasta(&fasta_path).unwrap();
         // Get list of sequences
-        let sequences = store.list_sequence_digests();
+        let sequences: Vec<_> = store.sequence_digests().collect();
         assert!(!sequences.is_empty(), "No sequences found in the store");
         // Look up the first sequence by digest
@@ -147,16 +148,16 @@ mod tests {
         let temp_dir = tempdir().expect("Failed to create temporary directory");
         let temp_path = temp_dir.path();
         // Create a new sequence store
-        let mut store = GlobalRefgetStore::new(StorageMode::Encoded);
+        let mut store = GlobalRefgetStore::in_memory();
         // let fasta_path = "../tests/data/subset.fa.gz";
         let fasta_path = "../tests/data/fasta/base.fa.gz";
         let temp_fasta = temp_path.join("base.fa.gz");
         std::fs::copy(fasta_path, &temp_fasta).expect("Failed to copy base.fa.gz to tempdir");
         // Add sequences to the store
-        store.import_fasta(temp_fasta).unwrap();
+        store.add_sequence_collection_from_fasta(temp_fasta).unwrap();
        println!("Listing sequences in the store...");
-        // let sequences = store.list_sequence_digests();
+        // let sequences = store.sequence_digests();
         // let digest = &sequences[0];
         // let digest_str = String::from_utf8(digest.to_vec()).expect("Invalid ASCII data");
         // let digest = "Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"; // from subset.fa.gz
diff --git a/gtars-refget/src/store.rs b/gtars-refget/src/store.rs
index 900a2eb3..5b07a669 100644
--- a/gtars-refget/src/store.rs
+++ b/gtars-refget/src/store.rs
@@ -1,23 +1,26 @@
 use super::alphabet::{AlphabetType, lookup_alphabet};
 use seq_io::fasta::{Reader, Record};
 use std::collections::HashMap;
+use std::ffi::OsStr;
 use std::fmt::{Display, Formatter};
 use std::path::{Path, PathBuf};
+use std::time::Instant;
 use super::encoder::SequenceEncoder;
-use super::encoder::decode_substring_from_bytes;
+use super::encoder::{decode_substring_from_bytes, decode_string_from_bytes, encode_sequence};
 use crate::fasta::read_fasta_refget_file;
 use crate::hashkeyable::HashKeyable;
 use anyhow::anyhow;
 use anyhow::{Context, Result};
 use chrono::Utc;
 use flate2::read::GzDecoder;
+use flate2::write::GzEncoder;
+use flate2::Compression;
 use gtars_core::utils::{get_dynamic_reader, get_file_info, parse_bedlike_file};
-use memmap2::Mmap;
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File, OpenOptions, create_dir_all};
+use std::fs::{self, File, create_dir_all};
 use std::io::{BufRead, BufReader, Read, Write};
-use std::{io, str};
+use std::str;
 // Import the HashKeyable trait for converting types to a 32-byte key
 // Import collection types
@@ -26,9 +29,10 @@ use super::collection::{SequenceCollection, SequenceMetadata, SequenceRecord};
 // const DEFAULT_COLLECTION_ID: [u8; 32] = [0u8; 32]; // Default collection ID for the name lookup table
 const DEFAULT_COLLECTION_ID: &str = "DEFAULT_REFGET_SEQUENCE_COLLECTION"; // Default collection ID for the name lookup table
+const DEFAULT_SEQDATA_PATH_TEMPLATE: &str = "sequences/%s2/%s.seq"; // Default template for sequence file paths
 /// Enum storing whether sequences will be stored in Raw or Encoded form
-#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)]
 pub enum StorageMode {
     Raw,
     Encoded,
@@ -60,6 +64,14 @@ pub struct GlobalRefgetStore {
     collections: HashMap<[u8; 32], SequenceCollection>,
     /// Storage strategy for sequences
     mode: StorageMode,
+    /// Where the store lives on disk (local store or cache directory)
+    local_path: Option<PathBuf>,
+    /// Where to pull sequences from (if remote-backed)
+    remote_source: Option<String>,
+    /// Template for sequence file paths (e.g., "sequences/%s2/%s.seq")
+    seqdata_path_template: Option<String>,
+    /// Whether to cache fetched remote files to disk
+    cache_to_disk: bool,
 }
 /// Metadata for the entire store.
@@ -80,11 +92,11 @@ struct StoreMetadata {
     created_at: String,
 }
-pub struct GetSeqsBedFileIter<'a, K>
+pub struct SubstringsFromRegions<'a, K>
 where
     K: AsRef<[u8]>,
 {
-    store: &'a GlobalRefgetStore,
+    store: &'a mut GlobalRefgetStore,
     reader: BufReader<Box<dyn Read>>,
     collection_digest: K,
     previous_parsed_chr: String,
     line_num: usize,
 }
-impl<K> Iterator for GetSeqsBedFileIter<'_, K>
+impl<K> Iterator for SubstringsFromRegions<'_, K>
 where
     K: AsRef<[u8]>,
 {
@@ -153,7 +165,7 @@ where
             }
         };
-        self.current_seq_digest = result.metadata.sha512t24u.clone();
+        self.current_seq_digest = result.metadata().sha512t24u.clone();
         }
         let retrieved_substring = match self.store.get_substring(
@@ -185,7 +197,8 @@ impl GlobalRefgetStore {
     /// Generic constructor. Creates a new, empty `GlobalRefgetStore`.
-    pub fn new(mode: StorageMode) -> Self {
+    /// This is a private helper - use `on_disk()` or `in_memory()` instead.
+    fn new(mode: StorageMode) -> Self {
         // Initialize the name lookup with a default collection
         let mut name_lookup = HashMap::new();
         name_lookup.insert(DEFAULT_COLLECTION_ID.to_key(), HashMap::new());
@@ -196,18 +209,147 @@ impl GlobalRefgetStore {
             name_lookup,
             collections: HashMap::new(),
             mode,
+            local_path: None,
+            remote_source: None,
+            seqdata_path_template: None,
+            cache_to_disk: false, // Default to false - users should explicitly choose
         }
     }
+    /// Create a disk-backed RefgetStore
+    ///
+    /// Sequences are written to disk immediately and loaded on-demand (lazy loading).
+    /// Only metadata is kept in memory. New stores default to Encoded storage mode.
+    ///
+    /// # Arguments
+    /// * `cache_path` - Directory for storing sequences and metadata
+    ///
+    /// # Returns
+    /// Result with a configured disk-backed store
+    ///
+    /// # Example
+    /// ```ignore
+    /// let mut store = GlobalRefgetStore::on_disk("/data/store")?;
+    /// store.add_sequence_collection_from_fasta("genome.fa")?;
+    /// ```
+    pub fn on_disk<P: AsRef<Path>>(cache_path: P) -> Result<Self> {
+        let cache_path = cache_path.as_ref();
+        let index_path = cache_path.join("index.json");
+
+        if index_path.exists() {
+            // Load existing store
+            Self::load_local(cache_path)
+        } else {
+            // Create new store with default Encoded mode
+            let mode = StorageMode::Encoded;
+            create_dir_all(cache_path)?;
+
+            // Use private new() helper
+            let mut store = Self::new(mode);
+            store.local_path = Some(cache_path.to_path_buf());
+            store.seqdata_path_template = Some(DEFAULT_SEQDATA_PATH_TEMPLATE.to_string());
+            store.cache_to_disk = true; // Always true for on_disk
+
+            // Create directory structure
+            create_dir_all(cache_path.join("sequences"))?;
+            create_dir_all(cache_path.join("collections"))?;
+
+            Ok(store)
+        }
+    }
+
+    /// Create an in-memory RefgetStore
+    ///
+    /// All sequences kept in RAM for fast access.
+    /// Defaults to Encoded storage mode (2-bit packing for space efficiency).
+    /// Use set_mode() to change storage mode after creation.
+    ///
+    /// # Example
+    /// ```ignore
+    /// let mut store = GlobalRefgetStore::in_memory();
+    /// store.add_sequence_collection_from_fasta("genome.fa")?;
+    /// ```
+    pub fn in_memory() -> Self {
+        Self::new(StorageMode::Encoded)
+    }
+
+    /// Change the storage mode, re-encoding/decoding existing sequences as needed.
+    ///
+    /// When switching from Raw to Encoded:
+    /// - All Full sequences in memory are encoded (2-bit packed)
+    ///
+    /// When switching from Encoded to Raw:
+    /// - All Full sequences in memory are decoded back to raw bytes
+    ///
+    /// Note: Stub sequences (lazy-loaded from disk) are not affected.
+    /// They will be loaded in the NEW mode when accessed.
+    ///
+    /// # Arguments
+    /// * `new_mode` - The storage mode to switch to
+    pub fn set_mode(&mut self, new_mode: StorageMode) {
+        if self.mode == new_mode {
+            return; // No change needed
+        }
+
+        // Re-encode/decode all Full sequences in memory
+        for record in self.sequence_store.values_mut() {
+            match record {
+                SequenceRecord::Full { metadata, sequence } => {
+                    match (self.mode, new_mode) {
+                        (StorageMode::Raw, StorageMode::Encoded) => {
+                            // Encode: raw bytes -> 2-bit packed
+                            let alphabet = lookup_alphabet(&metadata.alphabet);
+                            *sequence = encode_sequence(&*sequence, alphabet);
+                        }
+                        (StorageMode::Encoded, StorageMode::Raw) => {
+                            // Decode: 2-bit packed -> raw bytes
+                            let alphabet = lookup_alphabet(&metadata.alphabet);
+                            *sequence = decode_string_from_bytes(
+                                &*sequence,
+                                metadata.length,
+                                alphabet
+                            );
+                        }
+                        _ => {} // Same mode, no conversion needed
+                    }
+                }
+                SequenceRecord::Stub(_) => {
+                    // Stubs don't hold sequence data, nothing to convert
+                }
+            }
+        }
+
+        self.mode = new_mode;
+    }
+
+    /// Enable 2-bit encoding for space efficiency.
+    /// Re-encodes any existing Raw sequences in memory.
+    pub fn enable_encoding(&mut self) {
+        self.set_mode(StorageMode::Encoded);
+    }
+
+    /// Disable encoding, use raw byte storage.
+    /// Decodes any existing Encoded sequences in memory.
+    pub fn disable_encoding(&mut self) {
+        self.set_mode(StorageMode::Raw);
+    }
     /// Adds a sequence to the Store
     /// Ensure that it is added to the appropriate collection.
     /// If no collection is specified, it will be added to the default collection.
+    ///
+    /// # Arguments
+    /// * `sequence_record` - The sequence to add
+    /// * `collection_digest` - Collection to add to (or None for default)
+    /// * `force` - If true, overwrite existing sequences. If false, skip duplicates.
     // Using Into here instead of the Option directly allows us to accept
     // either None or [u8; 32], without having to wrap it in Some().
     pub fn add_sequence<T: Into<Option<[u8; 32]>>>(
         &mut self,
         sequence_record: SequenceRecord,
         collection_digest: T,
+        force: bool,
     ) -> Result<()> {
         // Ensure collection exists; otherwise use the default collection
         let collection_digest = collection_digest
@@ -217,31 +359,72 @@ impl GlobalRefgetStore {
             anyhow::anyhow!("Collection not found for digest: {:?}", collection_digest)
         })?
+        // Get metadata from the record (works for both Stub and Full variants)
+        let metadata = sequence_record.metadata();
+
         // Add to name lookup for the collection
         self.name_lookup
             .entry(collection_digest)
             .or_default()
             .insert(
-                sequence_record.metadata.name.clone(),
-                sequence_record.metadata.sha512t24u.to_key(),
+                metadata.name.clone(),
+                metadata.sha512t24u.to_key(),
             );
         // Finally, add SequenceRecord to store (consuming the object)
-        self.add_sequence_record(sequence_record)?;
+        self.add_sequence_record(sequence_record, force)?;
         Ok(())
     }
     /// Adds a collection, and all sequences in it, to the store.
+    ///
+    /// Skips collections and sequences that already exist.
+    /// Use `add_sequence_collection_force()` to overwrite existing data.
+    ///
+    /// # Arguments
+    /// * `collection` - The sequence collection to add
     pub fn add_sequence_collection(&mut self, collection: SequenceCollection) -> Result<()> {
+        self.add_sequence_collection_internal(collection, false)
+    }
+
+    /// Adds a collection, and all sequences in it, to the store, overwriting existing data.
+    ///
+    /// Forces overwrite of collections and sequences that already exist.
+    /// Use `add_sequence_collection()` to skip duplicates (safer default).
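+    /// A minimal sketch of a forced re-import (assumes a mutable `store` in scope and a
+    /// `SequenceCollection` built elsewhere, e.g. via `SequenceCollection::from_fasta`):
+    /// ```ignore
+    /// let coll = SequenceCollection::from_fasta("genome.fa")?;
+    /// store.add_sequence_collection_force(coll)?;
+    /// ```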
+ /// + /// # Arguments + /// * `collection` - The sequence collection to add + pub fn add_sequence_collection_force(&mut self, collection: SequenceCollection) -> Result<()> { + self.add_sequence_collection_internal(collection, true) + } + + /// Internal implementation for adding a sequence collection. + fn add_sequence_collection_internal(&mut self, collection: SequenceCollection, force: bool) -> Result<()> { let coll_digest = collection.digest.to_key(); + // Check if collection already exists + if !force && self.collections.contains_key(&coll_digest) { + // Skip - collection already exists and force=false + return Ok(()); + } + + // Write collection to disk if cache_to_disk is enabled (before moving sequences) + if self.cache_to_disk && self.local_path.is_some() { + self.write_collection_to_disk_single(&collection)?; + } + // Register the collection self.collections.insert(coll_digest, collection.clone()); // Add all sequences in the collection to the store for sequence_record in collection.sequences { - self.add_sequence(sequence_record, coll_digest)?; + self.add_sequence(sequence_record, coll_digest, force)?; + } + + // Write index files so store is immediately loadable + if self.cache_to_disk && self.local_path.is_some() { + self.write_index_files()?; } Ok(()) @@ -249,43 +432,133 @@ impl GlobalRefgetStore { // Adds SequenceRecord to the store. // Should only be used internally, via `add_sequence`, which ensures sequences are added to collections. - fn add_sequence_record(&mut self, sr: SequenceRecord) -> Result<()> { + // If the store is disk-backed (cache_to_disk=true), Full records are written to disk and replaced with Stubs. + fn add_sequence_record(&mut self, sr: SequenceRecord, force: bool) -> Result<()> { + let metadata = sr.metadata(); + let key = metadata.sha512t24u.to_key(); + + // Check if sequence already exists + if !force && self.sequence_store.contains_key(&key) { + // Skip - sequence already exists and force=false + return Ok(()); + } + self.md5_lookup - .insert(sr.metadata.md5.to_key(), sr.metadata.sha512t24u.to_key()); - self.sequence_store - .insert(sr.metadata.sha512t24u.to_key(), sr); + .insert(metadata.md5.to_key(), metadata.sha512t24u.to_key()); + + // Check if we should write Full records to disk + if self.cache_to_disk && self.local_path.is_some() { + match &sr { + SequenceRecord::Full { metadata, sequence } => { + // Write to disk + self.write_sequence_to_disk_single(metadata, sequence)?; + // Store as stub instead + let stub = SequenceRecord::Stub(metadata.clone()); + self.sequence_store.insert(key, stub); + return Ok(()); + } + SequenceRecord::Stub(_) => { + // Already a stub, just add it normally below + } + } + } + + // Add as-is (either memory-only mode, or already a Stub) + self.sequence_store.insert(key, sr); Ok(()) } - // Loading the sequence data requires 2 passes through the FASTA file, because we - // have to guess the alphabet before we can encode. So, the first pass is the digesting - // function, which digests and guesses the alphabet, and calculates length, to produce the - // SequenceDigest object. Then, using this object, we can go through the sequence a second time and encode it. - pub fn import_fasta>(&mut self, file_path: P) -> Result<()> { + /// Add a sequence collection from a FASTA file. + /// + /// Skips sequences and collections that already exist in the store. + /// Use `add_sequence_collection_from_fasta_force()` to overwrite existing data. 
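+    /// A typical call, as a minimal sketch (the FASTA path is a placeholder):
+    /// ```ignore
+    /// let mut store = GlobalRefgetStore::in_memory();
+    /// store.add_sequence_collection_from_fasta("genome.fa")?;
+    /// ```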
+    ///
+    /// # Arguments
+    /// * `file_path` - Path to the FASTA file
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    ///
+    /// # Notes
+    /// Loading sequence data requires 2 passes through the FASTA file:
+    /// 1. First pass digests and guesses the alphabet to produce SequenceMetadata
+    /// 2. Second pass encodes the sequences based on the detected alphabet
+    pub fn add_sequence_collection_from_fasta<P: AsRef<Path>>(&mut self, file_path: P) -> Result<()> {
+        self.add_sequence_collection_from_fasta_internal(file_path, false)
+    }
+
+    /// Add a sequence collection from a FASTA file, overwriting existing data.
+    ///
+    /// Forces overwrite of collections and sequences that already exist in the store.
+    /// Use `add_sequence_collection_from_fasta()` to skip duplicates (safer default).
+    ///
+    /// # Arguments
+    /// * `file_path` - Path to the FASTA file
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    pub fn add_sequence_collection_from_fasta_force<P: AsRef<Path>>(&mut self, file_path: P) -> Result<()> {
+        self.add_sequence_collection_from_fasta_internal(file_path, true)
+    }
+
+    /// Internal implementation for adding a sequence collection from FASTA.
+    fn add_sequence_collection_from_fasta_internal<P: AsRef<Path>>(&mut self, file_path: P, force: bool) -> Result<()> {
         println!("Loading farg index...");
         let seqcol = SequenceCollection::from_fasta(&file_path)?;
+        // Check if collection already exists and skip if not forcing
+        if !force && self.collections.contains_key(&seqcol.digest.to_key()) {
+            println!("Collection already exists, skipping (use force=true to overwrite)");
+            return Ok(());
+        }
+
         // Register the collection
-        self.add_sequence_collection(seqcol.clone())?;
+        self.add_sequence_collection_internal(seqcol.clone(), force)?;
         // Local hashmap to store SequenceMetadata (digests)
         let mut seqmeta_hashmap: HashMap<String, SequenceMetadata> = HashMap::new();
         let seqcol_sequences = seqcol.sequences.clone(); // Clone to avoid partial move
         for record in seqcol_sequences {
-            let seqmeta = record.metadata;
+            let seqmeta = record.metadata().clone();
             seqmeta_hashmap.insert(seqmeta.name.clone(), seqmeta);
         }
         let file_reader = get_dynamic_reader(file_path.as_ref())?;
         let mut fasta_reader = Reader::new(file_reader);
-        println!("Preparing to load sequences into refget SeqColStore...");
+        println!("Loading sequences into GlobalRefgetStore...");
+        let start_time = Instant::now();
+        let mut seq_count = 0;
         while let Some(record) = fasta_reader.next() {
             let record = record?;
-            let id = record.id()?;
-            let dr = seqmeta_hashmap[id].clone();
-            println!("Digest result: {:?}", dr);
+            let id = std::str::from_utf8(record.head())?;
+            let dr = seqmeta_hashmap.get(id)
+                .ok_or_else(|| {
+                    let available_keys: Vec<_> = seqmeta_hashmap.keys().collect();
+                    let total = available_keys.len();
+                    let sample: Vec<_> = available_keys.iter().take(3).collect();
+                    anyhow::anyhow!(
+                        "Sequence '{}' not found in metadata. Available ({} total): {:?}{}",
+                        id,
+                        total,
+                        sample,
+                        if total > 3 { " ..." } else { "" }
+                    )
+                })?
+                .clone();
+
+            seq_count += 1;
+            if seq_count <= 3 {
+                let display_name = if dr.name.len() > 120 {
+                    format!("{}...", &dr.name[..117])
+                } else {
+                    dr.name.clone()
+                };
+                println!("  [{}] {} ({} bp)", seq_count, display_name, dr.length);
+            } else if seq_count == 4 {
+                println!("  ...");
+            }
             match self.mode {
                 StorageMode::Raw => {
@@ -294,17 +567,15 @@ impl GlobalRefgetStore {
                     for seq_line in record.seq_lines() {
                         raw_sequence.extend(seq_line);
                     }
-                    println!(
-                        "Storing raw sequence. Name: {}; Alphabet: {}; Digest: {}",
-                        id, dr.alphabet, dr.sha512t24u
-                    );
+                    // Always replace Stubs with Full sequences from FASTA
                     self.add_sequence(
-                        SequenceRecord {
+                        SequenceRecord::Full {
                             metadata: dr,
-                            data: Some(raw_sequence),
+                            sequence: raw_sequence,
                         },
                         seqcol.digest.to_key(),
+                        true, // Always replace Stubs with Full
                     )?;
                 }
                 StorageMode::Encoded => {
@@ -316,57 +587,196 @@
                     // let encoded_sequence = BitVec::::from_vec(encoder.finalize());
                     let encoded_sequence = encoder.finalize();
-                    println!(
-                        "Storing encoded sequence. Name: {}; Alphabet: {}; Digest: {}",
-                        id, dr.alphabet, dr.sha512t24u
-                    );
+                    // Always replace Stubs with Full sequences from FASTA
                     self.add_sequence(
-                        SequenceRecord {
+                        SequenceRecord::Full {
                             metadata: dr,
-                            data: Some(encoded_sequence),
+                            sequence: encoded_sequence,
                         },
                         seqcol.digest.to_key(),
+                        true, // Always replace Stubs with Full
                     )?;
                 }
             }
         }
-        println!("Finished loading sequences into refget SeqColStore.");
+        let elapsed = start_time.elapsed();
+        let mode_str = match self.mode {
+            StorageMode::Raw => "Raw",
+            StorageMode::Encoded => "Encoded",
+        };
+        println!("Loaded {} sequences into GlobalRefgetStore ({}) in {:.2}s.", seq_count, mode_str, elapsed.as_secs_f64());
+
+        // Note: If cache_to_disk=true, sequences were already written to disk
+        // and replaced with stubs by add_sequence_record()
+
         Ok(())
     }
-    /// Returns a list of all sequence names in the store
-    pub fn list_sequence_digests(&self) -> Vec<[u8; 32]> {
-        self.sequence_store.keys().cloned().collect()
+    /// Returns an iterator over all sequence digests in the store
+    pub fn sequence_digests(&self) -> impl Iterator<Item = [u8; 32]> + '_ {
+        self.sequence_store.keys().cloned()
+    }
+
+    /// Returns an iterator over sequence metadata for all sequences in the store.
+    ///
+    /// This is a lightweight operation that returns only metadata (name, length, digests)
+    /// without loading sequence data.
+    ///
+    /// # Returns
+    /// An iterator over `SequenceMetadata` references.
+    ///
+    /// # Example
+    /// ```ignore
+    /// for metadata in store.sequence_metadata() {
+    ///     println!("{}: {} bp", metadata.name, metadata.length);
+    /// }
+    /// ```
+    pub fn sequence_metadata(&self) -> impl Iterator<Item = &SequenceMetadata> + '_ {
+        self.sequence_store.values().map(|rec| rec.metadata())
+    }
+
+    /// Calculate the total disk size of all sequences in the store
+    ///
+    /// This computes the disk space used by sequence data based on:
+    /// - Sequence length
+    /// - Alphabet type (bits per symbol)
+    /// - Storage mode (Raw or Encoded)
+    ///
+    /// # Returns
+    /// Total bytes used for sequence data on disk
+    ///
+    /// # Note
+    /// This only accounts for sequence data files (.seq), not metadata files
+    /// like FARG files, index.json, or directory overhead.
+    ///
+    /// # Examples
+    /// ```ignore
+    /// let mut store = GlobalRefgetStore::on_disk("store")?;
+    /// store.add_sequence_collection_from_fasta("genome.fa")?;
+    /// let disk_size = store.total_disk_size();
+    /// println!("Sequences use {} bytes on disk", disk_size);
+    /// ```
+    pub fn total_disk_size(&self) -> usize {
+        self.sequence_store
+            .values()
+            .map(|rec| rec.metadata().disk_size(&self.mode))
+            .sum()
+    }
+
+    /// Returns an iterator over all complete sequence records in the store.
+    ///
+    /// This returns full `SequenceRecord` objects including both metadata and sequence data.
+    /// Use `sequence_metadata()` if you only need metadata.
+    ///
+    /// # Returns
+    /// An iterator over `SequenceRecord` references.
+    ///
+    /// # Example
+    /// ```ignore
+    /// for record in store.sequence_records() {
+    ///     println!("{}: has_data={}", record.metadata().name, record.has_data());
+    /// }
+    /// ```
+    pub fn sequence_records(&self) -> impl Iterator<Item = &SequenceRecord> + '_ {
+        self.sequence_store.values()
+    }
+
+    /// Returns an iterator over all collection digests
+    pub fn collection_digests(&self) -> impl Iterator<Item = [u8; 32]> + '_ {
+        self.collections.keys().cloned()
+    }
+
+    /// Returns an iterator over all collections
+    pub fn collections(&self) -> impl Iterator<Item = &SequenceCollection> + '_ {
+        self.collections.values()
+    }
+
+    /// Returns the local path where the store is located (if any)
+    pub fn local_path(&self) -> Option<&PathBuf> {
+        self.local_path.as_ref()
+    }
+
+    /// Returns the remote source URL (if any)
+    pub fn remote_source(&self) -> Option<&str> {
+        self.remote_source.as_deref()
+    }
+
+    /// Returns the storage mode used by this store
+    pub fn storage_mode(&self) -> StorageMode {
+        self.mode
     }
     /// Retrieve a SequenceRecord from the store by its SHA512t24u digest
-    pub fn get_sequence_by_id<K: AsRef<[u8]>>(&self, seq_digest: K) -> Option<&SequenceRecord> {
-        self.sequence_store.get(&seq_digest.to_key())
+    pub fn get_sequence_by_id<K: AsRef<[u8]>>(&mut self, seq_digest: K) -> Option<&SequenceRecord> {
+        let digest_key = seq_digest.to_key();
+
+        // Ensure the sequence data is loaded
+        if let Err(e) = self.ensure_sequence_loaded(&digest_key) {
+            eprintln!("Failed to load sequence: {}", e);
+            return None;
+        }
+
+        self.sequence_store.get(&digest_key)
     }
     /// Retrieve a SequenceRecord from the store by its collection digest and name
     pub fn get_sequence_by_collection_and_name<K: AsRef<[u8]>>(
-        &self,
+        &mut self,
         collection_digest: K,
         sequence_name: &str,
     ) -> Option<&SequenceRecord> {
+        let collection_key = collection_digest.to_key();
+
+        // Try to ensure collection is loaded (lazy-load from remote if needed)
+        if let Err(e) = self.ensure_collection_loaded(&collection_key) {
+            eprintln!("Failed to load collection: {}", e);
+            return None;
+        }
+
         // Look up the collection by digest
-        if let Some(name_map) = self.name_lookup.get(&collection_digest.to_key()) {
+        let digest_key = if let Some(name_map) = self.name_lookup.get(&collection_key) {
             // Look up the sequence name in the collection's name map
-            if let Some(sha512t24u) = name_map.get(sequence_name) {
-                // Retrieve the sequence record from the store
-                return self.sequence_store.get(sha512t24u);
-            }
+            name_map.get(sequence_name).cloned()?
+        } else {
+            return None;
+        };
+
+        // Ensure the sequence data is loaded
+        if let Err(e) = self.ensure_sequence_loaded(&digest_key) {
+            eprintln!("Failed to load sequence: {}", e);
+            return None;
         }
-        None
+
+        // Retrieve the sequence record from the store
+        self.sequence_store.get(&digest_key)
     }
-    pub fn get_seqs_in_bed_file_iter<'a, K: AsRef<[u8]>>(
-        &'a self,
+    /// Get an iterator over substrings defined by BED file regions.
+    ///
+    /// Reads a BED file line-by-line and yields substrings for each region.
+    /// This is memory-efficient for large BED files as it streams results.
+    ///
+    /// # Arguments
+    /// * `collection_digest` - The collection digest containing the sequences
+    /// * `bed_file_path` - Path to the BED file defining regions
+    ///
+    /// # Returns
+    /// Iterator yielding a `Result` for each BED region
+    ///
+    /// # Example
+    /// ```ignore
+    /// let iter = store.substrings_from_regions(digest, "regions.bed")?;
+    /// for result in iter {
+    ///     let seq = result?;
+    ///     println!("{}:{}-{}: {}", seq.chrom_name, seq.start, seq.end, seq.sequence);
+    /// }
+    /// ```
+    pub fn substrings_from_regions<'a, K: AsRef<[u8]>>(
+        &'a mut self,
         collection_digest: K,
         bed_file_path: &str,
-    ) -> Result<GetSeqsBedFileIter<'a, K>, Box<dyn Error>> {
+    ) -> Result<SubstringsFromRegions<'a, K>, Box<dyn Error>> {
         let path = Path::new(bed_file_path);
         let file_info = get_file_info(path);
         let is_gzipped = file_info.is_gzipped;
@@ -379,7 +789,7 @@ impl GlobalRefgetStore {
         };
         let reader = BufReader::new(reader);
-        Ok(GetSeqsBedFileIter {
+        Ok(SubstringsFromRegions {
             store: self,
             reader,
             collection_digest,
@@ -389,31 +799,77 @@ impl GlobalRefgetStore {
         })
     }
-    /// Given a Sequence Collection digest and a bed file path
-    /// retrieve sequences and write to a file
-    pub fn get_seqs_bed_file<K: AsRef<[u8]>>(
-        &self,
+    /// Export sequences from BED file regions to a FASTA file.
+    ///
+    /// Reads a BED file defining genomic regions and exports the sequences
+    /// for those regions to a FASTA file. This is useful for extracting
+    /// specific regions of interest from a genome.
+    ///
+    /// # Arguments
+    /// * `collection_digest` - The collection digest containing the sequences
+    /// * `bed_file_path` - Path to the BED file defining regions
+    /// * `output_file_path` - Path to write the output FASTA file
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    ///
+    /// # Example
+    /// ```ignore
+    /// store.export_fasta_from_regions(
+    ///     digest,
+    ///     "regions.bed",
+    ///     "output.fa"
+    /// )?;
+    /// ```
+    pub fn export_fasta_from_regions<K: AsRef<[u8]>>(
+        &mut self,
         collection_digest: K,
         bed_file_path: &str,
         output_file_path: &str,
     ) -> Result<(), Box<dyn Error>> {
         // Set up the output path and create directories if they don't exist
-        let output_path = Path::new(output_file_path).parent().ok_or_else(|| {
-            io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "Invalid output file path: parent directory not found",
-            )
-        })?;
+        let output_path_obj = Path::new(output_file_path);
+        if let Some(parent) = output_path_obj.parent() {
+            create_dir_all(parent)?;
+        }
-        create_dir_all(output_path)?;
+        // Create output file with optional gzip compression
+        let file = File::create(output_file_path)?;
-        // Open file for writing to
-        let mut output_file = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(output_file_path)?;
+        let mut writer: Box<dyn Write> = if output_path_obj.extension() == Some(OsStr::new("gz")) {
+            Box::new(GzEncoder::new(file, Compression::default()))
+        } else {
+            Box::new(file)
+        };
+
+        // Pre-fetch all sequence metadata from the collection to avoid borrowing issues
+        let collection_key = collection_digest.as_ref().to_key();
+        let name_to_metadata: HashMap<String, (String, usize, AlphabetType, String, String)> = self
+            .name_lookup
+            .get(&collection_key)
+            .map(|name_map| {
+                name_map
+                    .iter()
+                    .filter_map(|(name, seq_digest)| {
+                        self.sequence_store.get(seq_digest).map(|record| {
+                            let metadata = record.metadata();
+                            (
+                                name.clone(),
+                                (
+                                    metadata.name.clone(),
+                                    metadata.length,
+                                    metadata.alphabet,
+                                    metadata.sha512t24u.clone(),
+                                    metadata.md5.clone(),
+                                ),
+                            )
+                        })
+                    })
+                    .collect()
+            })
+            .unwrap_or_default();
-        let seq_iter = self.get_seqs_in_bed_file_iter(&collection_digest, bed_file_path)?;
+        let seq_iter = self.substrings_from_regions(&collection_digest, bed_file_path)?;
        let mut previous_parsed_chr = String::new();
        let mut current_header: String = String::new();
@@ -429,18 +885,15 @@
             if previous_parsed_chr != rs.chrom_name {
                 previous_parsed_chr = rs.chrom_name.clone();
-                // If we get this far, the metadata **must** exist, so we will unwrap the present
-                let seq_info = self
-                    .get_sequence_by_collection_and_name(&collection_digest, &rs.chrom_name)
-                    .unwrap();
-                current_header = format!(
-                    ">{} {} {} {} {}",
-                    seq_info.metadata.name,
-                    seq_info.metadata.length,
-                    seq_info.metadata.alphabet,
-                    seq_info.metadata.sha512t24u,
-                    seq_info.metadata.md5
-                );
+                // Look up metadata from our pre-fetched map
+                if let Some((name, length, alphabet, sha512, md5)) =
+                    name_to_metadata.get(&rs.chrom_name)
+                {
+                    current_header = format!(
+                        ">{} {} {} {} {}",
+                        name, length, alphabet, sha512, md5
+                    );
+                }
             }
             let retrieved_substring = rs.sequence;
@@ -452,47 +905,31 @@
             // Combine the prefix, current_header, and a trailing newline
             let header_to_be_written = format!("{}{}\n", prefix, current_header);
-            output_file.write_all(header_to_be_written.as_bytes())?;
+            writer.write_all(header_to_be_written.as_bytes())?;
             }
-            output_file.write_all(retrieved_substring.as_ref())?;
+            writer.write_all(retrieved_substring.as_ref())?;
         }
-        Ok(())
-    }
+        // Ensure all data is flushed (important for gzip)
+        writer.flush()?;
-    /// Given a Sequence Collection digest and a bed file path,
-    /// retrieve sequences and return them as a Vec.
-    /// Lines that cannot be parsed or sequences that cannot be retrieved will be skipped
-    pub fn get_seqs_bed_file_to_vec<K: AsRef<[u8]>>(
-        &self,
-        collection_digest: K,
-        bed_file_path: &str,
-    ) -> Result<Vec<RetrievedSequence>, Box<dyn Error>> {
-        let seq_iter = self.get_seqs_in_bed_file_iter(collection_digest, bed_file_path)?;
-
-        let retrieved_sequences = seq_iter
-            .into_iter()
-            .filter_map(|rs| match rs {
-                Ok(rs) => Some(rs),
-                Err(err) => {
-                    eprintln!("{err}");
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-
-        Ok(retrieved_sequences)
+        Ok(())
     }
     /// Retrieve a SequenceRecord from the store by its MD5 digest
-    pub fn get_sequence_by_md5<K: AsRef<[u8]>>(&self, seq_md5: K) -> Option<&SequenceRecord> {
+    pub fn get_sequence_by_md5<K: AsRef<[u8]>>(&mut self, seq_md5: K) -> Option<&SequenceRecord> {
         // Look up the SHA512t24u digest using the MD5 lookup
-        if let Some(sha512_digest) = self.md5_lookup.get(&seq_md5.to_key()) {
-            // Retrieve the sequence record from the store
-            return self.sequence_store.get(sha512_digest);
+        let sha512_digest = self.md5_lookup.get(&seq_md5.to_key()).cloned()?;
+
+        // Ensure the sequence data is loaded
+        if let Err(e) = self.ensure_sequence_loaded(&sha512_digest) {
+            eprintln!("Failed to load sequence: {}", e);
+            return None;
        }
-        None
+
+        // Retrieve the sequence record from the store
+        self.sequence_store.get(&sha512_digest)
    }
    /// Retrieves a substring from an encoded sequence by its SHA512t24u digest.
@@ -507,25 +944,36 @@ impl GlobalRefgetStore {
     ///
     /// The substring if the sequence is found, or None if not found
     pub fn get_substring<K: AsRef<[u8]>>(
-        &self,
+        &mut self,
         sha512_digest: K,
         start: usize,
         end: usize,
     ) -> Option<String> {
-        let record = self.sequence_store.get(&sha512_digest.to_key())?;
-        let sequence = record.data.as_ref()?;
+        let digest_key = sha512_digest.to_key();
+
+        // Ensure the sequence data is loaded
+        if let Err(e) = self.ensure_sequence_loaded(&digest_key) {
+            eprintln!("Failed to load sequence: {}", e);
+            return None;
+        }
-        if start >= record.metadata.length || end > record.metadata.length || start >= end {
+        let record = self.sequence_store.get(&digest_key)?;
+        let (metadata, sequence) = match record {
+            SequenceRecord::Stub(_) => return None,
+            SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
+        };
+
+        if start >= metadata.length || end > metadata.length || start >= end {
             println!(
                 "Invalid substring range: start={}, end={}, sequence length={}",
-                start, end, record.metadata.length
+                start, end, metadata.length
             );
             return None;
         }
         match self.mode {
             StorageMode::Encoded => {
-                let alphabet = lookup_alphabet(&record.metadata.alphabet);
+                let alphabet = lookup_alphabet(&metadata.alphabet);
                 let decoded_sequence = decode_substring_from_bytes(sequence, start, end, alphabet);
                 Some(String::from_utf8(decoded_sequence).expect("Invalid UTF-8"))
             }
@@ -543,6 +991,196 @@
         }
     }
+    /// Export sequences from a collection to a FASTA file
+    ///
+    /// # Arguments
+    /// * `collection_digest` - The digest of the collection to export from
+    /// * `output_path` - Path to write the FASTA file
+    /// * `sequence_names` - Optional list of sequence names to export.
+    ///   If None, exports all sequences in the collection.
+    /// * `line_width` - Optional line width for wrapping sequences (default: 80)
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    pub fn export_fasta<K: AsRef<[u8]>, P: AsRef<Path>>(
+        &mut self,
+        collection_digest: K,
+        output_path: P,
+        sequence_names: Option<Vec<String>>,
+        line_width: Option<usize>,
+    ) -> Result<()> {
+        let line_width = line_width.unwrap_or(80);
+        let output_path = output_path.as_ref();
+        let collection_key = collection_digest.as_ref().to_key();
+
+        // Get the name map for this collection and build a map of name -> digest
+        let name_to_digest: HashMap<String, [u8; 32]> = self
+            .name_lookup
+            .get(&collection_key)
+            .ok_or_else(|| {
+                anyhow!("Collection not found: {:?}", String::from_utf8_lossy(collection_digest.as_ref()))
+            })?
+            .clone();
+
+        // Determine which sequences to export
+        let names_to_export: Vec<String> = if let Some(names) = sequence_names {
+            // Filter to only requested names
+            names.iter().map(|s| s.to_string()).collect()
+        } else {
+            // Export all sequences in the collection
+            name_to_digest.keys().cloned().collect()
+        };
+
+        // Create output file with optional gzip compression
+        let file = File::create(output_path)
+            .context(format!("Failed to create output file: {}", output_path.display()))?;
+
+        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
+            Box::new(GzEncoder::new(file, Compression::default()))
+        } else {
+            Box::new(file)
+        };
+
+        // Export each sequence
+        for seq_name in names_to_export {
+            // Get the sequence digest from the name map
+            let seq_digest = name_to_digest.get(&seq_name).ok_or_else(|| {
+                anyhow!("Sequence '{}' not found in collection", seq_name)
+            })?;
+
+            // Ensure sequence is loaded
+            self.ensure_sequence_loaded(seq_digest)?;
+
+            // Get the sequence record
+            let record = self.sequence_store.get(seq_digest).ok_or_else(|| {
+                anyhow!("Sequence record not found for digest: {:?}", seq_digest)
+            })?;
+
+            // Get the sequence data
+            let (metadata, sequence_data) = match record {
+                SequenceRecord::Stub(_) => {
+                    return Err(anyhow!("Sequence data not loaded for '{}'", seq_name));
+                }
+                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
+            };
+
+            // Decode the sequence based on storage mode
+            let decoded_sequence = match self.mode {
+                StorageMode::Encoded => {
+                    let alphabet = lookup_alphabet(&metadata.alphabet);
+                    let decoded = decode_substring_from_bytes(
+                        sequence_data,
+                        0,
+                        metadata.length,
+                        alphabet,
+                    );
+                    String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
+                }
+                StorageMode::Raw => {
+                    String::from_utf8(sequence_data.clone())
+                        .context("Failed to decode raw sequence as UTF-8")?
+                }
+            };
+
+            // Write FASTA header
+            writeln!(writer, ">{}", seq_name)?;
+
+            // Write sequence with line wrapping
+            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
+                writer.write_all(chunk)?;
+                writer.write_all(b"\n")?;
+            }
+        }
+
+        // Ensure all data is flushed (important for gzip)
+        writer.flush()?;
+
+        Ok(())
+    }
+
+    /// Export sequences by their sequence digests to a FASTA file
+    ///
+    /// Bypasses collection information and exports sequences directly via sequence digests.
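+    /// A usage sketch; the digest value below is illustrative:
+    /// ```ignore
+    /// store.export_fasta_by_digests(
+    ///     vec!["Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"],
+    ///     "selected.fa.gz",
+    ///     Some(60),
+    /// )?;
+    /// ```
+    ///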
+    /// # Arguments
+    /// * `seq_digests` - List of SHA512t24u sequence digests (not collection digests) to export
+    /// * `output_path` - Path to write the FASTA file
+    /// * `line_width` - Optional line width for wrapping sequences (default: 80)
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    pub fn export_fasta_by_digests<P: AsRef<Path>>(
+        &mut self,
+        seq_digests: Vec<&str>,
+        output_path: P,
+        line_width: Option<usize>,
+    ) -> Result<()> {
+        let line_width = line_width.unwrap_or(80);
+        let output_path = output_path.as_ref();
+
+        // Create output file with optional gzip compression
+        let file = File::create(output_path)
+            .context(format!("Failed to create output file: {}", output_path.display()))?;
+
+        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
+            Box::new(GzEncoder::new(file, Compression::default()))
+        } else {
+            Box::new(file)
+        };
+
+        // Export each sequence
+        for digest_str in seq_digests {
+            let digest_key = digest_str.as_bytes().to_key();
+
+            // Ensure sequence is loaded
+            self.ensure_sequence_loaded(&digest_key)?;
+
+            // Get the sequence record
+            let record = self.sequence_store.get(&digest_key).ok_or_else(|| {
+                anyhow!("Sequence record not found for digest: {}", digest_str)
+            })?;
+
+            // Get the sequence data
+            let (metadata, sequence_data) = match record {
+                SequenceRecord::Stub(_) => {
+                    return Err(anyhow!("Sequence data not loaded for digest: {}", digest_str));
+                }
+                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
+            };
+
+            // Decode the sequence based on storage mode
+            let decoded_sequence = match self.mode {
+                StorageMode::Encoded => {
+                    let alphabet = lookup_alphabet(&metadata.alphabet);
+                    let decoded = decode_substring_from_bytes(
+                        sequence_data,
+                        0,
+                        metadata.length,
+                        alphabet,
+                    );
+                    String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
+                }
+                StorageMode::Raw => {
+                    String::from_utf8(sequence_data.clone())
+                        .context("Failed to decode raw sequence as UTF-8")?
+                }
+            };
+
+            // Write FASTA header with sequence name
+            writeln!(writer, ">{}", metadata.name)?;
+
+            // Write sequence with line wrapping
+            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
+                writer.write_all(chunk)?;
+                writer.write_all(b"\n")?;
+            }
+        }
+
+        // Ensure all data is flushed (important for gzip)
+        writer.flush()?;
+
+        Ok(())
+    }
+
     /// Helper function to get the relative path for a sequence based on its SHA512t24u digest string
     fn get_sequence_path(digest_str: &str, template: &str) -> PathBuf {
         let path_str = template
@@ -552,9 +1190,153 @@ impl GlobalRefgetStore {
         PathBuf::from(path_str)
     }
-    /// Load a GlobalRefgetStore from a directory
-    pub fn load_from_directory<P: AsRef<Path>>(root_path: P) -> Result<Self> {
-        let root_path = root_path.as_ref();
+    /// Write a single sequence to disk using the configured path template
+    fn write_sequence_to_disk_single(
+        &self,
+        metadata: &SequenceMetadata,
+        sequence: &[u8],
+    ) -> Result<()> {
+        let template = self.seqdata_path_template.as_ref()
+            .context("seqdata_path_template not set")?;
+        let local_path = self.local_path.as_ref()
+            .context("local_path not set")?;
+
+        // Build path using template
+        let seq_file_path = Self::get_sequence_path(&metadata.sha512t24u, template);
+        let full_path = local_path.join(&seq_file_path);
+
+        // Create parent directory
+        if let Some(parent) = full_path.parent() {
+            create_dir_all(parent)?;
+        }
+
+        // Write sequence data
+        let mut file = File::create(&full_path)?;
+        file.write_all(sequence)?;
+
+        Ok(())
+    }
+
+    /// Write a single collection FARG file to disk
+    /// Used when cache_to_disk=true to persist collections incrementally
+    fn write_collection_to_disk_single(&self, collection: &SequenceCollection) -> Result<()> {
+        let local_path = self.local_path.as_ref()
+            .context("local_path not set")?;
+
+        // Build path: collections/{digest}.farg
+        let coll_file_path = format!("collections/{}.farg", collection.digest);
+        let full_path = local_path.join(&coll_file_path);
+
+        // Create parent directory
+        if let Some(parent) = full_path.parent() {
+            create_dir_all(parent)?;
+        }
+
+        // Write collection FARG file
+        collection.write_collection_farg(&full_path)?;
+
+        Ok(())
+    }
+
+    /// Write index files (sequences.farg and index.json) to disk
+    ///
+    /// This allows the store to be loaded later via load_local().
+    /// Called automatically when adding collections in disk-backed mode.
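+    ///
+    /// The resulting `index.json` is shaped roughly like this (values illustrative):
+    /// ```ignore
+    /// {
+    ///   "version": 1,
+    ///   "seqdata_path_template": "sequences/%s2/%s.seq",
+    ///   "collections_path_template": "collections/%s.farg",
+    ///   "sequence_index": "sequences.farg",
+    ///   "mode": "Encoded",
+    ///   "created_at": "2025-01-01T00:00:00+00:00"
+    /// }
+    /// ```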
+    fn write_index_files(&self) -> Result<()> {
+        let local_path = self.local_path.as_ref()
+            .context("local_path not set")?;
+        let template = self.seqdata_path_template.as_ref()
+            .context("seqdata_path_template not set")?;
+
+        // Write the sequence metadata index file
+        let sequence_index_path = local_path.join("sequences.farg");
+        self.write_sequences_farg(&sequence_index_path)?;
+
+        // Create the metadata structure
+        let metadata = StoreMetadata {
+            version: 1,
+            seqdata_path_template: template.clone(),
+            collections_path_template: "collections/%s.farg".to_string(),
+            sequence_index: "sequences.farg".to_string(),
+            mode: self.mode,
+            created_at: Utc::now().to_rfc3339(),
+        };
+
+        // Write metadata to index.json
+        let json = serde_json::to_string_pretty(&metadata)
+            .context("Failed to serialize metadata to JSON")?;
+        fs::write(local_path.join("index.json"), json)
+            .context("Failed to write index.json")?;
+
+        Ok(())
+    }
+
+    /// Helper function to fetch a file from local path or remote source
+    /// Returns the file contents as Vec<u8>
+    fn fetch_file(
+        local_path: &Option<PathBuf>,
+        remote_source: &Option<String>,
+        relative_path: &str,
+        cache_to_disk: bool,
+    ) -> Result<Vec<u8>> {
+        // Check if file exists locally (only if caching is enabled and path exists)
+        if cache_to_disk {
+            if let Some(local_path) = local_path {
+                let full_local_path = local_path.join(relative_path);
+                if full_local_path.exists() {
+                    return fs::read(&full_local_path)
+                        .context(format!("Failed to read local file: {}", full_local_path.display()));
+                }
+            }
+        }
+
+        // If not local and we have a remote source, fetch from remote
+        if let Some(remote_url) = remote_source {
+            let full_remote_url = if remote_url.ends_with('/') {
+                format!("{}{}", remote_url, relative_path)
+            } else {
+                format!("{}/{}", remote_url, relative_path)
+            };
+
+            let response = ureq::get(&full_remote_url)
+                .call()
+                .map_err(|e| anyhow!("Failed to fetch from remote: {}", e))?;
+
+            let mut data = Vec::new();
+            response
+                .into_reader()
+                .read_to_end(&mut data)
+                .context("Failed to read response body")?;
+
+            // Save to local cache only if caching is enabled
+            if cache_to_disk {
+                if let Some(local_path) = local_path {
+                    let full_local_path = local_path.join(relative_path);
+
+                    // Create parent directory if needed
+                    if let Some(parent) = full_local_path.parent() {
+                        create_dir_all(parent)?;
+                    }
+
+                    // Save to disk
+                    fs::write(&full_local_path, &data)
+                        .context(format!("Failed to cache file to: {}", full_local_path.display()))?;
+                }
+            }
+
+            Ok(data)
+        } else {
+            Err(anyhow!(
+                "File not found locally and no remote source configured: {}",
+                relative_path
+            ))
+        }
+    }
+
+    /// Load a local RefgetStore from a directory
+    /// This loads metadata only; sequence data is loaded on-demand
+    pub fn load_local<P: AsRef<Path>>(cache_path: P) -> Result<Self> {
+        let root_path = cache_path.as_ref();
         // Read and parse index.json
         let index_path = root_path.join("index.json");
@@ -568,8 +1350,11 @@ impl GlobalRefgetStore {
         // Create a new empty store with the correct mode
         let mut store = GlobalRefgetStore::new(metadata.mode);
+        store.local_path = Some(root_path.to_path_buf());
+        store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
+        store.cache_to_disk = true; // Local stores always use disk
-        // Load sequence metadata from the sequence index file
+        // Load sequence metadata from the sequence index file (metadata only, no data)
         let sequence_index_path = root_path.join(&metadata.sequence_index);
         if sequence_index_path.exists() {
             let file = std::fs::File::open(&sequence_index_path)?;
@@ -595,33 +1380,11 @@ impl GlobalRefgetStore {
                     alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
                     sha512t24u: parts[3].to_string(),
                     md5: parts[4].to_string(),
+                    fai: None, // Lazy loading doesn't preserve FAI data
                 };
-                // Generate the path using the template
-                let path_str = metadata
-                    .seqdata_path_template
-                    .replace("%s2", &seq_metadata.sha512t24u[0..2])
-                    .replace("%s4", &seq_metadata.sha512t24u[0..4])
-                    .replace("%s", &seq_metadata.sha512t24u);
-                let seq_path = root_path.join(path_str);
-
-                // Open and memory-map the sequence file
-                let file = File::open(&seq_path).context(format!(
-                    "Failed to open sequence file: {}",
-                    seq_path.display()
-                ))?;
-
-                let mmap =
-                    unsafe { Mmap::map(&file) }.context("Failed to memory-map sequence file")?;
-
-                // Convert to Vec for storage
-                let data = mmap.to_vec();
-
-                // Create a SequenceRecord
-                let record = SequenceRecord {
-                    metadata: seq_metadata.clone(),
-                    data: Some(data),
-                };
+                // Create a SequenceRecord with no data (lazy loading)
+                let record = SequenceRecord::Stub(seq_metadata.clone());
                 // Add to store
                 let sha512_key = seq_metadata.sha512t24u.to_key();
@@ -653,8 +1416,127 @@ impl GlobalRefgetStore {
             // Build name lookup for this collection
             let mut name_map = HashMap::new();
             for sequence_record in &collection.sequences {
-                let sha512_key = sequence_record.metadata.sha512t24u.to_key();
-                name_map.insert(sequence_record.metadata.name.clone(), sha512_key);
+                let metadata = sequence_record.metadata();
+                let sha512_key = metadata.sha512t24u.to_key();
+                name_map.insert(metadata.name.clone(), sha512_key);
             }
             store.name_lookup.insert(collection_digest, name_map);
         }
     }
 }

 Ok(store)
 }

+    /// Load a remote-backed RefgetStore
+    /// This loads metadata from remote and caches sequence data on-demand
+    ///
+    /// # Arguments
+    /// * `cache_path` - Local directory for caching (used even if cache_to_disk is false for temp operations)
+    /// * `remote_url` - Remote URL to fetch data from
+    /// * `cache_to_disk` - If true, cache fetched data to disk; if false, keep only in memory
+    pub fn load_remote<P: AsRef<Path>, S: AsRef<str>>(
+        cache_path: P,
+        remote_url: S,
+        cache_to_disk: bool,
+    ) -> Result<Self> {
+        let cache_path = cache_path.as_ref();
+        let remote_url = remote_url.as_ref().to_string();
+
+        // Create cache directory (needed for metadata even in memory-only mode)
+        create_dir_all(cache_path)?;
+
+        // Fetch index.json from remote (always cache metadata - it's small)
+        let index_data = Self::fetch_file(&Some(cache_path.to_path_buf()), &Some(remote_url.clone()), "index.json", true)?;
+        let json = String::from_utf8(index_data)
+            .context("index.json contains invalid UTF-8")?;
+
+        let metadata: StoreMetadata =
+            serde_json::from_str(&json).context("Failed to parse index.json")?;
+
+        // Create a new empty store with the correct mode
+        let mut store = GlobalRefgetStore::new(metadata.mode);
+        store.local_path = Some(cache_path.to_path_buf());
+        store.remote_source = Some(remote_url.clone());
+        store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
+        store.cache_to_disk = cache_to_disk;
+
+        // Fetch sequence index from remote (always cache metadata - it's small)
+        let sequence_index_data = Self::fetch_file(
+            &Some(cache_path.to_path_buf()),
+            &Some(remote_url.clone()),
+            &metadata.sequence_index,
+            true, // Always cache metadata
+        )?;
+        let sequence_index_str = String::from_utf8(sequence_index_data)
+            .context("sequence index contains invalid UTF-8")?;
+
+        // Parse sequence metadata (metadata only, no data)
+        for line in
sequence_index_str.lines() { + // Skip comment lines + if line.starts_with('#') { + continue; + } + + // Parse sequence metadata lines + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() != 5 { + continue; + } + + let seq_metadata = SequenceMetadata { + name: parts[0].to_string(), + length: parts[1].parse().unwrap_or(0), + alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown), + sha512t24u: parts[3].to_string(), + md5: parts[4].to_string(), + fai: None, // Lazy loading doesn't preserve FAI data + }; + + // Create a SequenceRecord with no data (lazy loading) + let record = SequenceRecord::Stub(seq_metadata.clone()); + + // Add to store + let sha512_key = seq_metadata.sha512t24u.to_key(); + store.sequence_store.insert(sha512_key, record); + + // Add to MD5 lookup + let md5_key = seq_metadata.md5.to_key(); + store.md5_lookup.insert(md5_key, sha512_key); + } + + // Load collections from the collections directory + let collections_dir_path = "collections"; + let local_collections_dir = cache_path.join(collections_dir_path); + + // Try to read from local cache first, or create if doesn't exist + if !local_collections_dir.exists() { + create_dir_all(&local_collections_dir)?; + } + + // Note: We'll try to fetch collection files on-demand when needed + // For now, just check if any exist locally + if local_collections_dir.exists() { + for entry in fs::read_dir(&local_collections_dir)? { + let entry = entry?; + let path = entry.path(); + + if path.is_file() && path.extension() == Some(std::ffi::OsStr::new("farg")) { + // Load the collection from the .farg file + let collection = read_fasta_refget_file(&path)?; + let collection_digest = collection.digest.to_key(); + + // Add collection to store + store + .collections + .insert(collection_digest, collection.clone()); + + // Build name lookup for this collection + let mut name_map = HashMap::new(); + for sequence_record in &collection.sequences { + let metadata = sequence_record.metadata(); + let sha512_key = metadata.sha512t24u.to_key(); + name_map.insert(metadata.name.clone(), sha512_key); } store.name_lookup.insert(collection_digest, name_map); } @@ -664,20 +1546,124 @@ impl GlobalRefgetStore { Ok(store) } - /// Write the sequence_store component to a FARG file (without collection headers). - /// * `file_path` - The path to the FARG file to be written. 
- pub fn to_farg>(&self, file_path: P) -> Result<()> { - // Write the FARGI file + /// Ensure a collection is loaded into the store + /// If the collection is not already loaded, try to fetch it from local or remote + fn ensure_collection_loaded(&mut self, collection_digest: &[u8; 32]) -> Result<()> { + // Check if collection is already loaded + if self.name_lookup.contains_key(collection_digest) { + return Ok(()); + } + + // Collection not found - try to fetch it if we have a remote source + // Convert the collection digest to a string for the path + let digest_str = String::from_utf8_lossy(collection_digest); + + // Build the collection file path using the template + // Default template is "collections/%s.farg" + let relative_path = format!("collections/{}.farg", digest_str); + + // Try to fetch the collection file + // Always cache metadata files (they're small), even when cache_to_disk is false + // The memory-only benefit comes from not caching large sequence data files + let _collection_data = Self::fetch_file(&self.local_path, &self.remote_source, &relative_path, true)?; + + // Read the collection from the cached file + let collection_file_path = self.local_path + .as_ref() + .ok_or_else(|| anyhow!("No local path configured"))? + .join(&relative_path); + + let collection = read_fasta_refget_file(&collection_file_path)?; + + // Verify the collection digest matches what we requested + let loaded_digest = collection.digest.to_key(); + if loaded_digest != *collection_digest { + eprintln!( + "Warning: Collection digest mismatch. Expected {:?}, got {:?}", + String::from_utf8_lossy(collection_digest), + String::from_utf8_lossy(&loaded_digest) + ); + } + + // Add collection to store + self.collections.insert(*collection_digest, collection.clone()); + + // Build name lookup for this collection + let mut name_map = HashMap::new(); + for sequence_record in &collection.sequences { + let metadata = sequence_record.metadata(); + let sha512_key = metadata.sha512t24u.to_key(); + name_map.insert(metadata.name.clone(), sha512_key); + } + self.name_lookup.insert(*collection_digest, name_map); + + Ok(()) + } + + /// Ensure a sequence is loaded into memory + /// If the sequence data is not already loaded, fetch it from local or remote + fn ensure_sequence_loaded(&mut self, digest: &[u8; 32]) -> Result<()> { + // Check if sequence exists + let record = self + .sequence_store + .get(digest) + .ok_or_else(|| anyhow!("Sequence not found in store"))?; + + // If data is already loaded, return early + if matches!(record, SequenceRecord::Full { .. }) { + return Ok(()); + } + + // Get the necessary information before borrowing mutably + let digest_str = &record.metadata().sha512t24u; + let template = self + .seqdata_path_template + .as_ref() + .ok_or_else(|| anyhow!("No sequence data path template configured"))?; + + // Build the relative path using the template + let relative_path = template + .replace("%s2", &digest_str[0..2]) + .replace("%s4", &digest_str[0..4]) + .replace("%s", digest_str); + + // Fetch the sequence data + // Use cache_to_disk flag - this is where memory-only mode saves disk I/O + let data = Self::fetch_file(&self.local_path, &self.remote_source, &relative_path, self.cache_to_disk)?; + + // Update the record with the loaded data + self.sequence_store.entry(*digest).and_modify(|r| { + *r = r.clone().with_data(data); + }); + + Ok(()) + } + + /// Write all sequence metadata to a FARG file (without collection headers). 
+    ///
+    /// Creates a global sequence index file containing metadata for all sequences
+    /// in the store across all collections. Does not include collection-level digest headers.
+    ///
+    /// # Arguments
+    /// * `file_path` - The path to the FARG file to be written
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    ///
+    /// # Notes
+    /// For writing individual collection FARG files with collection headers,
+    /// use `SequenceCollection::write_collection_farg()` instead.
+    pub fn write_sequences_farg<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
         let file_path = file_path.as_ref();
-        println!("Writing farg file: {:?}", file_path);
+        println!("Writing sequences farg file: {:?}", file_path);
         let mut file = std::fs::File::create(file_path)?;
-        // Write header with digest metadata
+        // Write header with column names
         writeln!(file, "#name\tlength\talphabet\tsha512t24u\tmd5")?;
-        // Write sequence data
+        // Write sequence metadata for all sequences
         for result_sr in self.sequence_store.values() {
-            let result = result_sr.metadata.clone();
+            let result = result_sr.metadata().clone();
             writeln!(
                 file,
                 "{}\t{}\t{}\t{}\t{}",
@@ -687,17 +1673,50 @@
         Ok(())
     }
+    /// Write the store using its configured paths
+    ///
+    /// For disk-backed stores (on_disk), this updates index files only since
+    /// sequences/collections are already written incrementally.
+    /// For in-memory stores, this is not supported (use write_store_to_dir instead).
+    ///
+    /// # Returns
+    /// Result indicating success or error
+    ///
+    /// # Errors
+    /// Returns an error if the store is not disk-backed (`cache_to_disk` is false).
+    ///
+    /// # Example
+    /// ```ignore
+    /// let mut store = GlobalRefgetStore::on_disk("/data/store")?;
+    /// store.add_sequence_collection_from_fasta("genome.fa")?;
+    /// store.write()?; // Updates index files
+    /// ```
+    pub fn write(&self) -> Result<()> {
+        if !self.cache_to_disk {
+            return Err(anyhow!("write() only works with disk-backed stores - use write_store_to_dir() instead"));
+        }
+
+        // For disk-backed stores, just update indexes (sequences/collections already written)
+        self.write_index_files()
+    }
+
     /// Write a GlobalRefgetStore object to a directory
-    pub fn write_store_to_directory<P: AsRef<Path>>(
+    pub fn write_store_to_dir<P: AsRef<Path>>(
         &self,
         root_path: P,
-        seqdata_path_template: &str,
+        seqdata_path_template: Option<&str>,
     ) -> Result<()> {
         let root_path = root_path.as_ref();
+
+        // Use provided template, or store's template, or default
+        let template = seqdata_path_template
+            .or(self.seqdata_path_template.as_deref())
+            .unwrap_or(DEFAULT_SEQDATA_PATH_TEMPLATE);
+
         println!(
             "Writing store to directory: {}; Using seqdata path template: {}",
             root_path.display(),
-            seqdata_path_template
+            template
         );
         // Create the root directory if it doesn't exist
@@ -713,19 +1732,20 @@
         // Write each sequence to its own file
         for record in self.sequence_store.values() {
-            if record.data.is_some() {
-                // Get the path for this sequence using the template and base64url-encoded digest
-                let rel_path =
-                    Self::get_sequence_path(&record.metadata.sha512t24u, seqdata_path_template);
-                let full_path = root_path.join(&rel_path);
-
-                // Write the sequence data to file
-                record.to_file(full_path)?;
-            } else {
-                return Err(anyhow!(
-                    "Sequence data is missing for digest: {}",
-                    &record.metadata.sha512t24u
-                ));
+            match record {
+                SequenceRecord::Full { metadata, ..
} => { + // Get the path for this sequence using the template and base64url-encoded digest + let rel_path = + Self::get_sequence_path(&metadata.sha512t24u, template); + let full_path = root_path.join(&rel_path); + + // Write the sequence data to file + record.to_file(full_path)?; + } + SequenceRecord::Stub(_metadata) => { + // Stub means sequence already on disk - skip writing + continue; + } } } @@ -733,17 +1753,17 @@ impl GlobalRefgetStore { for collection in self.collections.values() { let collection_file_path = root_path.join(format!("collections/{}.farg", collection.digest)); - collection.to_farg_path(&collection_file_path)?; + collection.write_collection_farg(&collection_file_path)?; } // Write the sequence metadata index file let sequence_index_path = root_path.join("sequences.farg"); - self.to_farg(&sequence_index_path)?; + self.write_sequences_farg(&sequence_index_path)?; // Create the metadata structure let metadata = StoreMetadata { version: 1, - seqdata_path_template: seqdata_path_template.to_string(), + seqdata_path_template: template.to_string(), collections_path_template: "collections/%s.farg".to_string(), sequence_index: "sequences.farg".to_string(), mode: self.mode, @@ -757,30 +1777,71 @@ impl GlobalRefgetStore { Ok(()) } + + /// Returns statistics about the store + /// + /// # Returns + /// A tuple of (n_sequences, n_collections, storage_mode_str) + pub fn stats(&self) -> (usize, usize, &'static str) { + let n_sequences = self.sequence_store.len(); + let n_collections = self.collections.len(); + let mode_str = match self.mode { + StorageMode::Raw => "Raw", + StorageMode::Encoded => "Encoded", + }; + (n_sequences, n_collections, mode_str) + } +} + +/// Format bytes into human-readable size (KB, MB, GB, etc.) +fn format_bytes(bytes: usize) -> String { + const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"]; + let mut size = bytes as f64; + let mut unit_idx = 0; + + while size >= 1024.0 && unit_idx < UNITS.len() - 1 { + size /= 1024.0; + unit_idx += 1; + } + + if unit_idx == 0 { + format!("{} {}", bytes, UNITS[0]) + } else { + format!("{:.2} {}", size, UNITS[unit_idx]) + } } impl Display for GlobalRefgetStore { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let total_size = self.total_disk_size(); + let size_str = format_bytes(total_size); writeln!(f, "SeqColStore object:")?; + writeln!(f, " Mode: {:?}", self.mode)?; + writeln!(f, " Disk size: {} ({} bytes)", size_str, total_size)?; writeln!(f, ">Sequences (n={}):", self.sequence_store.len())?; // Print out the sequences in the store for (i, (sha512_digest, sequence_record)) in self.sequence_store.iter().take(10).enumerate() { - let seq = sequence_record.data.as_ref().unwrap(); - // Extract the first 3 characters of the sequence (or fewer if the sequence is shorter) - let first_8_chars = match self.mode { - StorageMode::Encoded => { - let alphabet = lookup_alphabet(&sequence_record.metadata.alphabet); - let decoded = decode_substring_from_bytes( - seq, - 0, - 8.min(sequence_record.metadata.length), - alphabet, - ); - String::from_utf8(decoded).unwrap_or_else(|_| "???".to_string()) + let metadata = sequence_record.metadata(); + let first_8_chars = match sequence_record { + SequenceRecord::Stub(_) => "".to_string(), + SequenceRecord::Full { metadata, sequence: seq } => { + // Extract the first 8 characters of the sequence (or fewer if the sequence is shorter) + match self.mode { + StorageMode::Encoded => { + let alphabet = lookup_alphabet(&metadata.alphabet); + let decoded = decode_substring_from_bytes( + seq, + 
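// --- Editorial example (not part of this diff): expected behavior of
// format_bytes() above, derived directly from its loop and format strings ---
//
//     assert_eq!(format_bytes(512), "512 B");    // < 1024 stays in whole bytes
//     assert_eq!(format_bytes(1536), "1.50 KB"); // 1536 / 1024 = 1.5
//     assert_eq!(format_bytes(3 * 1024 * 1024), "3.00 MB");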
0, + 8.min(metadata.length), + alphabet, + ); + String::from_utf8(decoded).unwrap_or_else(|_| "???".to_string()) + } + StorageMode::Raw => String::from_utf8(seq[0..8.min(seq.len())].to_vec()) + .unwrap_or_else(|_| "???".to_string()), + } } - StorageMode::Raw => String::from_utf8(seq[0..8.min(seq.len())].to_vec()) - .unwrap_or_else(|_| "???".to_string()), }; writeln!( @@ -788,9 +1849,9 @@ impl Display for GlobalRefgetStore { " - {}. {:02x?}, MD5: {:02x?}, Length: {}, Alphabet: {:?}, Start: {}", i + 1, std::str::from_utf8(sha512_digest).unwrap(), - &sequence_record.metadata.md5, - &sequence_record.metadata.length, - &sequence_record.metadata.alphabet, + &metadata.md5, + &metadata.length, + &metadata.alphabet, first_8_chars )?; } @@ -801,15 +1862,20 @@ impl Display for GlobalRefgetStore { let seqcol_digest_str = String::from_utf8_lossy(digest); writeln!( f, - " {}. Collection Digest: {:02x?}", + " {}. Collection Digest: {:02x?} ({} sequences)", i + 1, - seqcol_digest_str + seqcol_digest_str, + name_map.len() )?; - for (name, sha512_digest) in name_map { + // Only show first 5 sequences in each collection + for (j, (name, sha512_digest)) in name_map.iter().enumerate().take(5) { // Convert the sha512_digest to a hex string let sha512_str = String::from_utf8_lossy(sha512_digest); writeln!(f, " - Name: {}, SHA512: {:02x?}", name, sha512_str)?; } + if name_map.len() > 5 { + writeln!(f, " - ... and {} more", name_map.len() - 5)?; + } } Ok(()) @@ -844,18 +1910,18 @@ mod tests { .expect("Failed to create SeqColDigest from FASTA file"); // Write FARG to temporary directory - seqcol.to_farg().expect("Failed to write farg file"); + seqcol.write_farg().expect("Failed to write farg file"); // Load and verify let loaded_seqcol = read_fasta_refget_file(&temp_farg).expect("Failed to read refget file"); // Test round-trip integrity for (original, loaded) in seqcol.sequences.iter().zip(loaded_seqcol.sequences.iter()) { - assert_eq!(original.metadata.name, loaded.metadata.name); - assert_eq!(original.metadata.length, loaded.metadata.length); - assert_eq!(original.metadata.sha512t24u, loaded.metadata.sha512t24u); - assert_eq!(original.metadata.md5, loaded.metadata.md5); - assert_eq!(original.metadata.alphabet, loaded.metadata.alphabet); + assert_eq!(original.metadata().name, loaded.metadata().name); + assert_eq!(original.metadata().length, loaded.metadata().length); + assert_eq!(original.metadata().sha512t24u, loaded.metadata().sha512t24u); + assert_eq!(original.metadata().md5, loaded.metadata().md5); + assert_eq!(original.metadata().alphabet, loaded.metadata().alphabet); } } @@ -864,6 +1930,107 @@ mod tests { (sha512t24u(sequence), md5(sequence)) } + #[test] + fn test_mode_basics() { + // Test default mode and convenience methods (no sequences needed) + let mut store = GlobalRefgetStore::in_memory(); + + // Default is Encoded + assert_eq!(store.mode, StorageMode::Encoded); + + // Convenience methods + store.disable_encoding(); + assert_eq!(store.mode, StorageMode::Raw); + store.enable_encoding(); + assert_eq!(store.mode, StorageMode::Encoded); + + // set_mode() also works + store.set_mode(StorageMode::Raw); + assert_eq!(store.mode, StorageMode::Raw); + store.set_mode(StorageMode::Encoded); + assert_eq!(store.mode, StorageMode::Encoded); + } + + #[test] + fn test_mode_switching_raw_to_encoded() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let temp_path = temp_dir.path(); + let fasta_content = ">chr1\nATGCATGCATGC\n>chr2\nGGGGAAAA\n"; + let temp_fasta_path = 
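// --- Editorial note (not part of this diff): the size assertions in these
// mode-switching tests follow from 2-bit packing of the DNA alphabet ---
//
//     encoded_bytes = ceil(n_bases * 2 / 8)
//     "ATGCATGCATGC" has 12 bases -> ceil(24 / 8) = 3 bytes (asserted below)
//     Raw mode stores one ASCII byte per base -> 12 bytes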
temp_path.join("test.fa"); + fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file"); + + // Create store in Raw mode and load sequences + let mut store = GlobalRefgetStore::in_memory(); + store.disable_encoding(); + assert_eq!(store.mode, StorageMode::Raw); + + store.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap(); + + let (chr1_sha, _) = calculate_test_digests(b"ATGCATGCATGC"); + let chr1_key = chr1_sha.as_bytes().to_key(); + + // Verify raw bytes in Raw mode - plain ASCII + if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key) { + assert_eq!(sequence, b"ATGCATGCATGC"); + assert_eq!(sequence.len(), 12); + } + + let seq1 = store.get_sequence_by_id(&chr1_sha).unwrap().decode().unwrap(); + + // Switch to Encoded + store.set_mode(StorageMode::Encoded); + + // Verify raw bytes are now bit-packed + if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key) { + assert_eq!(sequence.len(), 3); // 12 bases * 2 bits = 3 bytes + assert!(sequence.len() < 12); + } + + // Verify decoded sequence is identical + let seq2 = store.get_sequence_by_id(&chr1_sha).unwrap().decode().unwrap(); + assert_eq!(seq1, seq2); + } + + #[test] + fn test_mode_switching_encoded_to_raw() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let temp_path = temp_dir.path(); + let fasta_content = ">chr1\nATGCATGCATGC\n>chr2\nGGGGAAAA\n"; + let temp_fasta_path = temp_path.join("test.fa"); + fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file"); + + // Create store in default Encoded mode and load sequences + let mut store = GlobalRefgetStore::in_memory(); + assert_eq!(store.mode, StorageMode::Encoded); + + store.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap(); + + let (chr1_sha, _) = calculate_test_digests(b"ATGCATGCATGC"); + let chr1_key = chr1_sha.as_bytes().to_key(); + + // Verify raw bytes in Encoded mode - bit-packed + if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key) { + assert_eq!(sequence.len(), 3); + assert!(sequence.len() < 12); + } + + let seq1 = store.get_sequence_by_id(&chr1_sha).unwrap().decode().unwrap(); + + // Switch to Raw + store.disable_encoding(); + + // Verify raw bytes are now plain ASCII + if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key) { + assert_eq!(sequence, b"ATGCATGCATGC"); + assert_eq!(sequence.len(), 12); + } + + // Verify decoded sequence is identical + let seq2 = store.get_sequence_by_id(&chr1_sha).unwrap().decode().unwrap(); + assert_eq!(seq1, seq2); + assert!(!seq2.is_empty()); + } + #[test] fn test_refget_store_retrieve_seq_and_vec() { // Create temporary directory for all test files @@ -882,8 +2049,8 @@ GGGGAAAA fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file"); // --- 2. 
Initialize GlobalRefgetStore and import FASTA --- - let mut store = GlobalRefgetStore::new(StorageMode::Encoded); - store.import_fasta(&temp_fasta_path).unwrap(); + let mut store = GlobalRefgetStore::in_memory(); + store.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap(); let sequence_keys: Vec<[u8; 32]> = store.sequence_store.keys().cloned().collect(); @@ -912,12 +2079,12 @@ chr1\t-5\t100 let temp_output_fa_path = temp_path.join("output.fa"); store - .get_seqs_bed_file( + .export_fasta_from_regions( collection_digest_ref, temp_bed_path.to_str().unwrap(), temp_output_fa_path.to_str().unwrap(), ) - .expect("get_seqs_bed_file failed"); + .expect("export_fasta_from_regions failed"); // Read the output FASTA file and verify its content let output_fa_content = @@ -933,12 +2100,14 @@ chr1\t-5\t100 expected_fa_content.trim(), "Output FASTA file content mismatch" ); - println!("✓ get_seqs_bed_file test passed."); + println!("✓ export_fasta_from_regions test passed."); - // --- Test get_seqs_bed_file_to_vec (returns Vec) --- - let vec_result = store - .get_seqs_bed_file_to_vec(collection_digest_ref, temp_bed_path.to_str().unwrap()) - .expect("get_seqs_bed_file_to_vec failed"); + // --- Test substrings_from_regions iterator (returns iterator of RetrievedSequence) --- + let vec_result: Vec<_> = store + .substrings_from_regions(collection_digest_ref, temp_bed_path.to_str().unwrap()) + .expect("substrings_from_regions failed") + .filter_map(Result::ok) // Skip errors + .collect(); // Define the expected vector of RetrievedSequence structs let expected_vec = vec![ @@ -967,7 +2136,7 @@ chr1\t-5\t100 vec_result, expected_vec, "Retrieved sequence vector mismatch" ); - println!("✓ get_seqs_bed_file_to_vec test passed."); + println!("✓ substrings_from_regions test passed."); } #[test] @@ -986,7 +2155,6 @@ chr1\t-5\t100 lengths_digest: "test".to_string(), }, file_path: None, - has_data: false, }; // Create a sequence record @@ -996,17 +2164,18 @@ chr1\t-5\t100 sha512t24u: sha512t24u(sequence), md5: md5(sequence), alphabet: AlphabetType::Dna2bit, + fai: None, }; - let record = SequenceRecord { + let record = SequenceRecord::Full { metadata: seq_metadata.clone(), - data: Some(sequence.to_vec()), + sequence: sequence.to_vec(), }; collection.sequences.push(record); // Add the sequence to the store - let mut store = GlobalRefgetStore::new(StorageMode::Encoded); + let mut store = GlobalRefgetStore::in_memory(); store.add_sequence_collection(collection.clone()).unwrap(); // Verify the store has the sequence @@ -1017,44 +2186,44 @@ chr1\t-5\t100 store.get_sequence_by_collection_and_name(&collection.digest, name); assert!(retrieved_by_name_str.is_some()); let retrieved_record = retrieved_by_name_str.unwrap(); - assert_eq!(retrieved_record.metadata.name, name); - assert_eq!(retrieved_record.data.as_ref().unwrap(), sequence); + assert_eq!(retrieved_record.metadata().name, name); + assert_eq!(retrieved_record.sequence().unwrap(), sequence); // Test sequence lookup by collection+name (using [u8; 32] digest) let retrieved_by_name_key = store.get_sequence_by_collection_and_name(collection.digest.to_key(), name); assert!(retrieved_by_name_key.is_some()); let retrieved_record = retrieved_by_name_key.unwrap(); - assert_eq!(retrieved_record.metadata.name, name); - assert_eq!(retrieved_record.data.as_ref().unwrap(), sequence); + assert_eq!(retrieved_record.metadata().name, name); + assert_eq!(retrieved_record.sequence().unwrap(), sequence); // Test sequence lookup by SHA512 digest (using string) let 
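// --- Editorial note (not part of this diff): `to_key()` in these lookups
// converts a digest string into the [u8; 32] key type used by the store's
// maps, so the string and key forms are interchangeable; that equivalence is
// what the paired assertions below verify. A hedged sketch:
//
//     let key: [u8; 32] = seq_metadata.sha512t24u.to_key();
//     assert!(store.get_sequence_by_id(key).is_some());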
retrieved_by_sha512_str = store.get_sequence_by_id(&seq_metadata.sha512t24u); assert!(retrieved_by_sha512_str.is_some()); let retrieved_record = retrieved_by_sha512_str.unwrap(); - assert_eq!(retrieved_record.metadata.name, name); - assert_eq!(retrieved_record.data.as_ref().unwrap(), sequence); + assert_eq!(retrieved_record.metadata().name, name); + assert_eq!(retrieved_record.sequence().unwrap(), sequence); // Test sequence lookup by SHA512 digest (using [u8; 32]) let retrieved_by_sha512_key = store.get_sequence_by_id(seq_metadata.sha512t24u.to_key()); assert!(retrieved_by_sha512_key.is_some()); let retrieved_record = retrieved_by_sha512_key.unwrap(); - assert_eq!(retrieved_record.metadata.name, name); - assert_eq!(retrieved_record.data.as_ref().unwrap(), sequence); + assert_eq!(retrieved_record.metadata().name, name); + assert_eq!(retrieved_record.sequence().unwrap(), sequence); // Test sequence lookup by MD5 digest (using string) let retrieved_by_md5_str = store.get_sequence_by_md5(&seq_metadata.md5); assert!(retrieved_by_md5_str.is_some()); let retrieved_record = retrieved_by_md5_str.unwrap(); - assert_eq!(retrieved_record.metadata.name, name); - assert_eq!(retrieved_record.data.as_ref().unwrap(), sequence); + assert_eq!(retrieved_record.metadata().name, name); + assert_eq!(retrieved_record.sequence().unwrap(), sequence); // Test sequence lookup by MD5 digest (using [u8; 32]) let retrieved_by_md5_key = store.get_sequence_by_md5(seq_metadata.md5.to_key()); assert!(retrieved_by_md5_key.is_some()); let retrieved_record = retrieved_by_md5_key.unwrap(); - assert_eq!(retrieved_record.metadata.name, name); - assert_eq!(retrieved_record.data.as_ref().unwrap(), sequence); + assert_eq!(retrieved_record.metadata().name, name); + assert_eq!(retrieved_record.sequence().unwrap(), sequence); } #[test] @@ -1068,10 +2237,10 @@ chr1\t-5\t100 std::fs::copy(test_fa, &temp_fa).expect("Failed to copy test FASTA file"); - let mut store = GlobalRefgetStore::new(StorageMode::Encoded); + let mut store = GlobalRefgetStore::in_memory(); // Import the FASTA file - store.import_fasta(temp_fa).unwrap(); + store.add_sequence_collection_from_fasta(temp_fa).unwrap(); // Check that the store has sequences assert!(!store.sequence_store.is_empty()); @@ -1080,7 +2249,7 @@ chr1\t-5\t100 let seq_template = "sequences/%s2/%s.seq"; // let col_template = "collections/%s.farg"; store - .write_store_to_directory(temp_path.to_str().unwrap(), seq_template) + .write_store_to_dir(temp_path.to_str().unwrap(), Some(seq_template)) .unwrap(); } @@ -1094,11 +2263,11 @@ chr1\t-5\t100 .expect("Failed to copy base.fa.gz to tempdir"); // Create a new sequence store - let mut store = GlobalRefgetStore::new(StorageMode::Encoded); + let mut store = GlobalRefgetStore::in_memory(); // Import a FASTA file into the store - // store.import_fasta("../tests/data/subset.fa.gz").unwrap(); - store.import_fasta(&temp_fasta).unwrap(); + // store.add_sequence_collection_from_fasta("../tests/data/subset.fa.gz").unwrap(); + store.add_sequence_collection_from_fasta(&temp_fasta).unwrap(); // Get the sequence keys for verification (assuming we know the test file contains 3 sequences) let sequence_keys: Vec<[u8; 32]> = store.sequence_store.keys().cloned().collect(); @@ -1118,7 +2287,7 @@ chr1\t-5\t100 // Write the store to the temporary directory let seq_template = "sequences/%s2/%s.seq"; store - .write_store_to_directory(temp_path, seq_template) + .write_store_to_dir(temp_path, Some(seq_template)) .unwrap(); // Verify that the files were created @@ -1129,7 
+2298,7 @@ chr1\t-5\t100 assert!(temp_path.join("collections").exists()); // Load the store from disk - let loaded_store = GlobalRefgetStore::load_from_directory(temp_path).unwrap(); + let mut loaded_store = GlobalRefgetStore::load_local(temp_path).unwrap(); // Verify that the loaded store has the same sequences assert_eq!(loaded_store.sequence_store.len(), 3); @@ -1143,25 +2312,25 @@ chr1\t-5\t100 let loaded_seq2 = loaded_store.sequence_store.get(&sha512_key2).unwrap(); // Check metadata equality - assert_eq!(original_seq1.metadata.name, loaded_seq1.metadata.name); - assert_eq!(original_seq1.metadata.length, loaded_seq1.metadata.length); + assert_eq!(original_seq1.metadata().name, loaded_seq1.metadata().name); + assert_eq!(original_seq1.metadata().length, loaded_seq1.metadata().length); assert_eq!( - original_seq1.metadata.sha512t24u, - loaded_seq1.metadata.sha512t24u + original_seq1.metadata().sha512t24u, + loaded_seq1.metadata().sha512t24u ); - assert_eq!(original_seq1.metadata.md5, loaded_seq1.metadata.md5); + assert_eq!(original_seq1.metadata().md5, loaded_seq1.metadata().md5); - assert_eq!(original_seq2.metadata.name, loaded_seq2.metadata.name); - assert_eq!(original_seq2.metadata.length, loaded_seq2.metadata.length); + assert_eq!(original_seq2.metadata().name, loaded_seq2.metadata().name); + assert_eq!(original_seq2.metadata().length, loaded_seq2.metadata().length); assert_eq!( - original_seq2.metadata.sha512t24u, - loaded_seq2.metadata.sha512t24u + original_seq2.metadata().sha512t24u, + loaded_seq2.metadata().sha512t24u ); - assert_eq!(original_seq2.metadata.md5, loaded_seq2.metadata.md5); + assert_eq!(original_seq2.metadata().md5, loaded_seq2.metadata().md5); - // Check data equality - assert_eq!(original_seq1.data, loaded_seq1.data); - assert_eq!(original_seq2.data, loaded_seq2.data); + // Check data is not loaded initially (lazy loading) + assert_eq!(loaded_seq1.has_data(), false, "Data should not be loaded initially with lazy loading"); + assert_eq!(loaded_seq2.has_data(), false, "Data should not be loaded initially with lazy loading"); // Verify MD5 lookup is preserved assert_eq!(loaded_store.md5_lookup.len(), 3); @@ -1172,15 +2341,15 @@ chr1\t-5\t100 // Test sequence retrieval functionality for (digest, original_record) in &store.sequence_store { let loaded_record = loaded_store.get_sequence_by_id(*digest).unwrap(); - assert_eq!(original_record.metadata.name, loaded_record.metadata.name); + assert_eq!(original_record.metadata().name, loaded_record.metadata().name); assert_eq!( - original_record.metadata.length, - loaded_record.metadata.length + original_record.metadata().length, + loaded_record.metadata().length ); // Test substring retrieval works on loaded store - if original_record.metadata.length > 0 { - let substring_len = std::cmp::min(5, original_record.metadata.length); + if original_record.metadata().length > 0 { + let substring_len = std::cmp::min(5, original_record.metadata().length); let substring = loaded_store.get_substring(digest, 0, substring_len); assert!( substring.is_some(), @@ -1192,4 +2361,430 @@ chr1\t-5\t100 println!("✓ Disk persistence test passed - all data preserved correctly"); } + + #[test] + fn test_export_fasta_all_sequences() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let temp_path = temp_dir.path(); + + // Create test FASTA + let fasta_content = "\ +>chr1 +ATGCATGCATGC +>chr2 +GGGGAAAA +>chr3 +TTTTCCCC +"; + let temp_fasta_path = temp_path.join("test.fa"); + fs::write(&temp_fasta_path, 
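// --- Editorial sketch (not part of this diff): the lazy-loading contract
// asserted above, using only calls that appear in this diff. load_local()
// restores metadata-only stubs; the first data access fetches bytes on demand:
//
//     let mut store = GlobalRefgetStore::load_local(temp_path).unwrap();
//     assert!(!store.get_sequence_by_id(digest).unwrap().has_data()); // stub
//     let sub = store.get_substring(digest, 0, 5); // triggers ensure_sequence_loaded
//     assert!(sub.is_some());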
fasta_content).expect("Failed to write test FASTA file"); + + // Import into store + let mut store = GlobalRefgetStore::in_memory(); + store.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap(); + + // Get the collection digest + let collections: Vec<_> = store.collections.keys().cloned().collect(); + assert_eq!(collections.len(), 1, "Should have exactly one collection"); + let collection_digest = collections[0]; + + // Export all sequences + let output_path = temp_path.join("exported_all.fa"); + store + .export_fasta(&collection_digest, &output_path, None, Some(80)) + .expect("Failed to export FASTA"); + + // Read and verify the exported file + let exported_content = fs::read_to_string(&output_path).expect("Failed to read exported file"); + + // Check that all sequences are present + assert!(exported_content.contains(">chr1"), "Should contain chr1"); + assert!(exported_content.contains(">chr2"), "Should contain chr2"); + assert!(exported_content.contains(">chr3"), "Should contain chr3"); + assert!(exported_content.contains("ATGCATGCATGC"), "Should contain chr1 sequence"); + assert!(exported_content.contains("GGGGAAAA"), "Should contain chr2 sequence"); + assert!(exported_content.contains("TTTTCCCC"), "Should contain chr3 sequence"); + + println!("✓ Export all sequences test passed"); + } + + #[test] + fn test_export_fasta_subset_sequences() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let temp_path = temp_dir.path(); + + // Create test FASTA + let fasta_content = "\ +>chr1 +ATGCATGCATGC +>chr2 +GGGGAAAA +>chr3 +TTTTCCCC +"; + let temp_fasta_path = temp_path.join("test.fa"); + fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file"); + + // Import into store + let mut store = GlobalRefgetStore::in_memory(); + store.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap(); + + // Get the collection digest + let collections: Vec<_> = store.collections.keys().cloned().collect(); + let collection_digest = collections[0]; + + // Export only chr1 and chr3 + let output_path = temp_path.join("exported_subset.fa"); + let subset_names = vec!["chr1", "chr3"]; + store + .export_fasta(&collection_digest, &output_path, Some(subset_names), Some(80)) + .expect("Failed to export subset FASTA"); + + // Read and verify the exported file + let exported_content = fs::read_to_string(&output_path).expect("Failed to read exported file"); + + // Check that only selected sequences are present + assert!(exported_content.contains(">chr1"), "Should contain chr1"); + assert!(!exported_content.contains(">chr2"), "Should NOT contain chr2"); + assert!(exported_content.contains(">chr3"), "Should contain chr3"); + assert!(exported_content.contains("ATGCATGCATGC"), "Should contain chr1 sequence"); + assert!(!exported_content.contains("GGGGAAAA"), "Should NOT contain chr2 sequence"); + assert!(exported_content.contains("TTTTCCCC"), "Should contain chr3 sequence"); + + println!("✓ Export subset sequences test passed"); + } + + #[test] + fn test_export_fasta_roundtrip() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let temp_path = temp_dir.path(); + + // Create test FASTA with longer sequences + let fasta_content = "\ +>seq1 +ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC +ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC +>seq2 +GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGGAAAACCCC +"; + let temp_fasta_path = temp_path.join("original.fa"); + fs::write(&temp_fasta_path, 
fasta_content).expect("Failed to write test FASTA file");
+
+        // Import into store
+        let mut store1 = GlobalRefgetStore::in_memory();
+        store1.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap();
+
+        // Get original digests
+        let original_digests: Vec<String> = store1
+            .sequence_store
+            .values()
+            .map(|r| r.metadata().sha512t24u.clone())
+            .collect();
+
+        // Export to new FASTA
+        let collections: Vec<_> = store1.collections.keys().cloned().collect();
+        let collection_digest = collections[0];
+        let exported_path = temp_path.join("exported.fa");
+        store1
+            .export_fasta(&collection_digest, &exported_path, None, Some(60))
+            .expect("Failed to export FASTA");
+
+        // Re-import the exported FASTA
+        let mut store2 = GlobalRefgetStore::in_memory();
+        store2.add_sequence_collection_from_fasta(&exported_path).unwrap();
+
+        // Verify digests match (same sequences)
+        let new_digests: Vec<String> = store2
+            .sequence_store
+            .values()
+            .map(|r| r.metadata().sha512t24u.clone())
+            .collect();
+
+        assert_eq!(original_digests.len(), new_digests.len(), "Should have same number of sequences");
+        for digest in original_digests {
+            assert!(
+                new_digests.contains(&digest),
+                "Digest {} should be present after roundtrip",
+                digest
+            );
+        }
+
+        println!("✓ Export/import roundtrip test passed");
+    }
+
+    #[test]
+    fn test_export_fasta_by_digests() {
+        let temp_dir = tempdir().expect("Failed to create temporary directory");
+        let temp_path = temp_dir.path();
+
+        // Create test FASTA
+        let fasta_content = "\
+>chr1
+ATGCATGCATGC
+>chr2
+GGGGAAAA
+";
+        let temp_fasta_path = temp_path.join("test.fa");
+        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
+
+        // Import into store
+        let mut store = GlobalRefgetStore::in_memory();
+        store.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap();
+
+        // Get digests
+        let digests: Vec<String> = store
+            .sequence_store
+            .values()
+            .map(|r| r.metadata().sha512t24u.clone())
+            .collect();
+
+        // Export by digests
+        let output_path = temp_path.join("exported_by_digests.fa");
+        let digest_refs: Vec<&str> = digests.iter().map(|s| s.as_str()).collect();
+        store
+            .export_fasta_by_digests(digest_refs, &output_path, Some(80))
+            .expect("Failed to export FASTA by digests");
+
+        // Read and verify the exported file
+        let exported_content = fs::read_to_string(&output_path).expect("Failed to read exported file");
+
+        // Check that all sequences are present
+        assert!(exported_content.contains(">chr1"), "Should contain chr1");
+        assert!(exported_content.contains(">chr2"), "Should contain chr2");
+        assert!(exported_content.contains("ATGCATGCATGC"), "Should contain chr1 sequence");
+        assert!(exported_content.contains("GGGGAAAA"), "Should contain chr2 sequence");
+
+        println!("✓ Export by digests test passed");
+    }
+
+    #[test]
+    fn test_export_fasta_error_handling() {
+        let temp_dir = tempdir().expect("Failed to create temporary directory");
+        let temp_path = temp_dir.path();
+
+        // Create test FASTA
+        let fasta_content = "\
+>chr1
+ATGCATGCATGC
+";
+        let temp_fasta_path = temp_path.join("test.fa");
+        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
+
+        // Import into store
+        let mut store = GlobalRefgetStore::in_memory();
+        store.add_sequence_collection_from_fasta(&temp_fasta_path).unwrap();
+
+        // Test with a non-existent collection
+        let output_path = temp_path.join("should_fail.fa");
+        let fake_collection = b"fake_collection_digest_12345678";
+        let result = store.export_fasta(fake_collection, &output_path, None, Some(80));
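// --- Editorial note (not part of this diff): the fabricated digest above
// does not correspond to any collection loaded into the store, so
// export_fasta() should fail before writing any output; the assertion
// immediately below checks exactly that.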
+ assert!(result.is_err(), "Should fail with non-existent collection"); + + // Test with non-existent sequence name + let collections: Vec<_> = store.collections.keys().cloned().collect(); + let collection_digest = collections[0]; + let result = store.export_fasta( + &collection_digest, + &output_path, + Some(vec!["nonexistent_chr"]), + Some(80), + ); + assert!(result.is_err(), "Should fail with non-existent sequence name"); + + println!("✓ Error handling test passed"); + } + + #[test] + fn test_sequence_names_with_spaces() { + // Ensure FASTA headers with spaces work correctly + // This is common in real-world files like pangenome assemblies + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let temp_path = temp_dir.path(); + + // Create test FASTA with sequence names containing spaces + // This mimics the structure from HPRC pangenome files + let fasta_content = "\ +>JAHKSE010000016.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000016.1:1:100:1 +ATGCATGCATGCATGCATGCATGCATGCATGCATGC +ATGCATGCATGCATGCATGCATGCATGCATGCATGC +>JAHKSE010000012.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000012.1:1:100:1 +GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGG +GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGG +"; + let temp_fasta_path = temp_path.join("spaces_in_names.fa"); + fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file"); + + // Import FASTA with sequence names containing spaces + let mut store = GlobalRefgetStore::in_memory(); + store.add_sequence_collection_from_fasta(&temp_fasta_path) + .expect("Should handle sequence names with spaces"); + + // Verify the sequences were loaded with full names including spaces + assert_eq!(store.sequence_store.len(), 2); + + let full_name1 = "JAHKSE010000016.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000016.1:1:100:1"; + let full_name2 = "JAHKSE010000012.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000012.1:1:100:1"; + + // Get the collection + let collections: Vec<_> = store.collections.keys().cloned().collect(); + assert_eq!(collections.len(), 1); + let collection_digest = collections[0]; + + // Verify we can retrieve sequences by their full names (with spaces) + let seq1 = store.get_sequence_by_collection_and_name(&collection_digest, full_name1); + assert!(seq1.is_some(), "Should retrieve sequence by full name with spaces"); + + let seq2 = store.get_sequence_by_collection_and_name(&collection_digest, full_name2); + assert!(seq2.is_some(), "Should retrieve sequence by full name with spaces"); + + println!("✓ Sequence names with spaces test passed"); + } + + #[test] + fn test_farg_filename_with_dots() { + // Test that FARG filenames preserve dots in the base name + // Real HPRC files like "HG002.alt.pat.f1_v2.unmasked.fa.gz" + // should create "HG002.alt.pat.f1_v2.unmasked.farg", NOT "HG002.farg" + + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let temp_path = temp_dir.path(); + + // Copy test file to temp (so .farg file gets created there, not in test data) + let test_file = "../tests/data/fasta/HG002.alt.pat.f1_v2.unmasked.fa"; + let temp_fasta = temp_path.join("HG002.alt.pat.f1_v2.unmasked.fa"); + fs::copy(test_file, &temp_fasta).expect("Failed to copy test file"); + + // Load the FASTA - this creates a .farg file + let mut store = GlobalRefgetStore::in_memory(); + store.add_sequence_collection_from_fasta(&temp_fasta) + .expect("Should load FASTA"); + + // Check which .farg file was created + let correct_farg = 
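// --- Editorial note (not part of this diff): the fix under test strips only
// known FASTA extensions (see the gtars-refget/src/utils.rs hunk later in
// this diff), so dots in the base name survive:
//
//     "HG002.alt.pat.f1_v2.unmasked.fa" -> "HG002.alt.pat.f1_v2.unmasked.farg"
//     old behavior cut at the first dot -> "HG002.farg"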
temp_path.join("HG002.alt.pat.f1_v2.unmasked.farg"); + let wrong_farg = temp_path.join("HG002.farg"); + + let files: Vec<_> = std::fs::read_dir(temp_path).unwrap() + .map(|e| e.unwrap().file_name().to_string_lossy().to_string()) + .collect(); + + assert!( + correct_farg.exists(), + "Expected 'HG002.alt.pat.f1_v2.unmasked.farg' but found: {:?}", + files + ); + + assert!( + !wrong_farg.exists(), + "Should NOT create 'HG002.farg' (strips too many dots)" + ); + + println!("✓ FARG filename with dots test passed"); + } + + #[test] + fn test_on_disk_collection_written_incrementally() { + // Test that collection FARG files are written to disk immediately + // when using on_disk() store, not just when write_store_to_dir() is called + let temp_dir = tempdir().unwrap(); + let temp_path = temp_dir.path(); + let temp_fasta = temp_path.join("base.fa.gz"); + std::fs::copy("../tests/data/fasta/base.fa.gz", &temp_fasta) + .expect("Failed to copy base.fa.gz to tempdir"); + + let cache_path = temp_path.join("cache"); + let mut store = GlobalRefgetStore::on_disk(&cache_path).unwrap(); + + // Load FASTA file into the store + store.add_sequence_collection_from_fasta(&temp_fasta).unwrap(); + + // BEFORE calling write_store_to_dir, verify collection FARG files exist + let collections_dir = cache_path.join("collections"); + assert!(collections_dir.exists(), "Collections directory should exist"); + + let farg_files: Vec<_> = std::fs::read_dir(&collections_dir) + .unwrap() + .map(|e| e.unwrap().file_name().to_string_lossy().to_string()) + .collect(); + + assert!(!farg_files.is_empty(), "Collection FARG files should be written incrementally, found: {:?}", farg_files); + assert!(farg_files.iter().any(|f| f.ends_with(".farg")), "Should have .farg files"); + + println!("✓ On-disk collection incremental write test passed"); + } + + #[test] + fn test_disk_size_calculation() { + let mut store = GlobalRefgetStore::in_memory(); + store.add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz").unwrap(); + + let disk_size = store.total_disk_size(); + assert!(disk_size > 0, "Disk size should be greater than 0"); + + // Verify against manual calculation + let manual: usize = store.sequence_metadata() + .map(|m| (m.length * m.alphabet.bits_per_symbol()).div_ceil(8)) + .sum(); + assert_eq!(disk_size, manual); + } + + #[test] + fn test_incremental_index_writing() { + let temp_dir = tempdir().unwrap(); + let cache_path = temp_dir.path().join("store"); + let mut store = GlobalRefgetStore::on_disk(&cache_path).unwrap(); + + store.add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz").unwrap(); + + // Index files should exist immediately + assert!(cache_path.join("index.json").exists(), "index.json should exist"); + assert!(cache_path.join("sequences.farg").exists(), "sequences.farg should exist"); + + // Store should be loadable (mode ignored for existing store) + let _loaded = GlobalRefgetStore::on_disk(&cache_path).unwrap(); + } + + #[test] + fn test_write_method() { + let temp_dir = tempdir().unwrap(); + let cache_path = temp_dir.path().join("store"); + let mut store = GlobalRefgetStore::on_disk(&cache_path).unwrap(); + + store.add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz").unwrap(); + store.write().unwrap(); // Should succeed + + assert!(cache_path.join("index.json").exists()); + } + + #[test] + fn test_on_disk_smart_constructor() { + let temp_dir = tempdir().unwrap(); + let cache_path = temp_dir.path().join("store"); + + // Create new store (defaults to Encoded mode) + let mut store1 
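// --- Editorial sketch (not part of this diff): the on_disk() contract
// exercised below, as implied by these tests -- create-if-missing,
// load-if-present, with the storage mode persisted via index.json:
//
//     let store = GlobalRefgetStore::on_disk("/data/store")?; // new: Encoded
//     // ... add sequences; index.json records "mode": "Encoded"
//     let again = GlobalRefgetStore::on_disk("/data/store")?; // loads, keeps mode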
= GlobalRefgetStore::on_disk(&cache_path).unwrap();
+        assert_eq!(store1.mode, StorageMode::Encoded);
+        store1.add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz").unwrap();
+
+        // Load the existing store - it should preserve Encoded mode
+        let store2 = GlobalRefgetStore::on_disk(&cache_path).unwrap();
+        assert_eq!(store2.sequence_store.len(), store1.sequence_store.len());
+        assert_eq!(store2.mode, StorageMode::Encoded, "Loaded store should preserve Encoded mode");
+
+        // Test with Raw mode
+        let cache_path_raw = temp_dir.path().join("store_raw");
+        let mut store3 = GlobalRefgetStore::on_disk(&cache_path_raw).unwrap();
+        store3.disable_encoding(); // Switch to Raw
+        assert_eq!(store3.mode, StorageMode::Raw);
+        store3.add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz").unwrap();
+
+        // Load and verify Raw mode is persisted
+        let store4 = GlobalRefgetStore::on_disk(&cache_path_raw).unwrap();
+        assert_eq!(store4.mode, StorageMode::Raw, "Loaded store should preserve Raw mode");
+
+        // Verify index.json contains the mode
+        let index_path = cache_path_raw.join("index.json");
+        let json = fs::read_to_string(&index_path).unwrap();
+        assert!(json.contains("\"mode\":\"Raw\"") || json.contains("\"mode\": \"Raw\""),
+            "index.json should contain mode: Raw");
+    }
 }
diff --git a/gtars-refget/src/utils.rs b/gtars-refget/src/utils.rs
index c2d552ae..7f1b4a08 100644
--- a/gtars-refget/src/utils.rs
+++ b/gtars-refget/src/utils.rs
@@ -11,12 +11,14 @@ impl<T: AsRef<Path>> PathExtension for T {
         let path = self.as_ref();
         if let Some(file_name) = path.file_name() {
             let file_name_str = file_name.to_string_lossy();
-            // Find the first dot to get the base name without any extensions
-            let base_name = if let Some(dot_pos) = file_name_str.find('.') {
-                &file_name_str[..dot_pos]
-            } else {
-                &file_name_str
-            };
+
+            // Strip only known FASTA extensions, preserving dots in the base name.
+            // Check the longest extensions first (.fa.gz, .fasta.gz) before shorter ones.
+            let known_extensions = [".fa.gz", ".fasta.gz", ".fa", ".fasta"];
+
+            let base_name = known_extensions.iter()
+                .find_map(|ext| file_name_str.strip_suffix(ext))
+                .unwrap_or(&file_name_str);

             if let Some(parent) = path.parent() {
                 parent.join(format!("{}.{}", base_name, new_extension))
diff --git a/gtars-refget/tests/test_decode.rs b/gtars-refget/tests/test_decode.rs
new file mode 100644
index 00000000..18d15c1f
--- /dev/null
+++ b/gtars-refget/tests/test_decode.rs
@@ -0,0 +1,105 @@
+//! Integration tests for the decode() workflow through GlobalRefgetStore.
+//!
+//! These tests validate the end-to-end workflow of importing FASTA files,
+//! storing sequences, retrieving them, and decoding them through the public API.
+//!
+//! Note: Unit tests for the decode() method itself (including edge cases,
+//!
different alphabets, encoding schemes) are in src/collection.rs + +use gtars_refget::store::{GlobalRefgetStore, StorageMode}; +use std::io::Write; +use tempfile::NamedTempFile; + +/// Helper function to create a temporary FASTA file with test sequences +fn create_test_fasta() -> NamedTempFile { + let mut file = NamedTempFile::new().expect("Failed to create temp file"); + writeln!(file, ">seq1").expect("Failed to write"); + writeln!(file, "ACGTACGTACGT").expect("Failed to write"); + writeln!(file, ">seq2").expect("Failed to write"); + writeln!(file, "TTGGCCAA").expect("Failed to write"); + writeln!(file, ">seq3").expect("Failed to write"); + writeln!(file, "NNNNAAAA").expect("Failed to write"); + file +} + +#[test] +fn test_decode_workflow_encoded() { + // Test the complete workflow: import FASTA → store (encoded) → retrieve → decode + let fasta_file = create_test_fasta(); + let fasta_path = fasta_file.path(); + + // Create an encoded (bit-packed) store (default mode) + let mut store = GlobalRefgetStore::in_memory(); + store + .add_sequence_collection_from_fasta(fasta_path) + .expect("Failed to import FASTA"); + + // Get the collection digest + let collections: Vec<_> = store.collections().collect(); + assert!(!collections.is_empty(), "No collections found"); + let collection_digest = collections[0].digest.clone(); + + // Expected sequences + let expected = vec![ + ("seq1", "ACGTACGTACGT"), + ("seq2", "TTGGCCAA"), + ("seq3", "NNNNAAAA"), // Tests IUPAC 'N' handling + ]; + + // Verify all sequences can be retrieved and decoded correctly + for (name, expected_seq) in expected.iter() { + let record = store + .get_sequence_by_collection_and_name(&collection_digest, name) + .unwrap_or_else(|| panic!("Failed to retrieve {}", name)); + + assert!(record.has_data(), "Record should have data"); + + let decoded = record + .decode() + .unwrap_or_else(|| panic!("decode() returned None for {}", name)); + + assert_eq!( + &decoded, expected_seq, + "Decoded sequence for {} doesn't match", + name + ); + } +} + +#[test] +fn test_decode_workflow_raw() { + // Test the same workflow with raw (uncompressed) storage mode + let fasta_file = create_test_fasta(); + let fasta_path = fasta_file.path(); + + // Create a raw store + let mut store = GlobalRefgetStore::in_memory(); + store.disable_encoding(); // Switch to Raw mode + store + .add_sequence_collection_from_fasta(fasta_path) + .expect("Failed to import FASTA"); + + // Get the collection digest + let collections: Vec<_> = store.collections().collect(); + assert!(!collections.is_empty(), "No collections found"); + let collection_digest = collections[0].digest.clone(); + + // Verify sequences decode correctly with raw storage + let record = store + .get_sequence_by_collection_and_name(&collection_digest, "seq1") + .expect("Failed to retrieve seq1"); + + let decoded = record.decode().expect("decode() returned None"); + assert_eq!( + decoded, "ACGTACGTACGT", + "Decoded sequence from raw store doesn't match" + ); + + // Verify IUPAC sequences work with raw storage too + let record2 = store + .get_sequence_by_collection_and_name(&collection_digest, "seq3") + .expect("Failed to retrieve seq3"); + + let decoded2 = record2.decode().expect("decode() returned None for seq3"); + assert_eq!(decoded2, "NNNNAAAA", "IUPAC sequence doesn't match"); +} diff --git a/tests/README.md b/tests/README.md index 2e713ec2..d7718367 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1 +1,31 @@ -To run tests, run `cargo test` in the root directory of the project. 
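*Editorial note: the commands below are a hedged addition for the integration tests introduced in this diff; the `--test` target name comes from the new file `gtars-refget/tests/test_decode.rs`.*

```bash
# Run only the new decode integration tests
cargo test -p gtars-refget --test test_decode

# Run a single test function from that file
cargo test -p gtars-refget --test test_decode test_decode_workflow_encoded
```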
+# Testing gtars + +## Basic Commands + +```bash +# Run all tests +cargo test + +# Run tests for a specific package/crate +cargo test -p gtars-refget + +# Run tests matching a name pattern +cargo test refget +cargo test test_import_fasta + +# Combine filters +cargo test -p gtars-refget test_import +``` + +## Useful Options + +```bash +# Show output from tests +cargo test -- --nocapture + +# Run ignored tests +cargo test -- --ignored + +# List available tests without running +cargo test -- --list +``` diff --git a/tests/data/fasta/HG002.alt.pat.f1_v2.unmasked.fa b/tests/data/fasta/HG002.alt.pat.f1_v2.unmasked.fa new file mode 100644 index 00000000..dd08063d --- /dev/null +++ b/tests/data/fasta/HG002.alt.pat.f1_v2.unmasked.fa @@ -0,0 +1,6 @@ +>chrX +TTGGGGAA +>chr1 +GGAA +>chr2 +GCGC diff --git a/tests/data/fasta/base.fa.fai b/tests/data/fasta/base.fa.fai new file mode 100644 index 00000000..dc37dcde --- /dev/null +++ b/tests/data/fasta/base.fa.fai @@ -0,0 +1,3 @@ +chrX 8 6 8 9 +chr1 4 21 4 5 +chr2 4 32 4 5 diff --git a/tests/data/fasta/crlf_endings.fa b/tests/data/fasta/crlf_endings.fa new file mode 100644 index 00000000..6089944d --- /dev/null +++ b/tests/data/fasta/crlf_endings.fa @@ -0,0 +1,5 @@ +>chr1 +ACGTACGTACGTACGTACGTACGTACGTACGT +TGCATGCATGCATGCATGCATGCATGCATGCA +>chr2 +GGGGCCCCAAAATTTTGGGGCCCCAAAATTTT diff --git a/tests/data/fasta/crlf_endings.fa.fai b/tests/data/fasta/crlf_endings.fa.fai new file mode 100644 index 00000000..e22f331b --- /dev/null +++ b/tests/data/fasta/crlf_endings.fa.fai @@ -0,0 +1,2 @@ +chr1 64 7 32 34 +chr2 32 82 32 34 diff --git a/tests/data/fasta/unwrapped.fa b/tests/data/fasta/unwrapped.fa new file mode 100644 index 00000000..9279eec7 --- /dev/null +++ b/tests/data/fasta/unwrapped.fa @@ -0,0 +1,6 @@ +>chr1 +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCAGGCCGGCCGGCCGGCCGGCCGGCCGGCCGGCCGGCCGGCC +>chr2 +AAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT +>chr3 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/data/fasta/unwrapped.fa.fai b/tests/data/fasta/unwrapped.fa.fai new file mode 100644 index 00000000..87cb4a1c --- /dev/null +++ b/tests/data/fasta/unwrapped.fa.fai @@ -0,0 +1,3 @@ +chr1 120 6 120 121 +chr2 64 133 64 65 +chr3 40 204 40 41 diff --git a/tests/data/fasta/wrapped_20.fa b/tests/data/fasta/wrapped_20.fa new file mode 100644 index 00000000..96284947 --- /dev/null +++ b/tests/data/fasta/wrapped_20.fa @@ -0,0 +1,15 @@ +>chr1 +ACGTACGTACGTACGTACGT +ACGTACGTACGTACGTACGT +TGCATGCATGCATGCATGCA +TGCATGCATGCATGCATGCA +GGCCGGCCGGCCGGCCGGCC +GGCCGGCCGGCCGGCCGGCC +>chr2 +AAAACCCCGGGGTTTTAAAA +CCCCGGGGTTTTAAAACCCC +GGGGTTTTAAAACCCCGGGG +TTTT +>chr3 +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN diff --git a/tests/data/fasta/wrapped_20.fa.fai b/tests/data/fasta/wrapped_20.fa.fai new file mode 100644 index 00000000..70a80853 --- /dev/null +++ b/tests/data/fasta/wrapped_20.fa.fai @@ -0,0 +1,3 @@ +chr1 120 6 20 21 +chr2 64 138 20 21 +chr3 40 212 20 21 diff --git a/tests/data/fasta/wrapped_40.fa b/tests/data/fasta/wrapped_40.fa new file mode 100644 index 00000000..9ace5f4c --- /dev/null +++ b/tests/data/fasta/wrapped_40.fa @@ -0,0 +1,9 @@ +>chr1 +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT +TGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA +GGCCGGCCGGCCGGCCGGCCGGCCGGCCGGCCGGCCGGCC +>chr2 +AAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCC +GGGGTTTTAAAACCCCGGGGTTTT +>chr3 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/data/fasta/wrapped_40.fa.fai b/tests/data/fasta/wrapped_40.fa.fai new file mode 100644 index 
00000000..71f045db --- /dev/null +++ b/tests/data/fasta/wrapped_40.fa.fai @@ -0,0 +1,3 @@ +chr1 120 6 40 41 +chr2 64 135 40 41 +chr3 40 207 40 41
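*Editorial note: these `.fai` fixtures follow the standard `samtools faidx` layout: NAME, LENGTH (bases), OFFSET (byte offset of the first base), LINEBASES (bases per line), and LINEWIDTH (bytes per line including the line terminator; 34 in `crlf_endings.fa.fai` because of `\r\n`). The offsets in `wrapped_40.fa.fai` can be checked by hand:*

```
chr1: ">chr1\n" header = 6 bytes                  -> offset   6
chr1 body: 3 lines x 41 bytes                     = 123 bytes
chr2: 6 + 123 + 6 (">chr2\n")                     -> offset 135
chr2 body: 41 + 25 bytes (40- and 24-base lines)  =  66 bytes
chr3: 135 + 66 + 6 (">chr3\n")                    -> offset 207
```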