diff --git a/docs/source/references.bib b/docs/source/references.bib
index 3c0fb5ee7..b32bd028a 100644
--- a/docs/source/references.bib
+++ b/docs/source/references.bib
@@ -611,6 +611,40 @@ @article{Suzuki1992
     year={1992},
     doi={10.1016/0375-9601(92)90335-J}
 }
+@article{Rupprecht2026,
+    title={Faster matrix product state preparation by exploiting symmetry-induced block-sparsity},
+    author={Felix Rupprecht and Sabine W{"o}lk},
+    journal={arXiv},
+    year={2026},
+    doi={10.48550/arXiv.2605.28489},
+    url={https://arxiv.org/abs/2605.28489}
+}
+@software{Rupprecht2026Zenodo,
+    title={Code and Assets for: Faster matrix product state preparation by exploiting symmetry-induced block-sparsity},
+    author={Felix Rupprecht and Sabine W{"o}lk},
+    year={2026},
+    publisher={Zenodo},
+    doi={10.5281/zenodo.20393500}
+}
+@article{Berry2025,
+    title={Rapid Initial-State Preparation for the Quantum Simulation of Strongly Correlated Molecules},
+    author={Dominic W. Berry and Yu Tong and Tanuj Khattar and Alec White and Tae In Kim and Guang Hao Low and Sergio Boixo and Zhiyan Ding and Lin Lin and Seunghoon Lee and Garnet Kin-Lic Chan and Ryan Babbush and Nicholas C. Rubin},
+    journal={PRX Quantum},
+    volume={6},
+    number={2},
+    pages={020327},
+    year={2025},
+    doi={10.1103/PRXQuantum.6.020327}
+}
+@article{Clements2017,
+    title={An Optimal Design for Universal Multiport Interferometers},
+    author={William R. Clements and Peter C. Humphreys and Benjamin J. Metcalf and W. Steven Kolthammer and Ian A. Walmsley},
+    year={2017},
+    eprint={1603.08788},
+    archivePrefix={arXiv},
+    primaryClass={physics.optics},
+    url={https://arxiv.org/abs/1603.08788},
+}
 @article{Verstraete2005,
     title={Mapping local {Hamiltonians} of fermions onto local {Hamiltonians} of spins},
     author={F. Verstraete and J. I. Cirac},
diff --git a/python/src/qdk_chemistry/algorithms/registry.py b/python/src/qdk_chemistry/algorithms/registry.py
index 64aca7972..15407819c 100644
--- a/python/src/qdk_chemistry/algorithms/registry.py
+++ b/python/src/qdk_chemistry/algorithms/registry.py
@@ -794,7 +794,11 @@ def _register_python_algorithms():
     from qdk_chemistry.algorithms.propagator import MagnusPropagator  # noqa: PLC0415
     from qdk_chemistry.algorithms.qubit_hamiltonian_solver import DenseMatrixSolver, SparseMatrixSolver  # noqa: PLC0415
     from qdk_chemistry.algorithms.qubit_mapper import QdkQubitMapper  # noqa: PLC0415
-    from qdk_chemistry.algorithms.state_preparation import SparseIsometryGF2XStatePreparation  # noqa: PLC0415
+    from qdk_chemistry.algorithms.state_preparation import (  # noqa: PLC0415
+        MPSSequentialStatePreparation,
+        MPSSparseStatePreparation,
+        SparseIsometryGF2XStatePreparation,
+    )
     from qdk_chemistry.algorithms.term_grouper import (  # noqa: PLC0415
         FullCommutingTermGrouper,
         IdentityTermGrouper,
@@ -804,6 +808,8 @@ def _register_python_algorithms():
 
     register(lambda: QdkEnergyEstimator())
     register(lambda: SparseIsometryGF2XStatePreparation())
+    register(lambda: MPSSequentialStatePreparation())
+    register(lambda: MPSSparseStatePreparation())
     register(lambda: DenseMatrixSolver())
     register(lambda: SparseMatrixSolver())
     register(lambda: QdkQubitMapper())
diff --git a/python/src/qdk_chemistry/algorithms/state_preparation/__init__.py b/python/src/qdk_chemistry/algorithms/state_preparation/__init__.py
index 81cca543c..52b104a32 100644
--- a/python/src/qdk_chemistry/algorithms/state_preparation/__init__.py
+++ b/python/src/qdk_chemistry/algorithms/state_preparation/__init__.py
@@ -9,6 +9,12 @@
 # --------------------------------------------------------------------------------------------
 
 from qdk_chemistry.algorithms.state_preparation.identity import identity_state_prep
+from qdk_chemistry.algorithms.state_preparation.mps_sequential import (
+    MPSSequentialStatePreparation,
+)
+from qdk_chemistry.algorithms.state_preparation.mps_sparse import (
+    MPSSparseStatePreparation,
+)
 from qdk_chemistry.algorithms.state_preparation.sparse_isometry import (
     SparseIsometryGF2XStatePreparation,
 )
@@ -19,6 +25,8 @@
 )
 
 __all__ = [
+    "MPSSequentialStatePreparation",
+    "MPSSparseStatePreparation",
     "SparseIsometryGF2XStatePreparation",
     "StatePreparationFactory",
     "StatePreparationSettings",
diff --git a/python/src/qdk_chemistry/algorithms/state_preparation/mps_sequential.py b/python/src/qdk_chemistry/algorithms/state_preparation/mps_sequential.py
new file mode 100644
index 000000000..5cefcf5b4
--- /dev/null
+++ b/python/src/qdk_chemistry/algorithms/state_preparation/mps_sequential.py
@@ -0,0 +1,965 @@
+"""Matrix Product State (MPS) state preparation via sequential site unitary synthesis.
+
+Implements the MPS state preparation algorithm based on
+:cite:`Berry2025`. Each site unitary is decomposed based on Appendix B in
+:cite:`Rupprecht2026`.
+
+Attribution
+-----------
+The unitary synthesis is based on code originally published by Felix Rupprecht
+on Zenodo :cite:`Rupprecht2026Zenodo` under Apache 2.0 license.
+The implementation has been rewritten for integration into QDK Chemistry.
+
+References
+----------
+    Felix Rupprecht and Sabine Wölk. (2026). Faster matrix product state preparation by
+    exploiting symmetry-induced block-sparsity.
+    https://arxiv.org/pdf/2605.28489. Zenodo: https://zenodo.org/records/20393500.
+
+    Dominic W. Berry et al. (2025). Rapid Initial-State Preparation for the Quantum Simulation of
+    Strongly Correlated Molecules. PRX Quantum 6, 020327.
+    https://doi.org/10.1103/PRXQuantum.6.020327.
+
+    William R. Clements et al. (2017). An Optimal Design for Universal Multiport Interferometers.
+    https://arxiv.org/abs/1603.08788.
+
+"""
+
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+from collections import deque
+
+import numpy as np
+
+from qdk_chemistry.data.circuit import Circuit, QsharpFactoryData
+from qdk_chemistry.data.mps_wavefunction import MPSWavefunction
+from qdk_chemistry.utils.qsharp import QSHARP_UTILS
+
+from .state_preparation import StatePreparation, StatePreparationSettings
+
+__all__: list[str] = [
+    "MPSSequentialStatePreparation",
+]
+
+
+class MPSSequentialStatePreparationSettings(StatePreparationSettings):
+    """Settings for MPS sequential state preparation."""
+
+    def __init__(self):
+        """Initialize the MPSSequentialStatePreparationSettings."""
+        super().__init__()
+        self._set_default("rotation_bits", "int", 10, "Phase gradient precision.")
+        self._set_default(
+            "fast_resource_estimation",
+            "bool",
+            False,
+            "Only synthesize one site unitary to reduce classical preprocessing overhead. "
+            "Valid for resource estimation (with BeginEstimateCaching) but not simulation.",
+        )
+
+
+class MPSSequentialStatePreparation(StatePreparation):
+    r"""Matrix Product State (MPS) state preparation using sequential unitary synthesis.
+
+    Prepare the state sequentially, qubit-by-qubit (2 qubits per site), using an
+    ancilla register that stores the virtual bond dimension. Each site unitary
+    is decomposed based on Appendix B in :cite:`Rupprecht2026`, and synthesized
+    into a quantum circuit via Givens rotation layers with QROAM (Quantum
+    Read-Only Access Memory) angle loading and phase gradient rotations.
+
+    Attribution
+    -----------
+    The unitary synthesis is based on code originally published by Felix Rupprecht
+    on Zenodo :cite:`Rupprecht2026Zenodo` under Apache 2.0 license.
+    The implementation has been rewritten for integration into QDK Chemistry.
+    """
+
+    def __init__(self):
+        """Initialize the MPS sequential state preparation algorithm."""
+        super().__init__()
+        self._settings = MPSSequentialStatePreparationSettings()
+
+    def name(self) -> str:
+        """Return the algorithm name.
+
+        Returns:
+            str: The name ``"mps_sequential"``
+
+        """
+        return "mps_sequential"
+
+    def _run_impl(self, wavefunction: MPSWavefunction) -> Circuit:
+        """Return a circuit to prepare an MPS state.
+
+        Args:
+            wavefunction: An MPSWavefunction containing the tensors.
+
+        Returns:
+            A Circuit object implementing the MPS state preparation.
+
+        Raises:
+            TypeError: If wavefunction is not an MPSWavefunction instance.
+
+        """
+        if not isinstance(wavefunction, MPSWavefunction):
+            raise TypeError(f"MPSSequentialStatePreparation requires an MPSWavefunction, got {type(wavefunction)}.")
+
+        fast_re = self._settings.get("fast_resource_estimation")
+
+        data = generate_mps_preparation_data(wavefunction.tensors, fast_resource_estimation=fast_re)
+
+        rotation_bits = self._settings.get("rotation_bits")
+
+        if fast_re:
+            params = data.to_qsharp_params_grouped(rotation_bits)
+            program = QSHARP_UTILS.MPSSequential.MakeMPSSequentialCircuitGrouped
+        else:
+            params = data.to_qsharp_params(rotation_bits)
+            program = QSHARP_UTILS.MPSSequential.MakeMPSSequentialCircuit
+
+        qsharp_factory = QsharpFactoryData(
+            program=program,
+            parameter=params,
+        )
+
+        # Build composable op for QPE composition
+        if fast_re:
+            op_params = QSHARP_UTILS.MPSSequential.MPSSequentialGroupedParams(**params)
+            qsharp_op = QSHARP_UTILS.MPSSequential.MakeMPSSequentialOpGrouped(op_params)
+        else:
+            op_params = QSHARP_UTILS.MPSSequential.MPSSequentialParams(**params)
+            qsharp_op = QSHARP_UTILS.MPSSequential.MakeMPSSequentialOp(op_params)
+
+        return Circuit(qsharp_factory=qsharp_factory, qsharp_op=qsharp_op, encoding="jordan-wigner")
+
+
+# ---------------------------------------------------------------------------
+# Data containers for unitary decomposition
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class GivensLayerData:
+    """Result of decomposing a unitary into Givens rotation layers.
+
+    Stores the factorization ``U = D · L_l · ... · L_1`` where each ``L_j``
+    is a layer of parallel R_y rotations and ``D`` is a ±1 sign matrix.
+    """
+
+    layer_angles: list[list[float]]
+    """Per-layer R_y rotation angles for each parallel slot."""
+
+    layer_shifted: list[bool]
+    """Whether each layer uses odd-indexed pairs (True) or even (False)."""
+
+    phases: list[bool]
+    """Diagonal sign flips (True where entry is -1)."""
+
+
+@dataclass
+class SiteUnitaryData:
+    r"""Decomposition data for a single MPS site unitary.
+
+    Holds the 7-matrix Cosine-Sine Decomposition(CSD) from Appendix B of
+    :cite:`Rupprecht2026` and the Givens-layer synthesis of each component.
+
+    The circuit applies these components in order (see Fig. 5 of the paper)::
+
+        V -> UCR(d_0') -> CNOT -> W_0 -> UCR(d_1') -> CNOT -> W_1 -> UCR(d_2') -> U
+
+    where each UCR (Uniformly Controlled Rotation) is a multiplexed R_y
+    rotation addressed by the ancilla register, CNOT is a Controlled-NOT
+    gate, and U = diag(u_0, u_1, u_2, u_3) is block-diagonal.
+    """
+
+    v: GivensLayerData
+    """Givens layers for V (right unitary, pushed from next site)."""
+
+    rot_angles: list[list[float]]
+    """UCR (Uniformly Controlled Rotation) angles for each of the 3 rotation steps.
+
+    Format: ``[rot0, rot1, rot2]``.
+    """
+
+    w0: GivensLayerData
+    """Givens layers for W_0 (mixing unitary, controlled by site[0])."""
+
+    w1: GivensLayerData
+    """Givens layers for W_1 (mixing unitary, controlled by site[1])."""
+
+    u: GivensLayerData
+    """Givens layers for U (block-diagonal unitary on ancilla+site)."""
+
+
+@dataclass
+class MPSPreparationData:
+    """All data needed to drive the MPSSequential Q# operation.
+
+    Produced by :func:`generate_mps_preparation_data` and consumed by
+    :meth:`MPSSequentialStatePreparation._run_impl`.
+    """
+
+    num_sites: int
+    """Number of MPS sites."""
+
+    ancilla_bits: int
+    """Number of ancilla qubits."""
+
+    initial_state_vec: list[float]
+    """Flattened initial state vector for the first site."""
+
+    sites: list[SiteUnitaryData] = field(default_factory=list)
+    """Per-site decomposition data.
+
+    In resource estimation mode, this contains one entry per unique unitary
+    shape rather than per site.
+    """
+
+    site_shape_indices: list[int] | None = None
+    """In resource estimation mode, this maps each site to its corresponding shape group.
+    None means ungrouped."""
+
+    shape_effective_bits: list[int] | None = None
+    """In resource estimation mode, effective ancilla bits per shape group.
+    None means ungrouped."""
+
+    def to_qsharp_params(self, rotation_bits: int) -> dict:
+        """Flatten into the dict expected by the MakeMPSSequentialCircuit Q# operation."""
+        params: dict = {
+            "initialStateVec": self.initial_state_vec,
+            "numSites": self.num_sites,
+            "rotationBits": rotation_bits,
+            "numAncillaQubits": self.ancilla_bits,
+            "siteVLayerAngles": [s.v.layer_angles for s in self.sites],
+            "siteVLayerShifted": [s.v.layer_shifted for s in self.sites],
+            "siteVPhases": [s.v.phases for s in self.sites],
+            "siteRot0Angles": [s.rot_angles[0] for s in self.sites],
+            "siteRot1Angles": [s.rot_angles[1] for s in self.sites],
+            "siteRot2Angles": [s.rot_angles[2] for s in self.sites],
+            "siteW0LayerAngles": [s.w0.layer_angles for s in self.sites],
+            "siteW0LayerShifted": [s.w0.layer_shifted for s in self.sites],
+            "siteW0Phases": [s.w0.phases for s in self.sites],
+            "siteW1LayerAngles": [s.w1.layer_angles for s in self.sites],
+            "siteW1LayerShifted": [s.w1.layer_shifted for s in self.sites],
+            "siteW1Phases": [s.w1.phases for s in self.sites],
+            "siteULayerAngles": [s.u.layer_angles for s in self.sites],
+            "siteULayerShifted": [s.u.layer_shifted for s in self.sites],
+            "siteUPhases": [s.u.phases for s in self.sites],
+        }
+        return params
+
+    def to_qsharp_params_grouped(self, rotation_bits: int) -> dict:
+        """Flatten into the dict expected by MakeMPSSequentialCircuitGrouped.
+
+        Passes one representative per unique shape + a site-to-shape mapping.
+        This minimizes data transfer while enabling accurate per-shape caching
+        in the Q# resource estimator.
+        """
+        assert self.site_shape_indices is not None, "Grouped mode requires site_shape_indices"
+        assert self.shape_effective_bits is not None, "Grouped mode requires shape_effective_bits"
+
+        params: dict = {
+            "initialStateVec": self.initial_state_vec,
+            "numSites": self.num_sites,
+            "rotationBits": rotation_bits,
+            "numAncillaQubits": self.ancilla_bits,
+            "siteShapeIndices": self.site_shape_indices,
+            "shapeEffectiveBits": self.shape_effective_bits,
+            "shapeVLayerAngles": [s.v.layer_angles for s in self.sites],
+            "shapeVLayerShifted": [s.v.layer_shifted for s in self.sites],
+            "shapeVPhases": [s.v.phases for s in self.sites],
+            "shapeRot0Angles": [s.rot_angles[0] for s in self.sites],
+            "shapeRot1Angles": [s.rot_angles[1] for s in self.sites],
+            "shapeRot2Angles": [s.rot_angles[2] for s in self.sites],
+            "shapeW0LayerAngles": [s.w0.layer_angles for s in self.sites],
+            "shapeW0LayerShifted": [s.w0.layer_shifted for s in self.sites],
+            "shapeW0Phases": [s.w0.phases for s in self.sites],
+            "shapeW1LayerAngles": [s.w1.layer_angles for s in self.sites],
+            "shapeW1LayerShifted": [s.w1.layer_shifted for s in self.sites],
+            "shapeW1Phases": [s.w1.phases for s in self.sites],
+            "shapeULayerAngles": [s.u.layer_angles for s in self.sites],
+            "shapeULayerShifted": [s.u.layer_shifted for s in self.sites],
+            "shapeUPhases": [s.u.phases for s in self.sites],
+        }
+        return params
+
+
+# ---------------------------------------------------------------------------
+# Unitary synthesis helpers
+# ---------------------------------------------------------------------------
+
+
+def generate_mps_preparation_data(
+    tensors: Sequence[np.ndarray],
+    fast_resource_estimation: bool = False,
+) -> MPSPreparationData:
+    """Compute all data needed for the MPSSequential Q# operation.
+
+    Performs CSD + Givens layer decomposition for each site.
+    Returns structured data with raw angles (Double) and phases (Bool) --
+    Q# handles angle quantization internally.
+
+    Parameters
+    ----------
+    tensors : sequence of np.ndarray
+        MPS tensors. ``tensors[i]`` has shape ``(chi_left, d, chi_right)``.
+    fast_resource_estimation : bool
+        If True, for each unitary of a certain shapes, only decompose a single
+        representative site unitary and replicate its data for all sites.
+        This is valid for resource estimation but NOT for simulation.
+
+    Returns
+    -------
+    MPSPreparationData
+        Structured preparation data. Call ``.to_qsharp_params(rotation_bits)``
+        to flatten into the dict expected by the Q# operation.
+
+    """
+    num_sites = len(tensors)
+    d = tensors[0].shape[1]
+
+    # Determine consistent ancilla size across all sites
+    max_ancilla_dim = 1
+    for i in range(1, num_sites):
+        chi_left, _, chi_right = tensors[i].shape
+        local_bits = int(np.ceil(np.log2(max(chi_left, chi_right)))) if max(chi_left, chi_right) > 1 else 1
+        max_ancilla_dim = max(max_ancilla_dim, 1 << local_bits)
+    chi_1 = tensors[0].shape[2]
+    init_bits = int(np.ceil(np.log2(max(1, chi_1)))) if chi_1 > 1 else 1
+    max_ancilla_dim = max(max_ancilla_dim, 1 << init_bits)
+    ancilla_bits = int(np.ceil(np.log2(max_ancilla_dim))) if max_ancilla_dim > 1 else 1
+    ancilla_dim = 1 << ancilla_bits
+
+    # Per-site decomposition — process in reverse order to propagate V matrices.
+    # Each site's V is absorbed into the previous site's target matrix (via v_from_next),
+    # eliminating the explicit V unitary application.
+    sites: list[SiteUnitaryData] = []
+    v_from_next: np.ndarray | None = None
+    site_shape_indices: list[int] | None = None
+    unique_bits: list[int] | None = None
+    if fast_resource_estimation and num_sites > 2:
+        # Group sites by shape.
+        # Sites with the same max(chi_left, chi_right) have the similar circuit cost.
+        site_effective_bits: list[int] = []
+        for i in range(1, num_sites):
+            chi_left, _, chi_right = tensors[i].shape
+            eff = max(chi_left, chi_right)
+            bits = int(np.ceil(np.log2(eff))) if eff > 1 else 1
+            site_effective_bits.append(bits)
+
+        unique_bits = sorted(set(site_effective_bits))
+        shape_to_idx = {b: idx for idx, b in enumerate(unique_bits)}
+        site_shape_indices = [shape_to_idx[b] for b in site_effective_bits]
+
+        # Generate dummy data at the padded ancilla dimension for all shapes.
+        # Normal mode pads all sites to ancilla_dim, so fast mode must match.
+        shape_representatives: list[SiteUnitaryData] = []
+        for _ in unique_bits:
+            site_data = _make_dummy_site_data(ancilla_dim)
+            shape_representatives.append(site_data)
+        unique_bits = [ancilla_bits] * len(unique_bits)
+
+        sites = shape_representatives
+    else:
+        # Reverse-order decomposition: last site first, propagating V backwards.
+        sites_reversed: list[SiteUnitaryData] = []
+        for i in range(num_sites - 1, 0, -1):
+            site_data, v_mat = _decompose_site_with_v(tensors[i], ancilla_dim, v_from_next=v_from_next)
+            sites_reversed.append(site_data)
+            v_from_next = v_mat
+        sites = list(reversed(sites_reversed))
+
+    # Initial state from first tensor, absorbing V from site 1's decomposition.
+    if fast_resource_estimation:
+        rng = np.random.default_rng(0)
+        vec = rng.standard_normal(d * ancilla_dim)
+        vec /= np.linalg.norm(vec)
+        initial_state_vec = vec.tolist()
+    else:
+        first_tensor = tensors[0]
+        chi_1 = first_tensor.shape[2]
+        init_state = first_tensor.transpose(1, 2, 0).sum(axis=2)  # (d, chi_1)
+        init_padded = np.zeros((d, ancilla_dim))
+        init_padded[:, :chi_1] = init_state
+        # Absorb the V matrix from site 1 into the initial state
+        if v_from_next is not None:
+            v_pad = np.eye(ancilla_dim, dtype=np.float64)
+            v_pad[: v_from_next.shape[0], : v_from_next.shape[1]] = np.asarray(v_from_next).real
+            # V acts on the ancilla (right) dimension of init_padded
+            init_padded = init_padded @ v_pad.T
+        initial_state_vec_arr = init_padded.flatten()
+        norm = np.linalg.norm(initial_state_vec_arr)
+        if norm > 1e-15:
+            initial_state_vec_arr = initial_state_vec_arr / norm
+        initial_state_vec = initial_state_vec_arr.tolist()
+
+    return MPSPreparationData(
+        initial_state_vec=initial_state_vec,
+        num_sites=num_sites,
+        ancilla_bits=ancilla_bits,
+        sites=sites,
+        site_shape_indices=site_shape_indices,
+        shape_effective_bits=unique_bits,
+    )
+
+
+def _make_dummy_site_data(effective_dim: int) -> SiteUnitaryData:
+    """Generate representative site unitary data for resource estimation.
+
+    Uses random orthogonal matrices to produce realistic Givens layer counts
+    and array dimensions, matching what the full CSD pipeline would produce
+    for a generic (non-trivial) unitary. This is much cheaper than the full
+    pipeline (skips tensor reshaping, multi-block QR, and 3* SVD) while
+    giving accurate resource estimates.
+    """
+    dim = effective_dim
+    rng = np.random.default_rng(0)  # fixed seed for reproducibility
+
+    # V is always empty (absorbed into previous site)
+    v_givens = GivensLayerData(layer_angles=[], layer_shifted=[], phases=[False] * dim)
+
+    # W0, W1: decompose random dim x dim orthogonal matrices
+    w0_angles, w0_shifted, w0_phases = decompose_unitary_to_givens(_random_orthogonal(dim, rng))
+    w0_givens = GivensLayerData(layer_angles=w0_angles, layer_shifted=w0_shifted, phases=w0_phases)
+
+    w1_angles, w1_shifted, w1_phases = decompose_unitary_to_givens(_random_orthogonal(dim, rng))
+    w1_givens = GivensLayerData(layer_angles=w1_angles, layer_shifted=w1_shifted, phases=w1_phases)
+
+    # UCR rotation angles: random values (resource cost depends on array length, not values)
+    rot_angles = [rng.uniform(-1, 1, size=dim).tolist() for _ in range(3)]
+
+    # U: block-diagonal of 4 random dim x dim orthogonal blocks
+    blocks = [_random_orthogonal(dim, rng) for _ in range(4)]
+    u_angles, u_shifted, u_phases = decompose_block_diagonal_to_givens(blocks)
+    u_givens = GivensLayerData(layer_angles=u_angles, layer_shifted=u_shifted, phases=u_phases)
+
+    return SiteUnitaryData(v=v_givens, rot_angles=rot_angles, w0=w0_givens, w1=w1_givens, u=u_givens)
+
+
+def _random_orthogonal(dim: int, rng: np.random.Generator) -> np.ndarray:
+    """Generate a random orthogonal matrix via QR decomposition."""
+    raw = rng.standard_normal((dim, dim))
+    q, _ = np.linalg.qr(raw)
+    return q
+
+
+def _pad_and_givens(mat: np.ndarray, dim: int) -> GivensLayerData:
+    """Pad a real unitary to ``dim x dim`` and decompose into Givens layers."""
+    padded = np.eye(dim, dtype=np.float64)
+    padded[: mat.shape[0], : mat.shape[1]] = np.asarray(mat).real
+    angles, shifted, phases = decompose_unitary_to_givens(padded)
+    return GivensLayerData(layer_angles=angles, layer_shifted=shifted, phases=phases)
+
+
+def _d_prime_to_ucr_angles(d_prime: np.ndarray, dim: int) -> list[float]:
+    """Convert a sin-component diagonal to UCR R_y angles.
+
+    Each angle is ``2*arcsin(d'[k])``.
+    """
+    return [2.0 * float(np.arcsin(np.clip(d_prime[k] if k < len(d_prime) else 0.0, -1, 1))) for k in range(dim)]
+
+
+def _decompose_site_with_v(
+    tensor: np.ndarray, ancilla_dim: int, v_from_next: np.ndarray | None = None
+) -> tuple[SiteUnitaryData, np.ndarray]:
+    """CSD-decompose one MPS site tensor, returning both SiteUnitaryData and raw V.
+
+    When ``v_from_next`` is provided (from the next site's decomposition), it is
+    absorbed into this site's target matrix. The returned V is for propagation to
+    the previous site (or initial state). The V in the returned SiteUnitaryData
+    is always set to identity since V is never applied in the circuit — it's
+    always absorbed by the previous entity (another site or the initial state).
+    """
+    dim = ancilla_dim
+
+    data = compute_site_unitary_dense_data(tensor, v_from_next=v_from_next, ancilla_dim=dim)
+
+    d_0_, d_1_, d_2_ = data["d_prime"]
+    u_0, u_1, u_2, u_3 = data["u"]
+
+    # V is never applied in the circuit — always propagated to the previous site.
+    v_givens = GivensLayerData(layer_angles=[], layer_shifted=[], phases=[False] * dim)
+    w0_givens = _pad_and_givens(data["w_0"], dim)
+    w1_givens = _pad_and_givens(data["w_1"], dim)
+
+    # UCR (Uniformly Controlled Rotation) angles from the 3 sin-component diagonals
+    rot_angles = [
+        _d_prime_to_ucr_angles(d_0_, dim),
+        _d_prime_to_ucr_angles(d_1_, dim),
+        _d_prime_to_ucr_angles(d_2_, dim),
+    ]
+
+    # U: block-diagonal Givens decomposition
+    u_block_mats = []
+    for u_b in [u_0, u_1, u_2, u_3]:
+        u_pad = np.eye(dim, dtype=np.float64)
+        u_pad[: u_b.shape[0], : u_b.shape[1]] = np.asarray(u_b).real
+        u_block_mats.append(u_pad)
+    u_angles, u_shifted, u_phases = decompose_block_diagonal_to_givens(u_block_mats)
+    u_givens = GivensLayerData(layer_angles=u_angles, layer_shifted=u_shifted, phases=u_phases)
+
+    site_data = SiteUnitaryData(
+        v=v_givens,
+        rot_angles=rot_angles,
+        w0=w0_givens,
+        w1=w1_givens,
+        u=u_givens,
+    )
+    return site_data, data["v"]
+
+
+def compute_site_unitary_dense_data(
+    tensor: np.ndarray,
+    v_from_next: np.ndarray | None,
+    ancilla_dim: int,
+) -> dict:
+    r"""Compute the 7-matrix CSD of a site unitary.
+
+    Given the MPS tensor ``M_i`` of shape (left, 4, right), the target
+    isometry is ``U' = [A_0; A_1; A_2; A_3]`` where ``A_j = (M_i^j)^T``
+    (each block is dim x width). This function decomposes ``U'`` into the
+    7-matrix factorization from Appendix B of :cite:`Rupprecht2026` (Fig. 5)::
+
+        U' = diag(U_0..U_3)              -- block-diagonal unitary
+           · [[I,0,0,0],[0,I,0,0],       -- rotation with D_2/D_2'
+              [0,0,D_2,·],[0,0,D_2',·]]
+           · [[I,0,0,0],[0,I,0,0],       -- W_1 (controlled by site[0])
+              [0,0,W_1,0],[0,0,0,W_1]]
+           · [[I,0,0,0],[0,D_1,·,0],     -- rotation with D_1/D_1'
+              [0,D_1',·,0],[0,0,0,I]]
+           · [[I,0,0,0],[0,W_0,0,0],     -- W_0 (controlled by site[1])
+              [0,0,W_0,0],[0,0,0,I]]
+           · [[D_0,·,0,0],[D_0',·,0,0],  -- rotation with D_0/D_0'
+              [0,0,D_0,·],[0,0,D_0',·]]
+           · diag(V, V, V, V)            -- V (pushed to prior site)
+
+    The circuit (Fig. 5) applies these right-to-left::
+
+        V -> UCR(D_0') -> CNOT -> W_0 -> UCR(D_1') -> CNOT -> W_1 -> UCR(D_2') -> U
+
+    **Algorithm (three peeling steps):**
+
+    1. QR-decompose the lower 3
+       blocks ``[A_1; A_2; A_3]`` = ``[B_2; B_3; B_4] * R``.
+    2. QR-decompose the lower 2 of those: ``[B_3; B_4]`` = ``[C_3; C_4] * S``.
+    3. Apply ``decompose_2d`` three times (bottom-to-top) to peel off pairs:
+
+       - ``[C_3; C_4]`` → ``(U_2, U_3, D_2, D_2', V'')``
+       - ``[B_2; S]``   → ``(U_1, _, D_1, D_1', V')``
+       - ``[A_0; R]``   → ``(U_0, _, D_0, D_0', V)``
+
+    Each ``decompose_2d`` call returns *both* diagonals ``(D, D')`` of the
+    rotation block ``[[D, -D'], [D', D]]`` (see ``decompose_2d`` docs). Only
+    the ``D'`` values (the sine components) are needed for the circuit's
+    R_y rotation angles ``theta = 2·arcsin(D'[k])``. The mixing unitaries
+    are ``W_0 = V' @ _`` and ``W_1 = V'' @ _`` (products of the intermediate
+    V and U factors from adjacent peeling steps).
+
+    Parameters
+    ----------
+    tensor : np.ndarray of shape (left, d, right)
+        The tensor for this site (d=4 for two-qubit sites).
+    v_from_next : np.ndarray or None
+        The V matrix from the next site's decomposition (applied to the
+        incoming ancilla register). None for the last site.
+    ancilla_dim : int
+        The ancilla register dimension (must be a power of 2).
+
+    Returns
+    -------
+    dict
+        - ``'u'``: tuple ``(u_0, u_1, u_2, u_3)`` — block-diagonal unitaries.
+        - ``'d_prime'``: tuple ``(d_0', d_1', d_2')`` — sin-component diagonals
+          for the 3 UCR layers.
+        - ``'w_0'``, ``'w_1'``: mixing unitaries on the ancilla register.
+        - ``'v'``: right unitary to be pushed to the previous site's circuit.
+        - ``'ancilla_dim'``: the ancilla dimension used.
+
+    """
+    left, site_dim, _ = tensor.shape
+    dim = ancilla_dim
+
+    # Build target isometry U' = [A_0; A_1; A_2; A_3] of shape (4·dim, left).
+    # Each A_j block has shape (dim, left).
+    target = tensor.transpose(1, 2, 0)  # (d, right, left)
+    if v_from_next is not None:
+        target = np.einsum("ij,djk->dik", v_from_next, target)
+    padded = np.pad(target, ((0, 0), (0, dim - target.shape[1]), (0, 0)))
+    matrix = padded.reshape(site_dim * dim, left)
+
+    # --- Step 1: QR on lower 3 blocks [A_1; A_2; A_3] ---
+    # b_full (3*dim x 3*dim) has orthonormal columns, r (3*dim x left) is upper triangular.
+    # Split b_full into B_2 = b_full[:dim], and [B_3; B_4] = b_full[dim:].
+    b_full, r = np.linalg.qr(matrix[dim:, :], mode="complete")
+
+    # --- Step 2: QR on lower 2 of B: [B_3; B_4] ---
+    # c (2*dim x 2*dim) -> C_3 = c[:dim], C_4 = c[dim:].
+    # s (2*dim x dim) is upper triangular.
+    c, s = np.linalg.qr(b_full[dim:, :dim], mode="complete")
+
+    # --- Step 3: Three peeling decompositions (bottom → top) ---
+    # Peel [C_3; C_4] → rotation D_2/D_2' and unitaries U_2, U_3
+    u_2, u_3, _d_2, d_2_, v__ = decompose_2d(c[:dim, :dim], c[dim:, :dim])
+
+    # Peel [B_2; S] → rotation D_1/D_1' and unitary U_1
+    u_1, u_dummy, _d_1, d_1_, v_ = decompose_2d(b_full[:dim, :dim], s[:dim, :])
+
+    # Peel [A_0; R] → rotation D_0/D_0' and unitary U_0
+    u_0, u_top, _d_0, d_0_, v = decompose_2d(matrix[:dim, :], r[:dim, :])
+
+    d_0_ = _pad_to_power_of_2(np.asarray(d_0_).real, dim)
+    d_1_ = np.asarray(d_1_).real
+    d_2_ = np.asarray(d_2_).real
+
+    # Mixing unitaries: W_0 = V' @ U_top,  W_1 = V'' @ U_dummy
+    # (products of the intermediate right-unitary and leftover factor from
+    # the adjacent peeling step)
+    w_0 = v_ @ u_top
+    w_1 = v__ @ u_dummy
+
+    return {
+        "u": (u_0, u_1, u_2, u_3),
+        "d_prime": (d_0_, d_1_, d_2_),
+        "w_0": w_0,
+        "w_1": w_1,
+        "v": v,
+        "ancilla_dim": dim,
+    }
+
+
+def decompose_2d(a: np.ndarray, b: np.ndarray):
+    r"""Decompose a 2-block column matrix via SVD + polar decomposition.
+
+    Given matrices ``a`` (shape m, k) and ``b`` (shape m, k) whose vertical
+    stack ``[a; b]`` has orthonormal columns, compute the factorization from
+    Eq. (30) of :cite:`Berry2025` and the Lemma in
+    Appendix B of :cite:`Rupprecht2026`::
+
+        [a]   [u_1   0 ] [D_1] [v]
+        [b] = [ 0   u_2] [D_2] [v]
+
+    where ``u_1``, ``u_2`` are unitary (m x m), ``v`` is unitary (k x k),
+    and ``D_1``, ``D_2`` are real diagonal (m x k) matrices satisfying
+    ``D_1^2 + D_2^2 = I``.
+
+    **Why two diagonal vectors are returned:** The full middle block
+    ``[[D_1, -D_2], [D_2, D_1]]`` has orthonormal columns and encodes a
+    rotation: each pair ``(D_1[j], D_2[j])`` satisfies ``cos^2 + sin^2 = 1``.
+    On the quantum circuit, only ``D_2`` (= sin component) is needed to set
+    the R_y rotation angle ``theta = 2*arcsin(D_2[j])``; ``D_1`` (= cos) is
+    implied. Both are returned so callers can verify the decomposition.
+
+    **Algorithm:** ``D_1`` and ``v`` come from the SVD of ``a``. Then ``D_2``
+    and ``u_2`` come from the polar decomposition of ``b @ v^H``, which
+    yields a guaranteed-diagonal ``D_2`` (unlike a QR decomposition which
+    can fail when ``b`` is rank-deficient).
+
+    Parameters
+    ----------
+    a : np.ndarray of shape (m, k)
+        Upper block of the column matrix.
+    b : np.ndarray of shape (m, k)
+        Lower block of the column matrix.
+
+    Returns
+    -------
+    u_1 : np.ndarray of shape (m, m)
+        Left unitary for the upper block (from SVD of ``a``).
+    u_2 : np.ndarray of shape (m, m)
+        Left unitary for the lower block (from polar decomposition).
+    d_1 : np.ndarray of shape (k,)
+        Singular values of ``a``; the cos-like diagonal.
+    d_2 : np.ndarray of shape (k,)
+        Polar factor diagonal of ``b @ v^H``; the sin-like diagonal.
+    v : np.ndarray of shape (k, k)
+        Right unitary (shared by both blocks).
+
+    """
+    u_1, d_1, vt = np.linalg.svd(a, full_matrices=True)
+    v = vt
+
+    bv = b @ vt.conj().T
+    w, s, vt2 = np.linalg.svd(bv, full_matrices=True)
+    width = a.shape[1]
+    u_2 = w.copy()
+    u_2[:width, :width] = w[:width, :width] @ vt2
+    d_2_matrix = (vt2.T.conj() * s) @ vt2
+    d_2 = np.diag(d_2_matrix).real
+
+    return u_1, u_2, d_1, d_2, v
+
+
+def _pad_to_power_of_2(arr: np.ndarray, target_len: int) -> np.ndarray:
+    """Pad or truncate a 1-D array to ``target_len`` (zero-padding)."""
+    if len(arr) >= target_len:
+        return arr[:target_len]
+    return np.concatenate([arr, np.zeros(target_len - len(arr))])
+
+
+def decompose_unitary_to_givens(matrix: np.ndarray):
+    """Decompose a real orthogonal matrix into parallel Givens rotation layers.
+
+    Implements the Clements double-sided decomposition
+    (:cite:`Clements2017`) which guarantees exactly ``dim`` layers for a
+    ``dim x dim`` orthogonal matrix by alternating right-column and
+    left-row Givens eliminations.
+
+    The matrix is factored as ``U = D · L_d · ... · L_1`` where each layer
+    ``L_j`` consists of parallel 2x2 R_y(theta) (Y-axis rotation) Givens
+    rotations acting on neighboring pairs of columns, and ``D`` is a diagonal
+    sign matrix (±1).
+    Layers alternate between "even" (pairs 0-1, 2-3, ...) and "odd"
+    (pairs 1-2, 3-4, ...) forms as in Eq. (3) of the paper.
+
+    Produces the factorization ``U = D · L_d · ... · L_1`` (Eq. 7 in
+    :cite:`Rupprecht2026`) where each ``L_j`` is a layer of
+    parallel R_y rotations and ``D`` is a diagonal ±1 sign matrix.
+
+    Parameters
+    ----------
+    matrix : np.ndarray
+        Real orthogonal matrix of shape (dim, dim). dim must be a power of 2.
+
+    Returns
+    -------
+    layer_angles : list[list[float]]
+        Per-layer R_y rotation angles for each parallel slot.
+    layer_shifted : list[bool]
+        Whether each layer uses odd-indexed pairs (True) or even (False).
+    phases : list[bool]
+        Diagonal sign flips (True where entry is -1).
+
+    """
+    dim = matrix.shape[0]
+    m = matrix.copy().astype(float)
+
+    if dim <= 1:
+        return [], [], [bool(m[0, 0] < 0)] if dim == 1 else []
+
+    # Clements double-sided elimination.
+    # Layer i is shifted=(i%2==1): even layers have even pairs, odd layers odd pairs.
+    num_layers = 1 if dim == 2 else dim
+    # upper_rots[i] collects right-side rotations for layer i
+    upper_rots: list[list[tuple[int, float]]] = [[] for _ in range(num_layers)]
+    # lower_rots[col_idx] collects left-side rotations (combined into layers later)
+    lower_rots: list[list[tuple[int, float]]] = [[] for _ in range(num_layers)]
+
+    for k in range(dim - 1):
+        if k % 2 == 0:
+            # Right column elimination: zero M[row_idx, col_idx] for decreasing col_idx
+            for i, col_idx in enumerate(range(k, -1, -1)):
+                row_idx = dim - 1 - i
+                a_val = m[row_idx, col_idx + 1]
+                b_val = m[row_idx, col_idx]
+                if abs(b_val) < 1e-15:
+                    continue
+                theta = np.arctan2(b_val, a_val)
+                c, s = np.cos(theta), np.sin(theta)
+                col_i = m[:, col_idx].copy()
+                col_j = m[:, col_idx + 1].copy()
+                # Zero col_idx: [[c, s], [-s, c]] right-multiplied on cols (col_idx, col_idx+1)
+                m[:, col_idx] = c * col_i - s * col_j
+                m[:, col_idx + 1] = s * col_i + c * col_j
+                upper_rots[i].append((col_idx, theta))
+        else:
+            # Left row elimination: zero M[row_idx, col_idx] for increasing row_idx
+            for col_idx, row_idx in enumerate(range(dim - k - 1, dim)):
+                a_val = m[row_idx - 1, col_idx]
+                b_val = m[row_idx, col_idx]
+                if abs(b_val) < 1e-15:
+                    continue
+                theta = np.arctan2(b_val, a_val)
+                c, s = np.cos(theta), np.sin(theta)
+                row_i = m[row_idx - 1, :].copy()
+                row_j = m[row_idx, :].copy()
+                m[row_idx - 1, :] = c * row_i + s * row_j
+                m[row_idx, :] = -s * row_i + c * row_j
+                lower_rots[col_idx].append((row_idx - 1, theta))
+
+    # Extract diagonal (±1 for orthogonal matrices)
+    diag_vals = np.diag(m)
+    phases = [bool(d < 0) for d in diag_vals]
+
+    # Combine upper and lower rotations into exactly dim layers.
+    # Lower rotations (left-side) are commuted past the diagonal D to become
+    # right-side rotations: angle is adjusted by sign(D[p]*D[p+1]).
+    # lower_rots[col_idx] maps to layer (num_layers - 1 - col_idx).
+    num_even_slots = dim // 2
+    num_odd_slots = (dim - 1) // 2
+
+    result_angles: list[list[float]] = []
+    result_shifted: list[bool] = []
+
+    for layer_idx in range(num_layers):
+        shifted = layer_idx % 2 == 1
+        if shifted:
+            num_slots = num_odd_slots
+            slot_pairs = [2 * k + 1 for k in range(num_slots)]
+        else:
+            num_slots = num_even_slots
+            slot_pairs = [2 * k for k in range(num_slots)]
+
+        pair_to_slot = {p: i for i, p in enumerate(slot_pairs)}
+        angles = [0.0] * num_slots
+
+        # Add upper (right-side) rotations for this layer
+        for pair, angle in upper_rots[layer_idx]:
+            if pair in pair_to_slot:
+                angles[pair_to_slot[pair]] = angle
+
+        # Add lower (left-side) rotations, adjusted by diagonal signs.
+        # lower_rots[col_idx] → combined into layer (num_layers - 1 - col_idx)
+        lower_col_idx = num_layers - 1 - layer_idx
+        if lower_col_idx < len(lower_rots):
+            for pair, angle in reversed(lower_rots[lower_col_idx]):
+                sign = 1.0 if diag_vals[pair] * diag_vals[pair + 1] > 0 else -1.0
+                adjusted_angle = angle * sign
+                if pair in pair_to_slot:
+                    angles[pair_to_slot[pair]] = adjusted_angle
+
+        # Only include non-trivial layers
+        if any(abs(a) > 1e-15 for a in angles):
+            result_angles.append(angles)
+            result_shifted.append(shifted)
+
+    return result_angles, result_shifted, phases
+
+
+def decompose_block_diagonal_to_givens(blocks: list[np.ndarray]):
+    """Decompose a block-diagonal real unitary into merged Givens rotation layers.
+
+    For the block-diagonal matrix ``diag(u_0, u_1, u_2, u_3)`` from the CSD
+    decomposition, this exploits the block structure so that rotations within
+    each block can be scheduled in parallel across blocks, reducing the total
+    number of Givens layers compared to treating the full matrix as dense.
+    See Sec. 3 and Fig. 3 of :cite:`Rupprecht2026`.
+
+    Each block is decomposed independently using the Clements double-sided
+    algorithm (guaranteed ``block_dim`` layers per block), then layers from
+    different blocks are merged into global layers of the full register.
+    This matches the Qualtran Rust implementation's ``normal_form_blocks``.
+
+    Parameters
+    ----------
+    blocks : list[np.ndarray]
+        List of real orthogonal matrices forming the diagonal blocks.
+
+    Returns
+    -------
+    layer_angles : list[list[float]]
+        Per-layer R_y rotation angles for each parallel slot.
+    layer_shifted : list[bool]
+        Whether each layer uses odd-indexed pairs (True) or even (False).
+    phases : list[bool]
+        Diagonal sign flips (True where entry is -1).
+
+    """
+    total_dim = sum(b.shape[0] for b in blocks)
+    num_even_slots = total_dim // 2
+    num_odd_slots = (total_dim - 1) // 2
+
+    # Decompose each block independently using Clements decomposition.
+    block_starts: list[int] = []
+    block_decomps: list[list[tuple[bool, list[tuple[int, float]]]]] = []
+    all_phases: list[bool] = [False] * total_dim
+
+    offset = 0
+    for block in blocks:
+        block_starts.append(offset)
+        block_dim = block.shape[0]
+        angles, shifted, phases = decompose_unitary_to_givens(block)
+
+        # Convert layer data back to (pair, angle) tuples remapped to full register
+        layers: list[tuple[bool, list[tuple[int, float]]]] = []
+        for layer_angles_l, layer_shifted_l in zip(angles, shifted, strict=False):
+            rots: list[tuple[int, float]] = []
+            if layer_shifted_l:
+                slot_pairs = [2 * k + 1 for k in range((block_dim - 1) // 2)]
+            else:
+                slot_pairs = [2 * k for k in range(block_dim // 2)]
+            for slot_idx, angle in enumerate(layer_angles_l):
+                if abs(angle) > 1e-15:
+                    rots.append((offset + slot_pairs[slot_idx], angle))
+            if rots:
+                layers.append((layer_shifted_l, rots))
+
+        block_decomps.append(layers)
+
+        # Store phases remapped to full register
+        for k, p in enumerate(phases):
+            all_phases[offset + k] = p
+
+        offset += block_dim
+
+    # Merge layers across blocks into global layers.
+    # A block's layer with local shifted=S at block offset O fits in a global
+    # layer with global_shifted=G if (O + S) and G have the same parity
+    # (i.e., the local pair indices map to global pairs of the correct type).
+    # Determine initial global_shifted from the largest block's first layer.
+    max_block_idx = max(range(len(blocks)), key=lambda i: blocks[i].shape[0])
+    if block_decomps[max_block_idx]:
+        first_local_shifted = block_decomps[max_block_idx][0][0]
+        # A locally shifted layer at offset O produces pairs O+1, O+3, ...
+        # These fit in a global shifted layer if O is even, or unshifted if O is odd.
+        global_shifted = first_local_shifted ^ (block_starts[max_block_idx] % 2 == 1)
+    else:
+        global_shifted = False
+
+    # Convert block_decomps to deques for popping from front
+    block_queues = [deque(layers) for layers in block_decomps]
+
+    result_angles: list[list[float]] = []
+    result_shifted: list[bool] = []
+
+    while any(q for q in block_queues):
+        current_offset = 1 if global_shifted else 0
+
+        # Collect rotations from blocks whose front layer aligns with current global layer
+        layer_rots: list[tuple[int, float]] = []
+        for blk_idx, q in enumerate(block_queues):
+            if not q:
+                continue
+            local_shifted, rots = q[0]
+            # Check alignment: a locally shifted layer at block start O
+            # produces global pairs at O+1, O+3, ... which are all odd if O is even.
+            # These fit in global_shifted=True layer.
+            block_offset = block_starts[blk_idx]
+            local_parity = (block_offset + (1 if local_shifted else 0)) % 2
+            global_parity = current_offset % 2
+            if local_parity == global_parity:
+                q.popleft()
+                layer_rots.extend(rots)
+
+        # Build the global layer
+        if global_shifted:
+            num_slots = num_odd_slots
+            slot_pairs = [2 * k + 1 for k in range(num_slots)]
+        else:
+            num_slots = num_even_slots
+            slot_pairs = [2 * k for k in range(num_slots)]
+
+        pair_to_slot = {p: i for i, p in enumerate(slot_pairs)}
+        angles_layer = [0.0] * num_slots
+
+        for pair, angle in layer_rots:
+            if pair in pair_to_slot:
+                angles_layer[pair_to_slot[pair]] = angle
+
+        if any(abs(a) > 1e-15 for a in angles_layer):
+            result_angles.append(angles_layer)
+            result_shifted.append(global_shifted)
+
+        global_shifted = not global_shifted
+
+    return result_angles, result_shifted, all_phases
diff --git a/python/src/qdk_chemistry/algorithms/state_preparation/mps_sparse.py b/python/src/qdk_chemistry/algorithms/state_preparation/mps_sparse.py
new file mode 100644
index 000000000..e9d41115c
--- /dev/null
+++ b/python/src/qdk_chemistry/algorithms/state_preparation/mps_sparse.py
@@ -0,0 +1,663 @@
+"""Matrix Product State (MPS) state preparation exploiting block sparsity.
+
+Implements the sparse MPS preparation method from :cite:`Rupprecht2026`.
+Each site unitary is decomposed as ``U = P_row · V_blockdiag · P_col``
+where ``P_row``, ``P_col`` are permutations (implemented via QROAM + SWAP +
+X-measure) and ``V_blockdiag`` is block-diagonal (synthesized via Givens
+rotation layers per block). This exploits U(1) symmetries (particle number,
+spin) that make MPS tensors block-sparse, yielding 10-30x Toffoli savings
+over the dense method.
+
+Attribution
+-----------
+Based on the method described in :cite:`Rupprecht2026` and the Qualtran
+implementation by Felix Rupprecht (DLR) published on Zenodo
+:cite:`Rupprecht2026Zenodo` under Apache 2.0 license. The implementation
+has been rewritten for integration into QDK Chemistry.
+
+References
+----------
+    Felix Rupprecht and Sabine Wölk. (2026). Faster matrix product state preparation by
+    exploiting symmetry-induced block-sparsity.
+    https://arxiv.org/pdf/2605.28489. Zenodo: https://zenodo.org/records/20393500.
+
+"""
+
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+import numpy as np
+from scipy.sparse import csc_array
+
+from qdk_chemistry.data.circuit import Circuit, QsharpFactoryData
+from qdk_chemistry.data.mps_wavefunction import MPSWavefunction
+from qdk_chemistry.utils.qsharp import QSHARP_UTILS
+
+from .mps_sequential import (
+    GivensLayerData,
+    decompose_block_diagonal_to_givens,
+)
+from .state_preparation import StatePreparation, StatePreparationSettings
+
+__all__: list[str] = [
+    "MPSSparseStatePreparation",
+]
+
+
+class MPSSparseStatePreparationSettings(StatePreparationSettings):
+    """Settings for MPS sparse state preparation."""
+
+    def __init__(self):
+        """Initialize the MPSSparseStatePreparationSettings."""
+        super().__init__()
+        self._set_default("rotation_bits", "int", 10, "Phase gradient precision.")
+
+
+class MPSSparseStatePreparation(StatePreparation):
+    r"""MPS state preparation exploiting block sparsity.
+
+    Prepare the state using permutation-based decomposition. Each site unitary
+    is factored as ``U = P_row · V_blockdiag · P_col``, where permutations are
+    implemented via QROAM and the block-diagonal unitary is synthesized via
+    Givens rotation layers. This exploits the block-sparse structure of MPS
+    tensors arising from U(1) symmetries (particle number, spin conservation).
+
+    Attribution
+    -----------
+    Based on the method in :cite:`Rupprecht2026` and code originally published by
+    Felix Rupprecht on Zenodo :cite:`Rupprecht2026Zenodo` under Apache 2.0 license.
+    """
+
+    def __init__(self):
+        """Initialize the MPS sparse state preparation algorithm."""
+        super().__init__()
+        self._settings = MPSSparseStatePreparationSettings()
+
+    def name(self) -> str:
+        """Return the algorithm name."""
+        return "mps_sparse"
+
+    def _run_impl(self, wavefunction: MPSWavefunction) -> Circuit:
+        """Return a circuit to prepare an MPS state using block-sparsity.
+
+        Args:
+            wavefunction: An MPSWavefunction containing the tensors.
+
+        Returns:
+            A Circuit object implementing the MPS state preparation.
+
+        Raises:
+            TypeError: If wavefunction is not an MPSWavefunction instance.
+
+        """
+        if not isinstance(wavefunction, MPSWavefunction):
+            raise TypeError(f"MPSSparseStatePreparation requires an MPSWavefunction, got {type(wavefunction)}.")
+
+        data = generate_mps_sparse_preparation_data(wavefunction.tensors)
+        rotation_bits = self._settings.get("rotation_bits")
+        params = data.to_qsharp_params(rotation_bits)
+        program = QSHARP_UTILS.MPSSparse.MakeMPSSparseCircuit
+
+        qsharp_factory = QsharpFactoryData(
+            program=program,
+            parameter=params,
+        )
+
+        op_params = QSHARP_UTILS.MPSSparse.MPSSparseParams(**params)
+        qsharp_op = QSHARP_UTILS.MPSSparse.MakeMPSSparseOp(op_params)
+
+        return Circuit(qsharp_factory=qsharp_factory, qsharp_op=qsharp_op, encoding="jordan-wigner")
+
+
+# ---------------------------------------------------------------------------
+# Data containers
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class SparseSiteUnitaryData:
+    r"""Decomposition data for a single sparse MPS site unitary.
+
+    Each site unitary is decomposed as U = P_row · V_blockdiag · P_col.
+
+    The permutations are stored as target mappings: perm_targets[i] gives the
+    target index for basis state |i>. The block-diagonal unitary is stored
+    as Givens layer data.
+    """
+
+    col_perm_targets: list[int]
+    """Column permutation targets: col_perm_targets[i] = P_col(i)."""
+
+    col_inv_perm_targets: list[int]
+    """Inverse column permutation targets: col_inv_perm_targets[j] = P_col^{-1}(j)."""
+
+    row_perm_targets: list[int]
+    """Row permutation targets: row_perm_targets[i] = P_row(i)."""
+
+    row_inv_perm_targets: list[int]
+    """Inverse row permutation targets: row_inv_perm_targets[j] = P_row^{-1}(j)."""
+
+    block_givens: GivensLayerData
+    """Givens layers for the block-diagonal unitary V."""
+
+    target_bits: int
+    """Number of bits in the target register (site + ancilla)."""
+
+
+@dataclass
+class MPSSparsePreparationData:
+    """All data needed to drive the MPSSparse Q# operation."""
+
+    initial_state_vec: list[float]
+    """Flattened initial state vector for the first site."""
+
+    num_sites: int
+    """Number of MPS sites."""
+
+    ancilla_bits: int
+    """Number of ancilla qubits (log2 of ancilla dimension)."""
+
+    sites: list[SparseSiteUnitaryData] = field(default_factory=list)
+    """Per-site decomposition data (one entry per site 1..num_sites-1)."""
+
+    def to_qsharp_params(self, rotation_bits: int) -> dict:
+        """Flatten into the dict expected by the MakeMPSSparseCircuit Q# operation."""
+        d = 4  # physical dimension (2-qubit site register)
+        ancilla_dim = 1 << self.ancilla_bits
+        return {
+            "initialStateVec": self.initial_state_vec,
+            "numSites": self.num_sites,
+            "rotationBits": rotation_bits,
+            "numAncillaQubits": self.ancilla_bits,
+            "siteColPermTargets": [
+                _perm_to_bitstrings(
+                    _remap_perm_to_qsharp_order(s.col_perm_targets, d, ancilla_dim),
+                    s.target_bits,
+                )
+                for s in self.sites
+            ],
+            "siteColInvPermTargets": [
+                _perm_to_bitstrings(
+                    _remap_perm_to_qsharp_order(s.col_inv_perm_targets, d, ancilla_dim),
+                    s.target_bits,
+                )
+                for s in self.sites
+            ],
+            "siteRowPermTargets": [
+                _perm_to_bitstrings(
+                    _remap_perm_to_qsharp_order(s.row_perm_targets, d, ancilla_dim),
+                    s.target_bits,
+                )
+                for s in self.sites
+            ],
+            "siteRowInvPermTargets": [
+                _perm_to_bitstrings(
+                    _remap_perm_to_qsharp_order(s.row_inv_perm_targets, d, ancilla_dim),
+                    s.target_bits,
+                )
+                for s in self.sites
+            ],
+            "siteBlockLayerAngles": [s.block_givens.layer_angles for s in self.sites],
+            "siteBlockLayerShifted": [s.block_givens.layer_shifted for s in self.sites],
+            "siteBlockPhases": [s.block_givens.phases for s in self.sites],
+        }
+
+
+# ---------------------------------------------------------------------------
+# Sparse decomposition algorithm
+# ---------------------------------------------------------------------------
+
+
+def generate_mps_sparse_preparation_data(
+    tensors: Sequence[np.ndarray],
+) -> MPSSparsePreparationData:
+    """Compute all data needed for the MPSSparse Q# operation.
+
+    Performs the permutation + block-diagonal decomposition for each site.
+
+    Parameters
+    ----------
+    tensors : sequence of np.ndarray
+        MPS tensors. ``tensors[i]`` has shape ``(chi_left, d, chi_right)``.
+
+    Returns
+    -------
+    MPSSparsePreparationData
+        Structured preparation data.
+
+    """
+    num_sites = len(tensors)
+    d = tensors[0].shape[1]
+
+    # Determine consistent ancilla size
+    max_ancilla_dim = 1
+    for i in range(1, num_sites):
+        chi_left, _, chi_right = tensors[i].shape
+        local_bits = int(np.ceil(np.log2(max(chi_left, chi_right)))) if max(chi_left, chi_right) > 1 else 1
+        max_ancilla_dim = max(max_ancilla_dim, 1 << local_bits)
+    chi_1 = tensors[0].shape[2]
+    init_bits = int(np.ceil(np.log2(max(1, chi_1)))) if chi_1 > 1 else 1
+    max_ancilla_dim = max(max_ancilla_dim, 1 << init_bits)
+    ancilla_bits = int(np.ceil(np.log2(max_ancilla_dim))) if max_ancilla_dim > 1 else 1
+    ancilla_dim = 1 << ancilla_bits
+
+    # Per-site decomposition
+    sites: list[SparseSiteUnitaryData] = []
+    for i in range(1, num_sites):
+        site_data = _decompose_sparse_site(tensors[i], ancilla_dim)
+        sites.append(site_data)
+
+    # Initial state from first tensor
+    first_tensor = tensors[0]
+    chi_1 = first_tensor.shape[2]
+    init_state = first_tensor.transpose(1, 2, 0).sum(axis=2)  # (d, chi_1)
+    init_padded = np.zeros((d, ancilla_dim))
+    init_padded[:, :chi_1] = init_state
+    initial_state_vec_arr = init_padded.flatten()
+    norm = np.linalg.norm(initial_state_vec_arr)
+    if norm > 1e-15:
+        initial_state_vec_arr = initial_state_vec_arr / norm
+    initial_state_vec = initial_state_vec_arr.tolist()
+
+    return MPSSparsePreparationData(
+        initial_state_vec=initial_state_vec,
+        num_sites=num_sites,
+        ancilla_bits=ancilla_bits,
+        sites=sites,
+    )
+
+
+def _decompose_sparse_site(tensor: np.ndarray, ancilla_dim: int) -> SparseSiteUnitaryData:
+    """Decompose one MPS site tensor using the sparse permutation method.
+
+    Parameters
+    ----------
+    tensor : np.ndarray of shape (chi_left, 4, chi_right)
+        The MPS tensor for this site.
+    ancilla_dim : int
+        The ancilla register dimension (power of 2).
+
+    Returns
+    -------
+    SparseSiteUnitaryData
+        The decomposition data for this site.
+
+    """
+    active_dim = 4 * ancilla_dim
+
+    # Build the target matrix: transpose tensor indices and form CSC sparse matrix
+    # Target matrix has shape (4*dim, chi_left) with columns = bond states
+    target_matrix = _tensor_to_target_matrix(tensor, ancilla_dim)
+
+    # Step 1: Extract rectangles and row permutation
+    rectangles, row_perm = _get_rectangles_and_row_permutation(target_matrix)
+    row_perm = _pad_permutation(row_perm, active_dim)
+
+    # Step 2: Find column permutation to make rectangles square
+    col_perm = _find_column_permutation(rectangles, active_dim)
+
+    # Step 3: Expand rectangles to unitaries
+    blocks = [_expand_to_unitary(rect) for rect in rectangles]
+
+    # Pad with 1x1 identity blocks to fill remaining space
+    used_dim = sum(b.shape[0] for b in blocks)
+    remaining = active_dim - used_dim
+    blocks += [np.eye(1)] * remaining
+
+    # Step 4: Order blocks by size (largest first)
+    # Returns inverted ordering permutation and sorted blocks
+    ordering_perm, blocks = _order_blocks(blocks, active_dim)
+
+    col_composed = [col_perm[ordering_perm[i]] for i in range(active_dim)]
+    col_perm_final = _invert_perm(col_composed)
+    row_perm_final = [row_perm[ordering_perm[i]] for i in range(active_dim)]
+
+    # Step 5: Decompose block-diagonal into Givens layers
+    block_angles, block_shifted, block_phases = decompose_block_diagonal_to_givens(blocks)
+    block_givens = GivensLayerData(
+        layer_angles=block_angles,
+        layer_shifted=block_shifted,
+        phases=block_phases,
+    )
+
+    # Compute inverse permutations for measurement-based unlookup
+    col_inv_perm_final = _invert_perm(col_perm_final)
+    row_inv_perm_final = _invert_perm(row_perm_final)
+
+    # Number of bits in the target register (site + ancilla)
+    target_bits = int(np.log2(active_dim))
+
+    return SparseSiteUnitaryData(
+        col_perm_targets=col_perm_final,
+        col_inv_perm_targets=col_inv_perm_final,
+        row_perm_targets=row_perm_final,
+        row_inv_perm_targets=row_inv_perm_final,
+        block_givens=block_givens,
+        target_bits=target_bits,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Sparse decomposition helpers
+# ---------------------------------------------------------------------------
+
+
+def _tensor_to_target_matrix(tensor: np.ndarray, ancilla_dim: int) -> csc_array:
+    """Build the sparse target matrix from an MPS tensor.
+
+    The target matrix has shape (4 * ancilla_dim, chi_left), where each column
+    corresponds to a left-bond index and the 4 blocks of ancilla_dim rows
+    correspond to the 4 physical states.
+
+    Parameters
+    ----------
+    tensor : np.ndarray of shape (chi_left, 4, chi_right)
+        The MPS tensor.
+    ancilla_dim : int
+        The ancilla dimension (padded, power of 2).
+
+    Returns
+    -------
+    csc_array
+        The sparse target matrix of shape (4 * ancilla_dim, chi_left).
+
+    """
+    chi_left, d, _ = tensor.shape
+    # Reshape: for each physical index p, take the slice tensor[:, p, :] of shape
+    # (chi_left, chi_right) -> transpose to (chi_right, chi_left).
+    # Stack 4 such slices vertically to get (4*chi_right, chi_left), then pad rows.
+    slices = []
+    for p in range(d):
+        mat = tensor[:, p, :].T  # shape (chi_right, chi_left)
+        # Pad rows to ancilla_dim
+        if mat.shape[0] < ancilla_dim:
+            padded = np.zeros((ancilla_dim, chi_left))
+            padded[: mat.shape[0], :] = mat
+            slices.append(padded)
+        else:
+            slices.append(mat[:ancilla_dim, :])
+    full_matrix = np.vstack(slices)  # (4 * ancilla_dim, chi_left)
+    return csc_array(full_matrix)
+
+
+def _get_rectangles_and_row_permutation(
+    matrix: csc_array,
+) -> tuple[list[np.ndarray], list[int]]:
+    """Extract chained rectangular blocks and row permutation from a sparse matrix.
+
+    Walks columns left-to-right, grouping them by shared nonzero rows into
+    contiguous rectangular blocks. Returns the blocks (as dense arrays) and
+    the row permutation that chains them.
+
+    Parameters
+    ----------
+    matrix : csc_array
+        The sparse target matrix.
+
+    Returns
+    -------
+    blocks : list[np.ndarray]
+        Dense rectangular blocks with orthonormal (or near-orthonormal) columns.
+    row_permutation : list[int]
+        Row permutation mapping original row indices to chained order.
+
+    """
+    num_rows, num_cols = matrix.shape
+
+    permutation: list[int] = []
+    blocks: list[np.ndarray] = []
+    seen_rows = np.zeros(num_rows, dtype=bool)
+
+    current_rectangle_rows: list[int] = []
+    current_rectangle_cols: list[int] = []
+
+    for col_idx in range(num_cols):
+        non_zero_rows = matrix.indices[matrix.indptr[col_idx] : matrix.indptr[col_idx + 1]].tolist()
+        new_rows = [r for r in non_zero_rows if not seen_rows[r]]
+
+        if len(new_rows) == len(non_zero_rows) and len(non_zero_rows) > 0:
+            # New block starts
+            if current_rectangle_rows:
+                block = matrix[current_rectangle_rows, :][:, current_rectangle_cols].toarray()
+                blocks.append(block)
+            current_rectangle_rows = list(new_rows)
+            current_rectangle_cols = [col_idx]
+            permutation.extend(new_rows)
+        else:
+            # Continue current block
+            current_rectangle_cols.append(col_idx)
+            for r in new_rows:
+                current_rectangle_rows.append(r)
+                permutation.append(r)
+
+        for r in new_rows:
+            seen_rows[r] = True
+
+    # Flush last block
+    if current_rectangle_rows:
+        block = matrix[current_rectangle_rows, :][:, current_rectangle_cols].toarray()
+        blocks.append(block)
+
+    # Add unseen rows at the end
+    zero_rows = np.where(~seen_rows)[0].tolist()
+    permutation.extend(zero_rows)
+
+    return blocks, permutation
+
+
+def _find_column_permutation(rectangles: list[np.ndarray], dim: int) -> list[int]:
+    """Compute column permutation to transform chained rectangles into squares.
+
+    For each rectangle that is taller than wide, permutes in columns from
+    the right side so each block becomes square (enabling block-diagonal form).
+
+    Parameters
+    ----------
+    rectangles : list[np.ndarray]
+        Chained rectangular blocks.
+    dim : int
+        Total dimension of the active register.
+
+    Returns
+    -------
+    list[int]
+        Column permutation.
+
+    """
+    mapping = list(range(dim))
+
+    col_left = 0
+    col_right = dim
+    diag = 0
+
+    for rect in rectangles:
+        height, width = rect.shape
+        diff = height - width
+
+        if diff > 0:
+            # Move 'diff' columns from the right to fill the gap
+            for i in range(width, col_right - col_left):
+                mapping[col_left + i] += diff
+            for i in range(diff):
+                mapping[col_right - diff + i] = diag + width + i
+            col_left += width
+            col_right -= diff
+        else:
+            col_left += width
+
+        diag += height
+
+    return _invert_perm(mapping)
+
+
+def _expand_to_unitary(rectangle: np.ndarray) -> np.ndarray:
+    """Expand a rectangle with orthonormal columns into a full unitary.
+
+    Uses null-space completion when the rectangle is taller than wide.
+
+    Parameters
+    ----------
+    rectangle : np.ndarray
+        Matrix with orthonormal columns (height >= width).
+
+    Returns
+    -------
+    np.ndarray
+        Square unitary matrix.
+
+    """
+    h, w = rectangle.shape
+    if h == w:
+        return rectangle
+    # Null space of rectangle^T gives the orthogonal complement
+    _, _, vt = np.linalg.svd(rectangle.conj().T)
+    null_space = vt[w:, :].T.conj()
+    return np.hstack([rectangle, null_space])
+
+
+def _order_blocks(blocks: list[np.ndarray], dim: int) -> tuple[list[int], list[np.ndarray]]:
+    """Sort blocks by size (largest first) and return the reordering permutation.
+
+    Parameters
+    ----------
+    blocks : list[np.ndarray]
+        Block matrices forming a block-diagonal.
+    dim : int
+        Total dimension.
+
+    Returns
+    -------
+    permutation : list[int]
+        Inverted ordering permutation.
+    sorted_blocks : list[np.ndarray]
+        Blocks sorted largest-first.
+
+    """
+    mapping = list(range(dim))
+
+    # Sort indices by block size (descending)
+    sorted_indices = sorted(range(len(blocks)), key=lambda i: -blocks[i].shape[0])
+
+    # Compute cumulative offsets
+    offsets = []
+    offset = 0
+    for block in blocks:
+        offsets.append(offset)
+        offset += block.shape[0]
+
+    # Build forward mapping: mapping[old_pos] = new_pos
+    new_offset = 0
+    for idx in sorted_indices:
+        block_size = blocks[idx].shape[0]
+        old_start = offsets[idx]
+        for k in range(block_size):
+            mapping[old_start + k] = new_offset + k
+        new_offset += block_size
+
+    # Return inverted mapping
+    sorted_blocks = [blocks[i] for i in sorted_indices]
+    return _invert_perm(mapping), sorted_blocks
+
+
+# ---------------------------------------------------------------------------
+# Permutation utilities
+# ---------------------------------------------------------------------------
+
+
+def _invert_perm(perm: list[int]) -> list[int]:
+    """Invert a permutation."""
+    inv = [0] * len(perm)
+    for i, p in enumerate(perm):
+        inv[p] = i
+    return inv
+
+
+def _pad_permutation(perm: list[int], target_len: int) -> list[int]:
+    """Pad a permutation to target_len with identity mapping for extra indices."""
+    if len(perm) >= target_len:
+        return perm[:target_len]
+    return perm + list(range(len(perm), target_len))
+
+
+# ---------------------------------------------------------------------------
+# Q# encoding utilities
+# ---------------------------------------------------------------------------
+
+
+def _remap_perm_to_qsharp_order(perm_targets: list[int], d: int, ancilla_dim: int) -> list[int]:
+    """Remap permutation indices from target-matrix order to Q# register order.
+
+    The target matrix uses row = physical_state * ancilla_dim + ancilla_state,
+    but the Q# register (target = newSite + ancilla) with little-endian
+    convention gives value = physical_state + ancilla_state * d.
+
+    This function conjugates the permutation by the reindexing so that
+    SelectSwap (which uses Q# little-endian addressing) applies the correct
+    permutation.
+
+    Parameters
+    ----------
+    perm_targets : list[int]
+        Permutation targets in target-matrix row ordering.
+    d : int
+        Physical dimension (always 4 for 2-qubit site register).
+    ancilla_dim : int
+        Ancilla dimension (2^ancilla_bits).
+
+    Returns
+    -------
+    list[int]
+        Permutation targets reindexed for Q# register ordering.
+
+    """
+    active_dim = d * ancilla_dim
+    qs_perm = [0] * active_dim
+    for v in range(active_dim):
+        # Register value v encodes physical=v%d, ancilla=v//d
+        p = v % d
+        a = v // d
+        # Convert to target matrix row
+        r = p * ancilla_dim + a
+        # Apply permutation in target matrix space
+        r_out = perm_targets[r]
+        # Convert result back to Q# register value
+        p_out = r_out // ancilla_dim
+        a_out = r_out % ancilla_dim
+        v_out = p_out + a_out * d
+        qs_perm[v] = v_out
+    return qs_perm
+
+
+def _perm_to_bitstrings(perm_targets: list[int], num_bits: int) -> list[list[bool]]:
+    """Encode permutation targets as Bool[][] for Q# SelectSwap.
+
+    Each target integer is encoded as a little-endian bit string of length num_bits.
+
+    Parameters
+    ----------
+    perm_targets : list[int]
+        Permutation target indices.
+    num_bits : int
+        Number of bits for each target encoding.
+
+    Returns
+    -------
+    list[list[bool]]
+        Bool[N][num_bits] encoding for Q#.
+
+    """
+    result = []
+    for target in perm_targets:
+        bits = [(target >> b) & 1 == 1 for b in range(num_bits)]
+        result.append(bits)
+    return result
diff --git a/python/src/qdk_chemistry/data/__init__.py b/python/src/qdk_chemistry/data/__init__.py
index c0cff63bb..9440754e4 100644
--- a/python/src/qdk_chemistry/data/__init__.py
+++ b/python/src/qdk_chemistry/data/__init__.py
@@ -111,6 +111,7 @@
 from qdk_chemistry.data.controlled_unitary import ControlledUnitary
 from qdk_chemistry.data.enums.fermion_mode_order import FermionModeOrder
 from qdk_chemistry.data.estimator_data import EnergyExpectationResult, MeasurementData
+from qdk_chemistry.data.mps_wavefunction import MPSWavefunction
 from qdk_chemistry.data.noise_models import QuantumErrorProfile
 from qdk_chemistry.data.qpe_result import QpeResult
 from qdk_chemistry.data.qubit_hamiltonian import QubitHamiltonian
@@ -157,6 +158,7 @@
     "HamiltonianType",
     "LatticeGraph",
     "LayeredPartition",
+    "MPSWavefunction",
     "MajoranaMapping",
     "MeasurementData",
     "ModelOrbitals",
diff --git a/python/src/qdk_chemistry/data/mps_wavefunction.py b/python/src/qdk_chemistry/data/mps_wavefunction.py
new file mode 100644
index 000000000..626ba72a6
--- /dev/null
+++ b/python/src/qdk_chemistry/data/mps_wavefunction.py
@@ -0,0 +1,212 @@
+"""Wavefunction container for Matrix Product State representations."""
+
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+__all__ = ["MPSWavefunction"]
+
+
+class MPSWavefunction:
+    """Container for wavefunctions represented as Matrix Product States (MPS).
+
+    Stores the MPS tensors and provides methods for computing the full state vector
+    and metadata needed by the MPS state preparation algorithm.
+
+    Parameters
+    ----------
+    tensors : sequence of np.ndarray
+        MPS tensors. ``tensors[i]`` has shape ``(chi_left_i, d, chi_right_i)``
+        where ``d`` is the local Hilbert space dimension (typically 4 for
+        two-qubit sites in Jordan-Wigner encoding) and ``chi`` are the bond
+        dimensions.
+    site_dim : int, optional
+        Local site dimension (default 4, i.e. 2 qubits per site).
+
+    Attributes
+    ----------
+    tensors : list[np.ndarray]
+        The MPS tensors.
+    num_sites : int
+        Number of MPS sites.
+    site_dim : int
+        Local Hilbert space dimension per site.
+    bond_dims : list[int]
+        Bond dimensions ``[chi_0, chi_1, ..., chi_N]`` where ``chi_0 = chi_N = 1``
+        for open boundary conditions.
+
+    """
+
+    def __init__(self, tensors: Sequence[np.ndarray], site_dim: int = 4):
+        """Initialize the MPS wavefunction container.
+
+        Args:
+            tensors: Sequence of MPS tensors with shapes (chi_left, d, chi_right).
+            site_dim: Local Hilbert space dimension per site (default 4).
+
+        Raises:
+            ValueError: If tensors are empty, have inconsistent dimensions,
+                or violate open boundary conditions.
+
+        """
+        if not tensors:
+            raise ValueError("MPS tensors must not be empty.")
+
+        self.site_dim = site_dim
+        self.tensors = [np.asarray(t, dtype=np.float64) for t in tensors]
+        self.num_sites = len(self.tensors)
+
+        # Validate tensor shapes
+        for i, t in enumerate(self.tensors):
+            if t.ndim != 3:
+                raise ValueError(f"Tensor {i} must be 3-dimensional, got shape {t.shape}.")
+            if t.shape[1] != site_dim:
+                raise ValueError(f"Tensor {i} has site dimension {t.shape[1]}, expected {site_dim}.")
+
+        # Note: chi_left > 1 on the first tensor is allowed for non-zero spin
+        # systems (singlet embedding), where the left boundary represents
+        # multiple spin projection states.
+
+        # Validate bond dimension consistency
+        for i in range(self.num_sites - 1):
+            chi_right = self.tensors[i].shape[2]
+            chi_left_next = self.tensors[i + 1].shape[0]
+            if chi_right != chi_left_next:
+                raise ValueError(
+                    f"Bond dimension mismatch between site {i} (chi_right={chi_right}) "
+                    f"and site {i + 1} (chi_left={chi_left_next})."
+                )
+
+        self.bond_dims = [self.tensors[0].shape[0]]
+        for t in self.tensors:
+            self.bond_dims.append(t.shape[2])
+
+    @property
+    def max_bond_dim(self) -> int:
+        """Maximum bond dimension (chi_max) across all bonds."""
+        return max(self.bond_dims)
+
+    @property
+    def num_qubits(self) -> int:
+        """Total number of qubits (2 per site for d=4)."""
+        qubits_per_site = int(np.log2(self.site_dim))
+        return self.num_sites * qubits_per_site
+
+    def contract(self) -> np.ndarray:
+        """Contract the MPS to obtain the full state vector.
+
+        Returns
+        -------
+        np.ndarray
+            Normalized state vector of length ``site_dim ** num_sites``.
+
+        """
+        state = self.tensors[0]  # (chi_left, d, chi_1)
+        for tensor in self.tensors[1:]:
+            # state: (chi_left, d^k, chi_prev), tensor: (chi_prev, d, chi_next)
+            left = state.shape[0]
+            num_states = state.shape[1]
+            chi_prev = state.shape[2]
+            chi_in, d, chi_next = tensor.shape
+
+            state_flat = state.reshape(left * num_states, chi_prev)
+            tensor_flat = tensor.reshape(chi_in, d * chi_next)
+            result = state_flat @ tensor_flat
+            state = result.reshape(left, num_states * d, chi_next)
+
+        # Sum over the left boundary index (chi_left) and squeeze chi_right=1.
+        # For chi_left=1 this is equivalent to flatten; for chi_left > 1
+        # (non-zero spin) it sums the spin projection contributions.
+        vec = state.sum(axis=0).flatten()
+        norm = np.linalg.norm(vec)
+        if norm > 1e-15:
+            vec = vec / norm
+        return vec
+
+    @classmethod
+    def random(
+        cls,
+        num_sites: int,
+        bond_dim: int,
+        site_dim: int = 4,
+        rng: np.random.Generator | None = None,
+        right_canonical: bool = True,
+    ) -> MPSWavefunction:
+        """Generate a random MPS with specified bond dimension.
+
+        Parameters
+        ----------
+        num_sites : int
+            Number of sites.
+        bond_dim : int
+            Internal bond dimension (capped by dimensional constraints).
+        site_dim : int
+            Local site dimension (default 4).
+        rng : np.random.Generator or None
+            Random number generator. If None, uses default.
+        right_canonical : bool
+            If True, put the MPS in right-canonical form (default True).
+
+        Returns
+        -------
+        MPSWavefunction
+            A random MPS wavefunction.
+
+        """
+        if rng is None:
+            rng = np.random.default_rng()
+
+        # Determine bond dimensions respecting boundary constraints
+        bond_dims = [1]
+        for i in range(1, num_sites):
+            max_left = bond_dims[-1] * site_dim
+            max_right = site_dim ** min(i, num_sites - i)
+            chi = min(bond_dim, max_left, max_right)
+            bond_dims.append(chi)
+        bond_dims.append(1)
+
+        tensors = []
+        for i in range(num_sites):
+            chi_l = bond_dims[i]
+            chi_r = bond_dims[i + 1]
+            t = rng.standard_normal((chi_l, site_dim, chi_r))
+            tensors.append(t)
+
+        if right_canonical:
+            tensors = _make_right_canonical(tensors)
+
+        return cls(tensors, site_dim=site_dim)
+
+
+def _make_right_canonical(tensors: list[np.ndarray]) -> list[np.ndarray]:
+    """Put MPS tensors in right-canonical form via successive QR decompositions.
+
+    Sweeps from right to left. After this, all tensors except the first
+    satisfy A†A = I (right-isometric).
+    """
+    result = [t.copy() for t in tensors]
+    num_sites = len(result)
+
+    for i in range(num_sites - 1, 0, -1):
+        chi_l, d, chi_r = result[i].shape
+        # Reshape to (chi_l, d * chi_r) and do QR from the right
+        mat = result[i].reshape(chi_l, d * chi_r)
+        # SVD-based right canonicalization: mat = U @ S @ Vt
+        # Vt becomes the new tensor, U @ S is absorbed left
+        q_mat, r_mat = np.linalg.qr(mat.T, mode="reduced")
+        result[i] = q_mat.T.reshape(chi_l, d, chi_r)
+        chi_l_prev, d_prev, _ = result[i - 1].shape
+        left_mat = result[i - 1].reshape(chi_l_prev * d_prev, chi_l)
+        result[i - 1] = (left_mat @ r_mat.T).reshape(chi_l_prev, d_prev, chi_l)
+
+    return result
diff --git a/python/src/qdk_chemistry/utils/qsharp/__init__.py b/python/src/qdk_chemistry/utils/qsharp/__init__.py
index 0742fc79c..c811d0389 100644
--- a/python/src/qdk_chemistry/utils/qsharp/__init__.py
+++ b/python/src/qdk_chemistry/utils/qsharp/__init__.py
@@ -4,12 +4,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License. See LICENSE.txt in the project root for license information.
 # --------------------------------------------------------------------------------------------
+import re
 from pathlib import Path
 
 import qdk
-from qdk import qsharp
+from qdk import TargetProfile, qsharp
 
-__all__ = ["QSHARP_UTILS"]
+__all__ = ["QSHARP_UTILS", "get_qsharp_utils"]
 
 _QS_FILES = [
     Path(__file__).parent / "StatePreparation.qs",
@@ -22,15 +23,52 @@
     Path(__file__).parent / "MeasurementBasis.qs",
 ]
 
+_MPS_PROJECT_ROOT = str(Path(__file__).parent / "mps_sequential")
 
-def get_qsharp_utils():
-    """Returns the Q# namespace for chemistry operations (lazy-loaded)."""
+_state: dict[str, str | None] = {"mode": None}  # "base", "mps", or None
+
+
+def _ensure_base_session():
+    """Ensure interpreter is in Base mode with utility Q# files loaded."""
+    if _state["mode"] == "base":
+        try:
+            _ = qdk.code.QDKChemistry.Utils.StatePreparation
+            return
+        except AttributeError:
+            _state["mode"] = None  # stale — interpreter was reset externally
+    if _state["mode"] == "mps":
+        qsharp.init(target_profile=TargetProfile.Base)
     try:
-        return qdk.code.QDKChemistry.Utils
+        _ = qdk.code.QDKChemistry.Utils.StatePreparation
     except AttributeError:
         code = "\n".join(f.read_text() for f in _QS_FILES)
         qsharp.eval(code)
-        return qdk.code.QDKChemistry.Utils
+    _state["mode"] = "base"
+
+
+def _ensure_mps_session():
+    """Ensure interpreter has MPS project loaded (Unrestricted) plus utility files."""
+    if _state["mode"] == "mps":
+        try:
+            _ = qdk.code.MPSSparse
+            return
+        except AttributeError:
+            _state["mode"] = None  # stale — interpreter was reset externally
+    qsharp.init(project_root=_MPS_PROJECT_ROOT)
+    code = "\n".join(f.read_text() for f in _QS_FILES)
+    qsharp.eval(code)
+    _state["mode"] = "mps"
+
+
+def get_qsharp_utils():
+    """Returns the Q# namespace for chemistry operations (lazy-loaded).
+
+    Initializes the global Q# interpreter with the MPS project on first call,
+    then loads additional Q# utility files via eval. Use this when the MPS
+    project must be available (e.g. for resource estimation of MPS circuits).
+    """
+    _ensure_mps_session()
+    return qdk.code.QDKChemistry.Utils
 
 
 class _QSharpUtilsProxy:
@@ -39,12 +77,20 @@ class _QSharpUtilsProxy:
     def __getattr__(self, name: str):
         """Load Q# code (if necessary) and resolve *name* on the utilities namespace.
 
+        Falls through to MPS namespace for names not found in QDKChemistry.Utils.
+
         Args:
             name: The name of the attribute being accessed on the Q# utilities namespace.
 
         """
-        utils = get_qsharp_utils()
-        return getattr(utils, name)
+        if name == "MPSSequential":
+            _ensure_mps_session()
+            return qdk.code.MPSSequential
+        if name == "MPSSparse":
+            _ensure_mps_session()
+            return qdk.code.MPSSparse
+        _ensure_base_session()
+        return getattr(qdk.code.QDKChemistry.Utils, name)
 
 
 QSHARP_UTILS = _QSharpUtilsProxy()
diff --git a/python/src/qdk_chemistry/utils/qsharp/mps_sequential/qsharp.json b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/qsharp.json
new file mode 100644
index 000000000..0967ef424
--- /dev/null
+++ b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/qsharp.json
@@ -0,0 +1 @@
+{}
diff --git a/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/GivensDecomposition.qs b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/GivensDecomposition.qs
new file mode 100644
index 000000000..66287a4d7
--- /dev/null
+++ b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/GivensDecomposition.qs
@@ -0,0 +1,604 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE.txt in the project root for
+// license information.
+
+import Std.Math.*;
+import Std.Convert.*;
+import Std.Arrays.*;
+import Std.Canon.*;
+import Std.Diagnostics.*;
+import Std.ResourceEstimation.*;
+import QDKChemistry.Utils.SelectSwap.ControlledQroamCleanRotation;
+import QDKChemistry.Utils.SelectSwap.QroamCleanRotation;
+import PhaseGradient.RyViaPhaseGradient;
+
+export ApplyGivensLayer, ApplyRealUnitaryViaGivens, ApplyControlledRealUnitaryViaGivens, ApplyBlockDiagUnitaryViaGivens, IncrementByOne, QuantizeGivensAngles, QuantizeRyAngles, PhaseFlipsAsSelectData, ApplyPhasePolynomial;
+
+/// # Summary
+/// Increments a little-endian register by 1 (mod 2^n).
+///
+/// # Input
+/// ## target
+/// Register in little-endian (LE) format (target[0] = LSB (Least Significant Bit)).
+operation IncrementByOne(target : Qubit[]) : Unit {
+    AddConstant(1, target);
+}
+
+// =============================================================================
+// Classical constant addition (Sanders et al. Fig. 18)
+// =============================================================================
+
+/// # Summary
+/// Adds a classical constant `c` to a little-endian quantum register in place.
+///
+/// # Description
+/// Implements |x⟩ → |x + c (mod 2^n)⟩ using the ripple-carry structure from
+/// Sanders et al. (PRX Quantum 1, 020312, 2020), Fig. 18.
+///
+/// Resource cost:
+///   - n − 2 AND gates  → 4(n − 2) T-gates
+///   - n − 2 IAND gates → 0 T-gates (measurement-based)
+///   - O(n) Clifford gates
+///   - n − 1 ancilla qubits (borrowed, returned to |0⟩)
+///
+/// # Input
+/// ## c
+/// The classical integer to add.
+/// ## target
+/// Register in little-endian format (target[0] = LSB). Length n.
+///
+/// # References
+/// - Sanders, Y.R., et al. "Compilation of Fault-Tolerant Quantum Heuristics
+///   for Combinatorial Optimization." PRX Quantum 1, 020312 (2020).
+operation AddConstant(c : Int, target : Qubit[]) : Unit {
+    let n = Length(target);
+    let cMod = ((c % (1 <<< n)) + (1 <<< n)) % (1 <<< n);
+    let cBits = IntAsBoolArray(cMod, n); // LE: cBits[0] = LSB
+
+    if n == 1 {
+        if cBits[0] {
+            X(target[0]);
+        }
+    } elif n == 2 {
+        // 2-bit: no Toffolis needed. Use Clifford-only circuit.
+        if cBits[1] {
+            if cBits[0] {
+                // +3 = -1 mod 4: decrement
+                X(target[0]);
+                CNOT(target[0], target[1]);
+            } else {
+                // +2: flip MSB
+                X(target[1]);
+            }
+        } else {
+            if cBits[0] {
+                // +1: increment
+                CNOT(target[0], target[1]);
+                X(target[0]);
+            }
+            // +0: identity
+        }
+    } else {
+        use ancillas = Qubit[n - 1];
+
+        // --- Forward pass: compute carry bits ---
+        if cBits[0] {
+            CNOT(target[0], ancillas[0]);
+        }
+
+        for i in 1..n - 2 {
+            let j = i - 1;
+            CNOT(ancillas[j], target[i]);
+            if cBits[i] {
+                X(ancillas[j]);
+            }
+            AND(ancillas[j], target[i], ancillas[i]);
+            if cBits[i] {
+                X(ancillas[j]);
+            }
+            CNOT(ancillas[j], ancillas[i]);
+        }
+
+        // --- MSB: XOR final carry ---
+        CNOT(ancillas[n - 2], target[n - 1]);
+
+        // --- Reverse pass: uncompute ancillas ---
+        for i in n - 2..-1..1 {
+            let j = i - 1;
+            CNOT(ancillas[j], ancillas[i]);
+            if cBits[i] {
+                X(ancillas[j]);
+            }
+            Adjoint AND(ancillas[j], target[i], ancillas[i]);
+            if cBits[i] {
+                X(ancillas[j]);
+            }
+        }
+
+        if cBits[0] {
+            CNOT(target[0], ancillas[0]);
+        }
+
+        // --- Final XOR: add constant bits ---
+        for i in 0..n - 1 {
+            if cBits[i] {
+                X(target[i]);
+            }
+        }
+    }
+}
+
+// =============================================================================
+// Angle quantization helpers (moved from Python preprocessing)
+// =============================================================================
+
+/// # Summary
+/// Quantize Givens rotation angles to Bool[][] format for Select/QROAM
+/// (Quantum Read-Only Access Memory).
+///
+/// # Description
+/// For a Givens rotation with angle θ (where the 2×2 matrix is [[cos θ, -sin θ], [sin θ, cos θ]]):
+/// RyViaPhaseGradient applies Ry(4π·x/2^b). We need Ry(2θ), so x = θ·2^b/(2π).
+///
+/// # Input
+/// ## angles
+/// Double[numAngles]: raw Givens rotation angles in radians.
+/// ## numAddresses
+/// The padded address space dimension (= dim/2 for a dim-qubit register).
+/// ## rotationBits
+/// Phase gradient precision bits.
+///
+/// # Output
+/// Bool[numAddresses][rotationBits]: quantized angle data for Select.
+function QuantizeGivensAngles(angles : Double[], numAddresses : Int, rotationBits : Int) : Bool[][] {
+    let scale = 1 <<< rotationBits;
+    let scaleF = IntAsDouble(scale);
+    mutable data : Bool[][] = [];
+    for k in 0..numAddresses - 1 {
+        let angle = k < Length(angles) ? angles[k] | 0.0;
+        mutable xInt = Round(scaleF * angle / (2.0 * PI()));
+        set xInt = ((xInt % scale) + scale) % scale;
+        set data += [IntAsBoolArray(xInt, rotationBits)];
+    }
+    return data;
+}
+
+/// # Summary
+/// Quantize standard Ry angles to Bool[][] format for Select/QROAM
+/// (Quantum Read-Only Access Memory).
+///
+/// # Description
+/// For a standard Ry(α) rotation: RyViaPhaseGradient applies Ry(4π·x/2^b).
+/// We need Ry(α), so x = α·2^b/(4π).
+///
+/// # Input
+/// ## angles
+/// Double[dim]: standard Ry rotation angles in radians.
+/// ## rotationBits
+/// Phase gradient precision bits.
+///
+/// # Output
+/// Bool[dim][rotationBits]: quantized angle data for Select.
+function QuantizeRyAngles(angles : Double[], rotationBits : Int) : Bool[][] {
+    let scale = 1 <<< rotationBits;
+    let scaleF = IntAsDouble(scale);
+    mutable data : Bool[][] = [];
+    for k in 0..Length(angles) - 1 {
+        mutable xInt = Round(scaleF * angles[k] / (4.0 * PI()));
+        set xInt = ((xInt % scale) + scale) % scale;
+        set data += [IntAsBoolArray(xInt, rotationBits)];
+    }
+    return data;
+}
+
+/// # Summary
+/// Convert a Bool[] phase flip array to Bool[][1] format for Select.
+///
+/// # Input
+/// ## phases
+/// Bool[dim]: true if state |i⟩ needs a Z flip.
+///
+/// # Output
+/// Bool[dim][1]: Select-compatible format.
+function PhaseFlipsAsSelectData(phases : Bool[]) : Bool[][] {
+    mutable data : Bool[][] = [];
+    for p in phases {
+        set data += [[p]];
+    }
+    return data;
+}
+
+// =============================================================================
+// Phase polynomial correction (Reed-Muller decomposition)
+// =============================================================================
+
+/// # Summary
+/// Computes Reed-Muller (Möbius) coefficients for a phase diagonal.
+///
+/// # Description
+/// For an n-qubit diagonal D = diag((-1)^{f(x)}) where f: {0,1}^n → {0,1},
+/// computes the multilinear polynomial representation:
+///   f(x) = ⊕_S c_S · ∏_{i∈S} x_i
+/// where c_S = ⊕_{T⊆S} f(T) (Möbius inversion over GF(2)).
+///
+/// # Input
+/// ## phases
+/// Bool[2^n]: phases[i] = true if state |i⟩ gets (-1) phase.
+///
+/// # Output
+/// Bool[2^n]: coefficients. coeffs[S] = true if monomial ∏_{i∈S} x_i is active.
+/// Index S encodes the subset as a bitmask.
+function ComputePhasePolynomialCoeffs(phases : Bool[]) : Bool[] {
+    let dim = Length(phases);
+    mutable coeffs = phases;
+    // Möbius transform (in-place butterfly)
+    mutable step = 1;
+    while step < dim {
+        for j in 0..dim - 1 {
+            if (j / step) % 2 == 1 {
+                set coeffs w/= j <- coeffs[j] != coeffs[j - step];
+            }
+        }
+        set step = step * 2;
+    }
+    return coeffs;
+}
+
+/// # Summary
+/// Applies a phase correction D = diag(±1) using the Reed-Muller polynomial decomposition.
+///
+/// # Description
+/// Decomposes the diagonal into Z, CZ (Controlled-Z), CCZ (doubly-Controlled-Z),
+/// etc. gates via the Möbius transform.
+/// For n≤2 qubits: all Clifford (0 CCZ). For n=3: at most 1 CCZ. For n=4: at most 5 CCZ.
+/// This is more efficient than the Select-based approach for small registers.
+///
+/// # Input
+/// ## phases
+/// Bool[2^n]: phases[i] = true if |i⟩ gets Z flip.
+/// ## register
+/// Qubit[n]: the register in LE (little-endian) order
+/// (register[0] = LSB (Least Significant Bit)).
+operation ApplyPhasePolynomial(phases : Bool[], register : Qubit[]) : Unit {
+    let n = Length(register);
+    let dim = Length(phases);
+    let coeffs = ComputePhasePolynomialCoeffs(phases);
+
+    // Apply multi-controlled Z for each nonzero coefficient of degree ≥ 1
+    for s in 1..dim - 1 {
+        if coeffs[s] {
+            // Extract qubit indices where s has bit 1
+            mutable qubits : Qubit[] = [];
+            for bit in 0..n - 1 {
+                if (s >>> bit) &&& 1 == 1 {
+                    set qubits += [register[bit]];
+                }
+            }
+            // Apply multi-controlled Z: degree 1 = Z, degree 2 = CZ (Controlled-Z),
+            // degree 3 = CCZ (doubly-Controlled-Z), etc.
+            if Length(qubits) == 1 {
+                Z(qubits[0]);
+            } elif Length(qubits) == 2 {
+                Controlled Z([qubits[0]], qubits[1]);
+            } else {
+                // degree ≥ 3: use Controlled Z
+                Controlled Z(qubits[0..Length(qubits) - 2], qubits[Length(qubits) - 1]);
+            }
+        }
+    }
+}
+
+// =============================================================================
+// Single Givens rotation layer
+// =============================================================================
+
+/// # Summary
+/// Gate-based unitary synthesis via Givens rotation layers.
+///
+/// # Description
+/// Applies arbitrary unitaries using QROAM (Quantum Read-Only Access Memory)
+/// loaded angles + phase gradient rotations.
+/// Each unitary is pre-decomposed (classically) into layers of 2×2 Givens rotations.
+/// Each layer is applied as:
+///   1. QROAM loads the quantized angle for the current address state
+///   2. Ry via phase gradient applies the rotation to the active qubit
+///   3. Adjoint QROAM uncomputes the angle register
+///
+/// # References
+/// - Berry et al. (PRX Quantum 6, 020327): https://doi.org/10.1103/PRXQuantum.6.020327
+/// - Clements et al. (arXiv:1603.08788): https://arxiv.org/abs/1603.08788
+///
+/// # Input
+/// ## angleData
+/// Bool[numAngles][rotationBits]: pre-quantized rotation angles. angleData[k] is the angle
+/// for the k-th pair, encoded as a rotationBits-bit integer x such that θ = 4π·x/2^rotationBits.
+/// ## isShifted
+/// If true, applies the shifted version (Berry eq. 24).
+/// ## target
+/// Target register in MSB (Most Significant Bit)-first format
+/// (target[0] = MSB, target[n-1] = LSB (Least Significant Bit)).
+/// State value = target[0]*2^(n-1) + ... + target[n-1]*2^0.
+/// ## phaseGradient
+/// Phase gradient ancilla register.
+operation ApplyGivensLayer(
+    angleData : Bool[][],
+    isShifted : Bool,
+    target : Qubit[],
+    phaseGradient : Qubit[],
+    angleReg : Qubit[]
+) : Unit {
+    let n = Length(target);
+
+    // Reversed(target) gives LE (little-endian) view: index 0 = LSB of state value
+    if isShifted {
+        AddConstant(-1, Reversed(target));
+    }
+
+    // Active qubit = LSB of state = target[n-1].
+    // Address = higher bits = target[0..n-2], reversed for Select (LSB-first).
+    let activeQubit = target[n - 1];
+    let address = target[0..n - 2];
+
+    // QROAM-clean: forward-only Select+Swap + Ry + Unlookup for half the cost
+    QroamCleanRotation(angleData, Reversed(address), activeQubit, phaseGradient);
+
+    if isShifted {
+        AddConstant(1, Reversed(target));
+    }
+}
+
+/// # Summary
+/// Same as ApplyGivensLayer but controlled on an additional qubit.
+///
+/// # Description
+/// When control = |1⟩, applies the Givens rotation layer.
+/// When control = |0⟩, does nothing (identity).
+///
+/// Uses QROAMClean pattern (Berry et al. arXiv:1902.02134):
+///   1. Controlled Select loads N data entries (NOT 2N extended data).
+///      Cost: N-1 AND gates (vs 2N-2 for data-doubling approach).
+///   2. Ry rotation via phase gradient.
+///   3. Measurement-based uncomputation (Unlookup) on 2N extended data
+///      to correctly handle the ctrl=0 branch.
+///
+/// The IncrementByOne shift is applied UNCONDITIONALLY (not controlled).
+/// This is correct because when ctrl=0, Select loads nothing, so
+/// Ry(0)=I makes the shift/unshift pair cancel out.
+///
+/// # Input
+/// ## angleData
+/// Bool[numAngles][rotationBits]: pre-quantized rotation angles.
+/// ## isShifted
+/// If true, applies the shifted version.
+/// ## target
+/// Target register.
+/// ## phaseGradient
+/// Phase gradient register.
+/// ## control
+/// Control qubit.
+operation ApplyControlledGivensLayer(
+    angleData : Bool[][],
+    isShifted : Bool,
+    target : Qubit[],
+    phaseGradient : Qubit[],
+    control : Qubit,
+    angleReg : Qubit[]
+) : Unit {
+    let n = Length(target);
+    let numAngles = Length(angleData);
+    let rotationBits = Length(angleReg);
+
+    // Shifts are UNCONDITIONAL.
+    // When ctrl=0, Select loads nothing so Ry(0)=I, making shift/unshift cancel.
+    if isShifted {
+        AddConstant(-1, Reversed(target));
+    }
+
+    let activeQubit = target[n - 1];
+    let address = target[0..n - 2];
+
+    // QROAMClean with SelectSwap blocking: uses forward-only Controlled Select
+    // with blocking + measurement-based Unlookup for reduced T-count.
+    ControlledQroamCleanRotation(angleData, Reversed(address), control, activeQubit, phaseGradient);
+
+    if isShifted {
+        AddConstant(1, Reversed(target));
+    }
+}
+
+// =============================================================================
+// Full unitary via Givens decomposition
+// =============================================================================
+
+/// # Summary
+/// Applies a real unitary matrix via its Givens rotation decomposition.
+///
+/// # Description
+/// A real unitary U is decomposed as: U = D · R_{k-1} · ... · R_1 · R_0
+/// where each R_i is a Givens rotation layer and D = diag(±1) is a phase correction.
+///
+/// The layers are applied in order (R_0 first), followed by the phase correction.
+/// The phase correction is implemented by loading sign bits via QROAM and applying Z.
+///
+/// # Input
+/// ## layerAngleData
+/// Bool[numLayers][numAngles][rotationBits]: angle data for each Givens layer.
+/// ## layerIsShifted
+/// Bool[numLayers]: whether each layer is shifted.
+/// ## phaseFlipData
+/// Bool[dim][1]: phase correction. phaseFlipData[i] = [true] if state |i⟩ gets Z.
+/// Empty array means no phase correction needed.
+/// ## target
+/// Target register.
+/// ## phaseGradient
+/// Phase gradient register.
+operation ApplyRealUnitaryViaGivens(
+    layerAngleData : Bool[][][],
+    layerIsShifted : Bool[],
+    phaseFlipData : Bool[][],
+    target : Qubit[],
+    phaseGradient : Qubit[],
+    angleReg : Qubit[]
+) : Unit {
+    let numLayers = Length(layerAngleData);
+
+    // Apply Givens layers in order
+    // All layers have the same circuit structure (same register sizes), only data differs.
+    // Cache by shifted/non-shifted variant to avoid re-tracing ~1000 identical layers.
+    for i in 0..numLayers - 1 {
+        let variant = Length(target) * 2 + (if layerIsShifted[i] { 1 } else { 0 });
+        if BeginEstimateCaching("GivensLayer", variant) {
+            ApplyGivensLayer(layerAngleData[i], layerIsShifted[i], target, phaseGradient, angleReg);
+            EndEstimateCaching();
+        }
+    }
+
+    // Phase correction: D = diag(±1) via Reed-Muller polynomial
+    if Length(phaseFlipData) > 0 {
+        mutable hasFlips = false;
+        for row in phaseFlipData {
+            if Length(row) > 0 and row[0] {
+                set hasFlips = true;
+            }
+        }
+        if hasFlips {
+            mutable phases : Bool[] = [];
+            for row in phaseFlipData {
+                set phases += [Length(row) > 0 and row[0]];
+            }
+            ApplyPhasePolynomial(phases, Reversed(target));
+        }
+    }
+}
+
+/// # Summary
+/// Applies a controlled real unitary via Givens decomposition.
+///
+/// # Description
+/// When control = |1⟩, applies the unitary. When control = |0⟩, identity.
+/// Each Givens layer is controlled, as is the phase correction.
+///
+/// # Input
+/// ## layerAngleData
+/// Bool[numLayers][numAngles][rotationBits]: angle data for each Givens layer.
+/// ## layerIsShifted
+/// Bool[numLayers]: whether each layer is shifted.
+/// ## phaseFlipData
+/// Bool[dim][1]: phase correction data. Empty array means no correction.
+/// ## target
+/// Target register.
+/// ## phaseGradient
+/// Phase gradient register.
+/// ## control
+/// Control qubit.
+operation ApplyControlledRealUnitaryViaGivens(
+    layerAngleData : Bool[][][],
+    layerIsShifted : Bool[],
+    phaseFlipData : Bool[][],
+    target : Qubit[],
+    phaseGradient : Qubit[],
+    control : Qubit,
+    angleReg : Qubit[]
+) : Unit {
+    let numLayers = Length(layerAngleData);
+
+    for i in 0..numLayers - 1 {
+        let variant = Length(target) * 2 + (if layerIsShifted[i] { 1 } else { 0 });
+        if BeginEstimateCaching("ControlledGivensLayer", variant) {
+            ApplyControlledGivensLayer(
+                layerAngleData[i],
+                layerIsShifted[i],
+                target,
+                phaseGradient,
+                control,
+                angleReg
+            );
+            EndEstimateCaching();
+        }
+    }
+
+    // Controlled phase correction: when ctrl=1, apply D on target.
+    // This is a diagonal on (ctrl, target) register = [control] + Reversed(target).
+    // Extended phases: ctrl=0 → no flip, ctrl=1 → phaseFlipData.
+    if Length(phaseFlipData) > 0 {
+        mutable hasFlips = false;
+        for row in phaseFlipData {
+            if Length(row) > 0 and row[0] {
+                set hasFlips = true;
+            }
+        }
+        if hasFlips {
+            // Build extended phase array: [zeros for ctrl=0] ++ [phases for ctrl=1]
+            let dim = Length(phaseFlipData);
+            mutable extendedPhases : Bool[] = Repeated(false, dim);
+            for row in phaseFlipData {
+                set extendedPhases += [Length(row) > 0 and row[0]];
+            }
+            // Register order: Reversed(target) = LE of target state; control = MSB of extended register
+            ApplyPhasePolynomial(extendedPhases, Reversed(target) + [control]);
+        }
+    }
+}
+
+/// # Summary
+/// Applies a block-diagonal unitary via Givens decomposition.
+///
+/// # Description
+/// Applies block_diag(U_0, U_1, ..., U_{2^s-1}) where s = Length(subspace).
+/// The block-diagonal is decomposed jointly into Givens layers that act on the
+/// combined (subspace ⊕ target) register.
+///
+/// # Input
+/// ## layerAngleData
+/// Bool[numLayers][numAngles][rotationBits]: angle data for the joint Givens decomposition.
+/// The address space covers both subspace and target upper bits.
+/// ## layerIsShifted
+/// Bool[numLayers]: whether each layer is shifted.
+/// ## phaseFlipData
+/// Bool[totalDim][1]: phase correction for the full (subspace ⊕ target) space.
+/// ## target
+/// Target register (ancilla qubits).
+/// ## subspace
+/// Subspace selection register (site qubits).
+/// ## phaseGradient
+/// Phase gradient register.
+operation ApplyBlockDiagUnitaryViaGivens(
+    layerAngleData : Bool[][][],
+    layerIsShifted : Bool[],
+    phaseFlipData : Bool[][],
+    target : Qubit[],
+    subspace : Qubit[],
+    phaseGradient : Qubit[],
+    angleReg : Qubit[]
+) : Unit {
+    // The joint register: subspace (MSB) ++ target forms the full state space.
+    // This matches ApplyUnitary(M, Reversed(target + subspace)) convention when
+    // the caller passes Reversed(ancilla) as target and [q1,q0] as subspace.
+    let jointReg = subspace + target;
+    let numLayers = Length(layerAngleData);
+
+    for i in 0..numLayers - 1 {
+        let variant = Length(jointReg) * 2 + (if layerIsShifted[i] { 1 } else { 0 });
+        if BeginEstimateCaching("BlockDiagGivensLayer", variant) {
+            ApplyGivensLayer(layerAngleData[i], layerIsShifted[i], jointReg, phaseGradient, angleReg);
+            EndEstimateCaching();
+        }
+    }
+
+    // Phase correction on the joint register via Reed-Muller polynomial
+    if Length(phaseFlipData) > 0 {
+        mutable hasFlips = false;
+        for row in phaseFlipData {
+            if Length(row) > 0 and row[0] {
+                set hasFlips = true;
+            }
+        }
+        if hasFlips {
+            mutable phases : Bool[] = [];
+            for row in phaseFlipData {
+                set phases += [Length(row) > 0 and row[0]];
+            }
+            ApplyPhasePolynomial(phases, Reversed(jointReg));
+        }
+    }
+}
diff --git a/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/MPSSequential.qs b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/MPSSequential.qs
new file mode 100644
index 000000000..7b78e823e
--- /dev/null
+++ b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/MPSSequential.qs
@@ -0,0 +1,573 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+
+import Std.Math.*;
+import Std.Convert.*;
+import Std.Arrays.*;
+import Std.Canon.*;
+import Std.Diagnostics.*;
+import Std.ResourceEstimation.*;
+import QDKChemistry.Utils.SelectSwap.ControlledQroamCleanRotation;
+import QDKChemistry.Utils.SelectSwap.QroamCleanRotation;
+import PhaseGradient.RyViaPhaseGradient;
+import PhaseGradient.PreparePhaseGradientState;
+import QroamStatePrep.QroamStatePrep;
+import GivensDecomposition.*;
+
+export MPSSequential, MakeMPSSequentialCircuit, MakeMPSSequentialCircuitGrouped, MakeMPSSequentialOp, ApplyMPSSequential, MPSSequentialParams, MakeMPSSequentialOpGrouped, ApplyMPSSequentialGrouped, MPSSequentialGroupedParams, SiteUnitary;
+
+// =============================================================================
+// CSD decomposition (Givens + QROAM + phase gradient)
+// =============================================================================
+
+/// # Summary
+/// Applies a single site unitary using Givens decomposition.
+///
+/// # Description
+/// Uses QROAM + phase gradient rotations. Each matrix (V, W₀, W₁, U) is pre-decomposed
+/// into Givens rotation layers. Angle quantization is done internally in Q#.
+///
+/// The 9-step structure is preserved:
+///   1. V on ancilla (via Givens layers)
+///   2. UCR Ry on q0 (via QROAM + phase gradient)
+///   3. CNOT(q1, q0)
+///   4. W₀ on ancilla, controlled by q0 (via controlled Givens layers)
+///   5. Controlled UCR on q1, ctrl by q0 (via controlled QROAM + phase gradient)
+///   6. CNOT(q1, q0)
+///   7. W₁ on ancilla, controlled by q1 (via controlled Givens layers)
+///   8. Controlled UCR on q0, ctrl by q1 (via controlled QROAM + phase gradient)
+///   9. Multiplexed U on ancilla+site (via block-diagonal Givens layers)
+///
+/// # Input
+/// ## vLayerAngles
+/// Double[numLayers][numAngles]: raw Givens rotation angles for V (radians).
+/// ## vLayerShifted
+/// Bool[numLayers]: whether each V layer is shifted.
+/// ## vPhases
+/// Bool[dim]: V phase correction (true = Z flip on that basis state).
+/// ## rot0Angles
+/// Double[dim]: UCR Ry angles for step 2 (standard Ry convention).
+/// ## rot1Angles
+/// Double[dim]: controlled UCR Ry angles for step 5.
+/// ## rot2Angles
+/// Double[dim]: controlled UCR Ry angles for step 8.
+/// ## w0LayerAngles
+/// Double[numLayers][numAngles]: raw Givens angles for W₀.
+/// ## w0LayerShifted
+/// Bool[numLayers]: whether each W₀ layer is shifted.
+/// ## w0Phases
+/// Bool[dim]: W₀ phase correction.
+/// ## w1LayerAngles
+/// Double[numLayers][numAngles]: raw Givens angles for W₁.
+/// ## w1LayerShifted
+/// Bool[numLayers]: whether each W₁ layer is shifted.
+/// ## w1Phases
+/// Bool[dim]: W₁ phase correction.
+/// ## uLayerAngles
+/// Double[numLayers][numAngles]: raw Givens angles for U (block-diagonal).
+/// ## uLayerShifted
+/// Bool[numLayers]: whether each U layer is shifted.
+/// ## uPhases
+/// Bool[4*dim]: U phase correction.
+/// ## newSite
+/// The 2-qubit site register [q0, q1].
+/// ## ancilla
+/// The ancilla register.
+/// ## phaseGradient
+/// Phase gradient register (pre-initialized).
+operation SiteUnitary(
+    vLayerAngles : Double[][],
+    vLayerShifted : Bool[],
+    vPhases : Bool[],
+    rot0Angles : Double[],
+    rot1Angles : Double[],
+    rot2Angles : Double[],
+    w0LayerAngles : Double[][],
+    w0LayerShifted : Bool[],
+    w0Phases : Bool[],
+    w1LayerAngles : Double[][],
+    w1LayerShifted : Bool[],
+    w1Phases : Bool[],
+    uLayerAngles : Double[][],
+    uLayerShifted : Bool[],
+    uPhases : Bool[],
+    newSite : Qubit[],
+    ancilla : Qubit[],
+    phaseGradient : Qubit[],
+    angleReg : Qubit[]
+) : Unit {
+    let q0 = newSite[0];
+    let q1 = newSite[1];
+    let rotationBits = Length(phaseGradient);
+    let ancillaDim = 1 <<< Length(ancilla);
+    let numAddresses = ancillaDim / 2;
+
+    // Quantize angles (fast classical computation done in Q#)
+    let vData = Mapped(layer -> QuantizeGivensAngles(layer, numAddresses, rotationBits), vLayerAngles);
+    let vPhaseData = PhaseFlipsAsSelectData(vPhases);
+    let rot0Data = QuantizeRyAngles(rot0Angles, rotationBits);
+    let rot1Data = QuantizeRyAngles(rot1Angles, rotationBits);
+    let rot2Data = QuantizeRyAngles(rot2Angles, rotationBits);
+    let w0Data = Mapped(layer -> QuantizeGivensAngles(layer, numAddresses, rotationBits), w0LayerAngles);
+    let w0PhaseData = PhaseFlipsAsSelectData(w0Phases);
+    let w1Data = Mapped(layer -> QuantizeGivensAngles(layer, numAddresses, rotationBits), w1LayerAngles);
+    let w1PhaseData = PhaseFlipsAsSelectData(w1Phases);
+    // U address space is (4*dim)/2 = 2*dim
+    let uNumAddresses = 2 * ancillaDim;
+    let uData = Mapped(layer -> QuantizeGivensAngles(layer, uNumAddresses, rotationBits), uLayerAngles);
+    let uPhaseData = PhaseFlipsAsSelectData(uPhases);
+
+    // Single shared angle register for all UCR steps (reused sequentially)
+    // (angleReg is now passed in from the caller)
+
+    // Step 1: V on ancilla (gate-based via Givens)
+    // Reversed(ancilla) maps ancilla register to target with LSB=ancilla[0] as active qubit
+    ApplyRealUnitaryViaGivens(vData, vLayerShifted, vPhaseData, Reversed(ancilla), phaseGradient, angleReg);
+
+    // Step 2: UCR Ry on q0, addressed by ancilla
+    // QROAMClean: forward-only Select+Swap + Ry + Unlookup
+    QroamCleanRotation(rot0Data, ancilla, q0, phaseGradient);
+
+    // Step 3: CNOT(q1, q0)
+    CNOT(q1, q0);
+
+    // Step 4: W₀ on ancilla, controlled by q0
+    ApplyControlledRealUnitaryViaGivens(
+        w0Data,
+        w0LayerShifted,
+        w0PhaseData,
+        Reversed(ancilla),
+        phaseGradient,
+        q0,
+        angleReg
+    );
+
+    // Step 5: Controlled UCR on q1, ctrl by q0
+    // QROAMClean with SelectSwap blocking for reduced T-count.
+    ControlledQroamCleanRotation(rot1Data, ancilla, q0, q1, phaseGradient);
+
+    // Step 6: CNOT(q1, q0) — undo
+    CNOT(q1, q0);
+
+    // Step 7: W₁ on ancilla, controlled by q1
+    ApplyControlledRealUnitaryViaGivens(
+        w1Data,
+        w1LayerShifted,
+        w1PhaseData,
+        Reversed(ancilla),
+        phaseGradient,
+        q1,
+        angleReg
+    );
+
+    // Step 8: Controlled UCR on q0, ctrl by q1
+    // QROAMClean with SelectSwap blocking for reduced T-count.
+    ControlledQroamCleanRotation(rot2Data, ancilla, q1, q0, phaseGradient);
+
+    // Step 9: Multiplexed U on ancilla, selected by (q0, q1)
+    // Joint register = [q1, q0] + Reversed(ancilla) so block index = q1*2+q0 (MSB)
+    ApplyBlockDiagUnitaryViaGivens(
+        uData,
+        uLayerShifted,
+        uPhaseData,
+        Reversed(ancilla),
+        [q1, q0],
+        phaseGradient,
+        angleReg
+    );
+}
+
+/// # Summary
+/// MPS state preparation via CSD decomposition (Appendix B).
+///
+/// # Description
+/// Implements the site unitary decomposition from Berry et al.
+/// (PRX Quantum 6, 020327, https://doi.org/10.1103/PRXQuantum.6.020327,
+/// Figure 5) using structured quantum gates:
+///   V → UCR(rot2) → W₁ → UCR(rot1) → W₀ → UCR(rot0) → U
+///
+/// # Description
+/// Calls SiteUnitary for each site. All matrices are pre-decomposed
+/// into Givens rotation layers. Angle quantization is handled internally by Q#.
+/// Requires a phase gradient register which is initialized and freed by this operation.
+///
+// Based on code originally published by Felix Rupprecht (DLR) on Zenodo:
+///   https://zenodo.org/records/20393500
+/// Rewritten and adapted for integration into the QDK Chemistry library.
+///
+/// # Input
+/// ## initialStateVec
+/// Real amplitudes of the initial state.
+/// ## numSites
+/// Number of MPS sites.
+/// ## rotationBits
+/// Phase gradient precision (number of bits).
+/// ## siteVLayerAngles, siteVLayerShifted, siteVPhases
+/// Per-site V Givens decomposition: Double[numSites-1][numLayers][numAngles],
+/// Bool[numSites-1][numLayers], Bool[numSites-1][dim].
+/// ## siteRot0Angles, siteRot1Angles, siteRot2Angles
+/// Per-site UCR angles: Double[numSites-1][dim].
+/// ## siteW0LayerAngles, siteW0LayerShifted, siteW0Phases
+/// Per-site W₀ Givens decomposition.
+/// ## siteW1LayerAngles, siteW1LayerShifted, siteW1Phases
+/// Per-site W₁ Givens decomposition.
+/// ## siteULayerAngles, siteULayerShifted, siteUPhases
+/// Per-site U Givens decomposition.
+/// ## state
+/// State register.
+/// ## ancilla
+/// Ancilla register.
+operation MPSSequential(
+    initialStateVec : Double[],
+    numSites : Int,
+    rotationBits : Int,
+    siteVLayerAngles : Double[][][],
+    siteVLayerShifted : Bool[][],
+    siteVPhases : Bool[][],
+    siteRot0Angles : Double[][],
+    siteRot1Angles : Double[][],
+    siteRot2Angles : Double[][],
+    siteW0LayerAngles : Double[][][],
+    siteW0LayerShifted : Bool[][],
+    siteW0Phases : Bool[][],
+    siteW1LayerAngles : Double[][][],
+    siteW1LayerShifted : Bool[][],
+    siteW1Phases : Bool[][],
+    siteULayerAngles : Double[][][],
+    siteULayerShifted : Bool[][],
+    siteUPhases : Bool[][],
+    state : Qubit[],
+    ancilla : Qubit[]
+) : Unit {
+    // Initialize phase gradient register
+    use phaseGradient = Qubit[rotationBits];
+    PreparePhaseGradientState(phaseGradient);
+
+    // Single shared angle register for QROM-loaded angles (reused by all operations)
+    use angleReg = Qubit[rotationBits];
+
+    // Prepare initial state
+    let initReg = ancilla + state[0..1];
+    QroamStatePrep(initialStateVec, Reversed(initReg), phaseGradient, angleReg);
+
+    // Apply site unitaries
+    // All sites share the same circuit structure (same bond dimension / qubit counts),
+    // so we cache the resource estimate from the first site and reuse it for the rest.
+    for siteIdx in 0..numSites - 2 {
+        let newSite = state[2 * (siteIdx + 1)..2 * (siteIdx + 1) + 1];
+        if BeginEstimateCaching("SiteUnitary", SingleVariant()) {
+            SiteUnitary(
+                siteVLayerAngles[siteIdx],
+                siteVLayerShifted[siteIdx],
+                siteVPhases[siteIdx],
+                siteRot0Angles[siteIdx],
+                siteRot1Angles[siteIdx],
+                siteRot2Angles[siteIdx],
+                siteW0LayerAngles[siteIdx],
+                siteW0LayerShifted[siteIdx],
+                siteW0Phases[siteIdx],
+                siteW1LayerAngles[siteIdx],
+                siteW1LayerShifted[siteIdx],
+                siteW1Phases[siteIdx],
+                siteULayerAngles[siteIdx],
+                siteULayerShifted[siteIdx],
+                siteUPhases[siteIdx],
+                newSite,
+                ancilla,
+                phaseGradient,
+                angleReg
+            );
+            EndEstimateCaching();
+        }
+    }
+
+    // Undo phase gradient state
+    Adjoint PreparePhaseGradientState(phaseGradient);
+}
+
+/// Circuit wrapper for resource estimation — allocates qubits internally.
+operation MakeMPSSequentialCircuit(
+    initialStateVec : Double[],
+    numSites : Int,
+    rotationBits : Int,
+    numAncillaQubits : Int,
+    siteVLayerAngles : Double[][][],
+    siteVLayerShifted : Bool[][],
+    siteVPhases : Bool[][],
+    siteRot0Angles : Double[][],
+    siteRot1Angles : Double[][],
+    siteRot2Angles : Double[][],
+    siteW0LayerAngles : Double[][][],
+    siteW0LayerShifted : Bool[][],
+    siteW0Phases : Bool[][],
+    siteW1LayerAngles : Double[][][],
+    siteW1LayerShifted : Bool[][],
+    siteW1Phases : Bool[][],
+    siteULayerAngles : Double[][][],
+    siteULayerShifted : Bool[][],
+    siteUPhases : Bool[][]
+) : Unit {
+    use state = Qubit[2 * numSites];
+    use ancilla = Qubit[numAncillaQubits];
+    MPSSequential(
+        initialStateVec,
+        numSites,
+        rotationBits,
+        siteVLayerAngles,
+        siteVLayerShifted,
+        siteVPhases,
+        siteRot0Angles,
+        siteRot1Angles,
+        siteRot2Angles,
+        siteW0LayerAngles,
+        siteW0LayerShifted,
+        siteW0Phases,
+        siteW1LayerAngles,
+        siteW1LayerShifted,
+        siteW1Phases,
+        siteULayerAngles,
+        siteULayerShifted,
+        siteUPhases,
+        state,
+        ancilla
+    );
+}
+
+/// Circuit wrapper for accurate fast resource estimation with non-uniform bond dimensions.
+///
+/// Groups sites by their effective ancilla dimension (shape). For each unique shape,
+/// one representative set of angle data is provided. The resource estimator evaluates
+/// each shape once and caches the cost, then replicates it for all sites of that shape.
+///
+/// # Input
+/// ## siteShapeIndices
+/// Int[numSites-1]: for each site, the index into the shape arrays.
+/// ## shapeEffectiveBits
+/// Int[numShapes]: effective ancilla bits for each shape group.
+/// ## shape*
+/// Per-shape representative decomposition data (indexed by shape, not site).
+operation MakeMPSSequentialCircuitGrouped(
+    initialStateVec : Double[],
+    numSites : Int,
+    rotationBits : Int,
+    numAncillaQubits : Int,
+    siteShapeIndices : Int[],
+    shapeEffectiveBits : Int[],
+    shapeVLayerAngles : Double[][][],
+    shapeVLayerShifted : Bool[][],
+    shapeVPhases : Bool[][],
+    shapeRot0Angles : Double[][],
+    shapeRot1Angles : Double[][],
+    shapeRot2Angles : Double[][],
+    shapeW0LayerAngles : Double[][][],
+    shapeW0LayerShifted : Bool[][],
+    shapeW0Phases : Bool[][],
+    shapeW1LayerAngles : Double[][][],
+    shapeW1LayerShifted : Bool[][],
+    shapeW1Phases : Bool[][],
+    shapeULayerAngles : Double[][][],
+    shapeULayerShifted : Bool[][],
+    shapeUPhases : Bool[][]
+) : Unit {
+    use state = Qubit[2 * numSites];
+    use ancilla = Qubit[numAncillaQubits];
+
+    // Initialize phase gradient register
+    use phaseGradient = Qubit[rotationBits];
+    PreparePhaseGradientState(phaseGradient);
+
+    // Single shared angle register for QROM-loaded angles (reused by all operations)
+    use angleReg = Qubit[rotationBits];
+
+    // Prepare initial state (uses full ancilla)
+    let initReg = ancilla + state[0..1];
+    QroamStatePrep(initialStateVec, Reversed(initReg), phaseGradient, angleReg);
+
+    // Apply site unitaries — each site uses data for its shape group.
+    // BeginEstimateCaching with the shape index ensures accurate per-shape costing:
+    // sites with the same effective dimension share a cached cost.
+    for siteIdx in 0..numSites - 2 {
+        let newSite = state[2 * (siteIdx + 1)..2 * (siteIdx + 1) + 1];
+        let shapeIdx = siteShapeIndices[siteIdx];
+        let effectiveBits = shapeEffectiveBits[shapeIdx];
+        if BeginEstimateCaching("SiteUnitary", shapeIdx) {
+            SiteUnitary(
+                shapeVLayerAngles[shapeIdx],
+                shapeVLayerShifted[shapeIdx],
+                shapeVPhases[shapeIdx],
+                shapeRot0Angles[shapeIdx],
+                shapeRot1Angles[shapeIdx],
+                shapeRot2Angles[shapeIdx],
+                shapeW0LayerAngles[shapeIdx],
+                shapeW0LayerShifted[shapeIdx],
+                shapeW0Phases[shapeIdx],
+                shapeW1LayerAngles[shapeIdx],
+                shapeW1LayerShifted[shapeIdx],
+                shapeW1Phases[shapeIdx],
+                shapeULayerAngles[shapeIdx],
+                shapeULayerShifted[shapeIdx],
+                shapeUPhases[shapeIdx],
+                newSite,
+                ancilla[0..effectiveBits - 1],
+                phaseGradient,
+                angleReg
+            );
+            EndEstimateCaching();
+        }
+    }
+
+    // Undo phase gradient state
+    Adjoint PreparePhaseGradientState(phaseGradient);
+}
+
+// =============================================================================
+// Composable Op wrapper (allocates ancilla internally)
+// =============================================================================
+
+/// Parameters struct for MPS sequential state preparation.
+struct MPSSequentialParams {
+    initialStateVec : Double[],
+    numSites : Int,
+    rotationBits : Int,
+    numAncillaQubits : Int,
+    siteVLayerAngles : Double[][][],
+    siteVLayerShifted : Bool[][],
+    siteVPhases : Bool[][],
+    siteRot0Angles : Double[][],
+    siteRot1Angles : Double[][],
+    siteRot2Angles : Double[][],
+    siteW0LayerAngles : Double[][][],
+    siteW0LayerShifted : Bool[][],
+    siteW0Phases : Bool[][],
+    siteW1LayerAngles : Double[][][],
+    siteW1LayerShifted : Bool[][],
+    siteW1Phases : Bool[][],
+    siteULayerAngles : Double[][][],
+    siteULayerShifted : Bool[][],
+    siteUPhases : Bool[][],
+}
+
+/// Applies MPS sequential state preparation on the system qubit array.
+/// Ancilla qubits are allocated internally (they start and end in |0⟩).
+operation ApplyMPSSequential(
+    params : MPSSequentialParams,
+    qubits : Qubit[]
+) : Unit {
+    use ancilla = Qubit[params.numAncillaQubits];
+    MPSSequential(
+        params.initialStateVec,
+        params.numSites,
+        params.rotationBits,
+        params.siteVLayerAngles,
+        params.siteVLayerShifted,
+        params.siteVPhases,
+        params.siteRot0Angles,
+        params.siteRot1Angles,
+        params.siteRot2Angles,
+        params.siteW0LayerAngles,
+        params.siteW0LayerShifted,
+        params.siteW0Phases,
+        params.siteW1LayerAngles,
+        params.siteW1LayerShifted,
+        params.siteW1Phases,
+        params.siteULayerAngles,
+        params.siteULayerShifted,
+        params.siteUPhases,
+        qubits,
+        ancilla
+    );
+}
+
+/// Returns a Qubit[] => Unit callable for MPS sequential state preparation.
+function MakeMPSSequentialOp(params : MPSSequentialParams) : Qubit[] => Unit {
+    ApplyMPSSequential(params, _)
+}
+
+// =============================================================================
+// Grouped composable Op wrapper (per-shape caching for fast resource estimation)
+// =============================================================================
+
+/// Parameters struct for grouped MPS sequential state preparation.
+struct MPSSequentialGroupedParams {
+    initialStateVec : Double[],
+    numSites : Int,
+    rotationBits : Int,
+    numAncillaQubits : Int,
+    siteShapeIndices : Int[],
+    shapeEffectiveBits : Int[],
+    shapeVLayerAngles : Double[][][],
+    shapeVLayerShifted : Bool[][],
+    shapeVPhases : Bool[][],
+    shapeRot0Angles : Double[][],
+    shapeRot1Angles : Double[][],
+    shapeRot2Angles : Double[][],
+    shapeW0LayerAngles : Double[][][],
+    shapeW0LayerShifted : Bool[][],
+    shapeW0Phases : Bool[][],
+    shapeW1LayerAngles : Double[][][],
+    shapeW1LayerShifted : Bool[][],
+    shapeW1Phases : Bool[][],
+    shapeULayerAngles : Double[][][],
+    shapeULayerShifted : Bool[][],
+    shapeUPhases : Bool[][],
+}
+
+/// Applies grouped MPS sequential state preparation on the system qubit array.
+/// Ancilla qubits are allocated internally (they start and end in |0⟩).
+/// Uses per-shape estimate caching for accurate fast resource estimation.
+operation ApplyMPSSequentialGrouped(
+    params : MPSSequentialGroupedParams,
+    qubits : Qubit[]
+) : Unit {
+    use ancilla = Qubit[params.numAncillaQubits];
+
+    // Initialize phase gradient register
+    use phaseGradient = Qubit[params.rotationBits];
+    PreparePhaseGradientState(phaseGradient);
+
+    // Single shared angle register for QROM-loaded angles
+    use angleReg = Qubit[params.rotationBits];
+
+    // Prepare initial state (uses full ancilla)
+    let initReg = ancilla + qubits[0..1];
+    QroamStatePrep(params.initialStateVec, Reversed(initReg), phaseGradient, angleReg);
+
+    // Apply site unitaries with per-shape caching
+    for siteIdx in 0..params.numSites - 2 {
+        let newSite = qubits[2 * (siteIdx + 1)..2 * (siteIdx + 1) + 1];
+        let shapeIdx = params.siteShapeIndices[siteIdx];
+        let effectiveBits = params.shapeEffectiveBits[shapeIdx];
+        if BeginEstimateCaching("SiteUnitary", shapeIdx) {
+            SiteUnitary(
+                params.shapeVLayerAngles[shapeIdx],
+                params.shapeVLayerShifted[shapeIdx],
+                params.shapeVPhases[shapeIdx],
+                params.shapeRot0Angles[shapeIdx],
+                params.shapeRot1Angles[shapeIdx],
+                params.shapeRot2Angles[shapeIdx],
+                params.shapeW0LayerAngles[shapeIdx],
+                params.shapeW0LayerShifted[shapeIdx],
+                params.shapeW0Phases[shapeIdx],
+                params.shapeW1LayerAngles[shapeIdx],
+                params.shapeW1LayerShifted[shapeIdx],
+                params.shapeW1Phases[shapeIdx],
+                params.shapeULayerAngles[shapeIdx],
+                params.shapeULayerShifted[shapeIdx],
+                params.shapeUPhases[shapeIdx],
+                newSite,
+                ancilla[0..effectiveBits - 1],
+                phaseGradient,
+                angleReg
+            );
+            EndEstimateCaching();
+        }
+    }
+
+    // Undo phase gradient state
+    Adjoint PreparePhaseGradientState(phaseGradient);
+}
+
+/// Returns a Qubit[] => Unit callable for grouped MPS sequential state preparation.
+function MakeMPSSequentialOpGrouped(params : MPSSequentialGroupedParams) : Qubit[] => Unit {
+    ApplyMPSSequentialGrouped(params, _)
+}
diff --git a/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/MPSSparse.qs b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/MPSSparse.qs
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/PhaseGradient.qs b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/PhaseGradient.qs
new file mode 100644
index 000000000..f932212e8
--- /dev/null
+++ b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/PhaseGradient.qs
@@ -0,0 +1,96 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+
+import Std.Math.*;
+import Std.Convert.*;
+import Std.Arrays.*;
+import Std.Arithmetic.RippleCarryCGIncByLE;
+import Std.Canon.ApplyQFT;
+
+export PreparePhaseGradientState, RyViaPhaseGradient;
+
+/// # Summary
+/// Prepares the phase gradient state |φ⟩ = (1/√2^n) Σ_k exp(-2πi·k/2^n) |k⟩_LE.
+///
+/// # Description
+/// The phase gradient state is the eigenstate of LE addition with eigenvalue
+/// e^{+2πi·x/2^n} for adding x. Prepared via QFT†|1⟩ where the QFT encoding
+/// matches the LE adder basis.
+///
+/// Note: Q#'s ApplyQFT omits the final bit-reversal swaps, so we must use
+/// the non-Reversed register to ensure the QFT output aligns with the LE
+/// adder (RippleCarryCGIncByLE). With ApplyQFT(pgr) (pgr[0]=MSB for QFT),
+/// the "no bit-reversal swap" output places frequency k at
+/// state_idx = bit_reverse(k), which corresponds to pgr_LE = k.
+///
+/// # Input
+/// ## phaseGradient
+/// Register to prepare the phase gradient state in. Must be initialized to |0...0⟩.
+/// Register is in little-endian format (pgr[0] = LSB for the adder).
+operation PreparePhaseGradientState(phaseGradient : Qubit[]) : Unit is Adj + Ctl {
+    let n = Length(phaseGradient);
+    X(phaseGradient[n - 1]);
+    Adjoint ApplyQFT(phaseGradient);
+}
+
+/// # Summary
+/// Applies Rz(θ) to a target qubit using phase gradient addition.
+///
+/// # Description
+/// Implements Rz(4π·x/2^b) where x is the integer value stored in angleQubits
+/// and b is the number of bits. Uses the CNOT-adder-CNOT pattern:
+///   1. CNOT target into each PGR qubit (flips PGR conditioned on target=|1⟩)
+///   2. Unconditional add angleQubits into PGR
+///   3. CNOT target into each PGR qubit (unflips)
+/// Net effect: PGR += angle when target=|0⟩, PGR -= angle when target=|1⟩.
+/// Cost: n Toffoli (adder) + 2b CNOTs, vs 2n Toffoli for Controlled adder.
+///
+/// Reference: Sanders et al. (arXiv:2007.07391, §IIA1, Figure 4a).
+///
+/// # Input
+/// ## targetQubit
+/// The qubit to apply the rotation to.
+/// ## angleQubits
+/// Register containing the binary representation of the rotation angle.
+/// ## phaseGradient
+/// The phase gradient ancilla register.
+operation RzViaPhaseGradient(
+    targetQubit : Qubit,
+    angleQubits : Qubit[],
+    phaseGradient : Qubit[]
+) : Unit is Adj + Ctl {
+    // CNOT-adder-CNOT: cheaper than Controlled RippleCarryCGIncByLE
+    for k in 0..Length(phaseGradient) - 1 {
+        CNOT(targetQubit, phaseGradient[k]);
+    }
+    RippleCarryCGIncByLE(angleQubits, phaseGradient);
+    for k in 0..Length(phaseGradient) - 1 {
+        CNOT(targetQubit, phaseGradient[k]);
+    }
+}
+
+/// # Summary
+/// Applies Ry(θ) to a target qubit using phase gradient addition.
+///
+/// # Description
+/// Implements Ry(θ) via the decomposition Ry = S†·H·Rz·H·S.
+/// The angle is encoded in the angleQubits register.
+///
+/// # Input
+/// ## targetQubit
+/// The qubit to apply the Y-rotation to.
+/// ## angleQubits
+/// Register containing the binary representation of the rotation angle.
+/// ## phaseGradient
+/// The phase gradient ancilla register.
+operation RyViaPhaseGradient(
+    targetQubit : Qubit,
+    angleQubits : Qubit[],
+    phaseGradient : Qubit[]
+) : Unit is Adj + Ctl {
+    S(targetQubit);
+    H(targetQubit);
+    RzViaPhaseGradient(targetQubit, angleQubits, phaseGradient);
+    H(targetQubit);
+    Adjoint S(targetQubit);
+}
diff --git a/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/QroamStatePrep.qs b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/QroamStatePrep.qs
new file mode 100644
index 000000000..9bb616110
--- /dev/null
+++ b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/QroamStatePrep.qs
@@ -0,0 +1,223 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+
+/// # Summary
+/// Dense state preparation using QROM + phase-gradient Ry rotations.
+///
+/// # Description
+/// Replaces the multiplexed-Pauli cascade in PreparePureStateD with:
+///   QROM load → Ry via phase-gradient adder → X-measurement uncompute
+///
+/// # References
+/// - Low, Kliuchnikov, Schaeffer (arXiv:1812.00954): QROM state prep
+/// - Sanders et al. (arXiv:2007.07391): Phase gradient rotation
+/// - Shende, Bullock, Markov (arXiv:quant-ph/0406176): SBM decomposition
+
+import Std.Math.*;
+import Std.Convert.*;
+import Std.Arrays.*;
+import Std.Canon.*;
+import Std.Diagnostics.*;
+import Std.TableLookup.Select;
+import PhaseGradient.RyViaPhaseGradient;
+import PhaseGradient.PreparePhaseGradientState;
+
+export QroamStatePrep, ComputeSBMAngles, ComputeSignBits;
+
+/// # Summary
+/// Prepares a dense quantum state |ψ⟩ = Σ_j α_j |j⟩ / ‖α‖ using QROM-loaded rotations.
+///
+/// # Description
+/// Given real coefficients α_j and a phase-gradient ancilla, prepares the target state
+/// by iterating over qubits MSB to LSB. At each level i, the rotation angle θ(address)
+/// is loaded via QROM conditioned on the previously-prepared qubits, applied as Ry via
+/// the phase-gradient register, and uncomputed via X-basis measurement.
+///
+/// The phase-gradient register must be pre-initialized (use PreparePhaseGradientState)
+/// and is returned unchanged (up to phases absorbed by MResetX).
+///
+/// Uses only Std.* (standard library) operations:
+///   - Std.TableLookup.Select for QROM (unary iteration)
+///   - Std.Arithmetic.RippleCarryCGIncByLE for phase-gradient rotation
+///
+/// # Input
+/// ## coefficients
+/// Real amplitudes of the target state. Length must be ≤ 2^Length(target).
+/// Padded with zeros if shorter.
+/// ## target
+/// State register, initialized to |0...0⟩.
+/// ## phaseGradient
+/// Phase-gradient ancilla register (rotationBits qubits), pre-initialized.
+///
+/// # Remarks
+/// Qubit ordering: target[0] = MSB (first-allocated = highest bit of state index).
+/// The operation is not adjointable due to mid-circuit measurements.
+operation QroamStatePrep(
+    coefficients : Double[],
+    target : Qubit[],
+    phaseGradient : Qubit[],
+    angleReg : Qubit[]
+) : Unit {
+    let nQubits = Length(target);
+    let rotationBits = Length(phaseGradient);
+    Fact(nQubits >= 1, "Need at least 1 target qubit.");
+    Fact(rotationBits >= 2, "Phase gradient register needs at least 2 qubits.");
+
+    // Classical pre-computation: SBM angle tree quantized to rotationBits bits.
+    let angleTree = ComputeSBMAngles(coefficients, nQubits, rotationBits);
+
+    // Iterate from level 0 (MSB = target[0]) to level n-1 (LSB = target[nQubits-1]).
+    // At level i, 2^i angles are conditioned on the i previously-prepared qubits.
+    for level in 0..nQubits - 1 {
+        let targetQubit = target[level];
+
+        // Extract this level's angles from the binary-heap-indexed tree.
+        let startIdx = 1 <<< level;
+        let numAngles = 1 <<< level;
+
+        if level == 0 {
+            // Root level: single unconditional rotation Ry(θ_root).
+            let angleBits = IntAsBoolArray(angleTree[1], rotationBits);
+            within {
+                ApplyPauliFromBitString(PauliX, true, angleBits, angleReg);
+            } apply {
+                RyViaPhaseGradient(targetQubit, angleReg, phaseGradient);
+            }
+        } else {
+            // Address = previously prepared qubits (target[0..level-1]), reversed
+            // so that Select's LE convention yields the correct tree node index.
+            let address = Reversed(target[0..level - 1]);
+
+            // Build QROM data table: Bool[2^level][rotationBits].
+            let data = ComputeQROMData(angleTree, startIdx, numAngles, rotationBits);
+
+            within {
+                Select(data, address, angleReg);
+            } apply {
+                RyViaPhaseGradient(targetQubit, angleReg, phaseGradient);
+            }
+        }
+    }
+
+    // Sign correction: the Ry rotations only produce |α_j| (positive amplitudes).
+    // For real states with negative coefficients, flip the phase of those basis states.
+    let signData = ComputeSignBits(coefficients, nQubits);
+    if Any(row -> row[0], signData) {
+        // At least one coefficient is negative — apply sign correction via QROM.
+        // Load 1 sign bit per basis state, apply Z, uncompute.
+        // Reversed(target) so Select's LE address matches coefficient index j.
+        use signBit = Qubit[1];
+        within {
+            Select(signData, Reversed(target), signBit);
+        } apply {
+            Z(signBit[0]);
+        }
+    }
+}
+
+/// # Summary
+/// Compute the SBM (Shende-Bullock-Markov) rotation angles for Grover-Rudolph state prep.
+///
+/// # Description
+/// Builds a binary tree of probabilities p_i = Σ_{leaves(i)} |α_j|² and converts
+/// each node to a quantized Ry angle:
+///   θ_i = arccos(√(p_left / p_i))
+///   quantized_i = Round(2^b · θ / (2π)) mod 2^b
+///
+/// The RyViaPhaseGradient operation applies Ry(4π·x/2^b), so to get Ry(2θ) we need
+/// x = 2^b · θ / (2π).
+///
+/// # Input
+/// ## coefficients
+/// Real amplitudes (will be padded to length 2^nQubits).
+/// ## nQubits
+/// Number of target qubits.
+/// ## rotationBits
+/// Phase-gradient precision (number of bits).
+///
+/// # Output
+/// Int array of length 2^nQubits, indexed as a binary heap (root at index 1).
+function ComputeSBMAngles(coefficients : Double[], nQubits : Int, rotationBits : Int) : Int[] {
+    let nCoeffs = 1 <<< nQubits;
+
+    // Pad coefficients to 2^n.
+    mutable coeffs = Repeated(0.0, nCoeffs);
+    for i in 0..MinI(Length(coefficients), nCoeffs) - 1 {
+        set coeffs w/= i <- coefficients[i];
+    }
+
+    // Build probability tree (binary heap, 1-indexed).
+    // Leaves at [nCoeffs..2*nCoeffs-1] store |α_j|².
+    mutable tree = Repeated(0.0, 2 * nCoeffs);
+    for i in 0..nCoeffs - 1 {
+        set tree w/= (nCoeffs + i) <- coeffs[i] * coeffs[i];
+    }
+    // Internal nodes = sum of children.
+    for i in (nCoeffs - 1)..-1..1 {
+        set tree w/= i <- tree[2 * i] + tree[2 * i + 1];
+    }
+
+    // Compute quantized angles.
+    let scale = IntAsDouble(1 <<< rotationBits);
+    mutable angles = Repeated(0, nCoeffs);
+
+    for level in 0..nQubits - 1 {
+        let startIdx = 1 <<< level;
+        let endIdx = (1 <<< (level + 1)) - 1;
+        for node in startIdx..endIdx {
+            let pTotal = tree[node];
+            let pLeft = tree[2 * node];
+
+            mutable angle = 0.0;
+            if pTotal > 1e-15 {
+                let cosVal = MinD(1.0, Sqrt(pLeft / pTotal));
+                set angle = ArcCos(cosVal);
+            }
+
+            // Quantize: x = 2^b · θ / (2π), so Ry(4π·x/2^b) = Ry(2θ) as desired.
+            let xInt = Round(scale * angle / (2.0 * PI())) % (1 <<< rotationBits);
+            set angles w/= node <- xInt;
+        }
+    }
+
+    return angles;
+}
+
+/// # Summary
+/// Converts a slice of the angle tree into QROM-compatible Bool[][] format.
+function ComputeQROMData(angleTree : Int[], startIdx : Int, count : Int, rotationBits : Int) : Bool[][] {
+    mutable data : Bool[][] = [];
+    for i in 0..count - 1 {
+        set data += [IntAsBoolArray(angleTree[startIdx + i], rotationBits)];
+    }
+    return data;
+}
+
+/// # Summary
+/// Computes the sign correction table for real-valued state preparation.
+///
+/// # Description
+/// Returns a Bool[2^n][1] table where entry j is [true] if coefficients[j] < 0,
+/// and [false] otherwise. Used with Select to flip the phase of basis states
+/// corresponding to negative coefficients.
+///
+/// # Input
+/// ## coefficients
+/// Real amplitudes of the target state.
+/// ## nQubits
+/// Number of target qubits.
+///
+/// # Output
+/// Bool[][] with 2^nQubits rows, each containing a single Bool (the sign bit).
+function ComputeSignBits(coefficients : Double[], nQubits : Int) : Bool[][] {
+    let nCoeffs = 1 <<< nQubits;
+    mutable signTable : Bool[][] = [];
+    for i in 0..nCoeffs - 1 {
+        if i < Length(coefficients) {
+            set signTable += [[coefficients[i] < 0.0]];
+        } else {
+            set signTable += [[false]];
+        }
+    }
+    return signTable;
+}
diff --git a/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/SelectSwap.qs b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/SelectSwap.qs
new file mode 100644
index 000000000..d30226109
--- /dev/null
+++ b/python/src/qdk_chemistry/utils/qsharp/mps_sequential/src/SelectSwap.qs
@@ -0,0 +1,393 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE.txt in the project root for
+// license information.
+
+/// SELECT-SWAP network for efficient QROM data loading.
+///
+/// Implements the SELECT-SWAP technique that trades ancilla qubits for
+/// reduced T-gate count when loading classical data into quantum registers.
+/// Uses measurement-based uncomputation for the adjoint.
+///
+/// Operations:
+///   SelectSwap — loads data[address] into output using SWAP network
+///
+/// References:
+///   Low et al. arXiv:1805.03662
+///   Berry et al. arXiv:1902.02134
+namespace QDKChemistry.Utils.SelectSwap {
+
+    import Std.Arrays.Chunks;
+    import Std.Arrays.Enumerated;
+    import Std.Arrays.Flattened;
+    import Std.Arrays.IsEmpty;
+    import Std.Arrays.MappedOverRange;
+    import Std.Arrays.Padded;
+    import Std.Arrays.Partitioned;
+    import Std.Arrays.Zipped;
+    import Std.Canon.ApplyToEachA;
+    import Std.Canon.ApplyToEachCA;
+    import Std.Convert.IntAsDouble;
+    import Std.Diagnostics.Fact;
+    import Std.Math.Ceiling;
+    import Std.Math.Floor;
+    import Std.Math.Lg;
+    import Std.Math.MaxI;
+    import Std.Math.MinI;
+    import Std.TableLookup.Select;
+    import PhaseGradient.RyViaPhaseGradient;
+
+    // ═══════════════════════════════════════════════════════════════════════════
+    //  1D SELECT-SWAP
+    // ═══════════════════════════════════════════════════════════════════════════
+
+    operation SelectSwap(numSwapBits : Int, data : Bool[][], address : Qubit[], output : Qubit[]) : Unit is Adj + Ctl {
+        let (n, nRequired) = DimensionsForSelect(data, address);
+        let addressFitted = address[...nRequired - 1];
+
+        let numSwapBits = numSwapBits == -1 ? ComputeOptimalLambda1D(Length(data), Length(data[0])) | numSwapBits;
+
+        Fact(numSwapBits <= nRequired, "Too many bits for SWAP network");
+
+        if numSwapBits == 0 {
+            Select(data, addressFitted, output);
+        } else {
+            WithSelectSwap(numSwapBits, data, address, intermediate => ApplyToEachCA(CNOT, Zipped(intermediate, output)));
+        }
+    }
+
+    // ═══════════════════════════════════════════════════════════════════════════
+    //  1D CONTROLLED QROAM-CLEAN (forward-only Select+Swap with Unlookup)
+    // ═══════════════════════════════════════════════════════════════════════════
+
+    /// # Summary
+    /// Controlled QROM load using forward-only SelectSwap with measurement-based
+    /// uncomputation (QROAMClean pattern with blocking).
+    ///
+    /// # Description
+    /// Loads `data[address]` into an internal register controlled on `control`,
+    /// applies Ry rotation on `activeQubit` using the loaded angle, then
+    /// uncomputes the loaded data using measurement-based Adjoint Select.
+    ///
+    /// When `control = |0⟩`: no data loaded, Ry(0) = identity.
+    /// When `control = |1⟩`: loads data[address], applies Ry, uncomputes.
+    ///
+    /// Uses SelectSwap blocking (lambda > 0) for the forward load to reduce
+    /// T-gate count compared to plain Controlled Select.
+    ///
+    /// # Input
+    /// ## data
+    /// Bool[N][m]: N angle entries of m bits each.
+    /// ## address
+    /// Address register (at least ceil(lg(N)) qubits).
+    /// ## control
+    /// Control qubit.
+    /// ## activeQubit
+    /// Target qubit for Ry rotation.
+    /// ## phaseGradient
+    /// Phase gradient register for Ry via phase gradient.
+    operation ControlledQroamCleanRotation(
+        data : Bool[][],
+        address : Qubit[],
+        control : Qubit,
+        activeQubit : Qubit,
+        phaseGradient : Qubit[]
+    ) : Unit {
+        let N = Length(data);
+        Fact(N > 0, "data cannot be empty");
+        let m = Length(data[0]);
+        let nRequired = Ceiling(Lg(IntAsDouble(N)));
+        let addressFitted = address[...nRequired - 1];
+        let lambda = ComputeOptimalLambdaControlled1D(N, m);
+
+        if lambda == 0 {
+            // No blocking: Controlled Select + Ry + Adjoint Select (Unlookup)
+            let zeros = Repeated(Repeated(false, m), N);
+            let extendedData = zeros + data;
+            use angleReg = Qubit[m];
+            Controlled Select([control], (data, addressFitted, angleReg));
+            RyViaPhaseGradient(activeQubit, angleReg, phaseGradient);
+            Adjoint Select(extendedData, addressFitted + [control], angleReg);
+        } else {
+            // With blocking: Controlled Select on N/K entries + Swap + Ry + Unlookup
+            let k = nRequired - lambda;
+            let addressParts = Partitioned([k, lambda], addressFitted);
+
+            let paddedData = CreatePaddedData(data, nRequired, m, k);
+            let zeros = Repeated(Repeated(false, m), N);
+            let extPaddedData = CreatePaddedData(zeros + data, nRequired + 1, m, k + 1);
+
+            use dataReg = Qubit[m * (1 <<< lambda)];
+            let chunks = Chunks(m, dataReg);
+
+            // Forward: Controlled Select loads blocked data into dataReg
+            Controlled Select([control], (paddedData, addressParts[0], dataReg));
+            // Swap: move correct chunk to position 0
+            SwapDataOutputs(addressParts[1], chunks);
+            // Rotation: Ry on activeQubit using chunks[0] as angle
+            RyViaPhaseGradient(activeQubit, chunks[0], phaseGradient);
+            // Uncompute: measurement-based Unlookup on extended padded data
+            Adjoint Select(extPaddedData, addressParts[0] + addressParts[1] + [control], dataReg);
+        }
+    }
+
+    /// # Summary
+    /// Computes optimal lambda (number of swap bits) for controlled forward-only
+    /// QROAMClean pattern.
+    ///
+    /// # Description
+    /// The cost model for controlled forward-only SelectSwap is:
+    ///   Controlled Select(N/K) + SwapDataOutputs + PhaseLookup(Unlookup)
+    /// This differs from the standard SelectSwapCost which models a full round trip.
+    internal function ComputeOptimalLambdaControlled1D(numData : Int, numBits : Int) : Int {
+        let addressBits = Ceiling(Lg(IntAsDouble(numData)));
+
+        mutable best = ControlledQroamCleanCost(0, numData, numBits);
+        mutable bestLambda = 0;
+
+        for lambda in 1..addressBits - 1 {
+            let cost = ControlledQroamCleanCost(lambda, numData, numBits);
+            if cost < best {
+                set bestLambda = lambda;
+                set best = cost;
+            }
+        }
+
+        return bestLambda;
+    }
+
+    /// Cost model for controlled forward-only QROAMClean:
+    ///   Controlled Select (N/K entries) + Swap + Unlookup (PhaseLookup)
+    internal function ControlledQroamCleanCost(lambda : Int, numData : Int, numBits : Int) : Int {
+        let addressBits = Ceiling(Lg(IntAsDouble(numData)));
+
+        // Controlled Select on padded data: 2^(addressBits-lambda) entries, +1 for control
+        let ctrlSelectCost = 2^(addressBits - lambda) - 2 + 1;
+
+        // Swap cost: (K-1) * m controlled SWAPs
+        let swapCost = (2^lambda - 1) * numBits;
+
+        // Unlookup (PhaseLookup) cost: depends on number of entries in extPaddedData
+        // extPaddedData has 2^(k+1) = 2^(addressBits-lambda+1) entries
+        let unlookupAddrBits = addressBits - lambda + 1;
+        let n1 = unlookupAddrBits / 2;
+        let n2 = unlookupAddrBits - n1;
+        let unlookupCost = MaxI(0, 2^n1 - n1 - 1) + MaxI(0, 2^n2 - n2 - 1);
+
+        return ctrlSelectCost + swapCost + unlookupCost;
+    }
+
+    // ═══════════════════════════════════════════════════════════════════════════
+    //  1D UNCONTROLLED QROAM-CLEAN (forward-only Select+Swap with Unlookup)
+    // ═══════════════════════════════════════════════════════════════════════════
+
+    /// # Summary
+    /// QROM-clean rotation: loads angle data, applies Ry rotation, then
+    /// uncomputes using measurement-based Adjoint Select.
+    ///
+    /// # Description
+    /// Replaces `within { SelectSwap } apply { Ry }` which costs 2× the
+    /// SelectSwap body (forward + adjoint). QROAMClean does forward-only +
+    /// measurement-based Unlookup for roughly half the cost.
+    ///
+    /// # Input
+    /// ## data
+    /// Bool[N][m]: N angle entries of m bits each.
+    /// ## address
+    /// Address register (at least ceil(lg(N)) qubits).
+    /// ## activeQubit
+    /// Target qubit for Ry rotation.
+    /// ## phaseGradient
+    /// Phase gradient register for Ry via phase gradient.
+    operation QroamCleanRotation(
+        data : Bool[][],
+        address : Qubit[],
+        activeQubit : Qubit,
+        phaseGradient : Qubit[]
+    ) : Unit {
+        let N = Length(data);
+        Fact(N > 0, "data cannot be empty");
+        let m = Length(data[0]);
+        let nRequired = Ceiling(Lg(IntAsDouble(N)));
+        let addressFitted = address[...nRequired - 1];
+        let lambda = ComputeOptimalLambdaQroamClean1D(N, m);
+
+        if lambda == 0 {
+            // No blocking: Select + Ry + Adjoint Select (Unlookup)
+            use angleReg = Qubit[m];
+            Select(data, addressFitted, angleReg);
+            RyViaPhaseGradient(activeQubit, angleReg, phaseGradient);
+            Adjoint Select(data, addressFitted, angleReg);
+        } else {
+            // With blocking: Select on N/K entries + Swap + Ry + Unlookup
+            let k = nRequired - lambda;
+            let addressParts = Partitioned([k, lambda], addressFitted);
+            let paddedData = CreatePaddedData(data, nRequired, m, k);
+
+            use dataReg = Qubit[m * (1 <<< lambda)];
+            let chunks = Chunks(m, dataReg);
+
+            // Forward: Select loads blocked data into dataReg
+            Select(paddedData, addressParts[0], dataReg);
+            // Swap: move correct chunk to position 0
+            SwapDataOutputs(addressParts[1], chunks);
+            // Rotation: Ry on activeQubit using chunks[0] as angle
+            RyViaPhaseGradient(activeQubit, chunks[0], phaseGradient);
+            // Uncompute: measurement-based Unlookup
+            Adjoint Select(paddedData, addressParts[0] + addressParts[1], dataReg);
+        }
+    }
+
+    /// Computes optimal lambda for uncontrolled forward-only QROAMClean pattern.
+    internal function ComputeOptimalLambdaQroamClean1D(numData : Int, numBits : Int) : Int {
+        let addressBits = Ceiling(Lg(IntAsDouble(numData)));
+
+        mutable best = QroamCleanCost(0, numData, numBits);
+        mutable bestLambda = 0;
+
+        for lambda in 1..addressBits - 1 {
+            let cost = QroamCleanCost(lambda, numData, numBits);
+            if cost < best {
+                set bestLambda = lambda;
+                set best = cost;
+            }
+        }
+
+        return bestLambda;
+    }
+
+    /// Cost model for uncontrolled forward-only QROAMClean:
+    ///   Select(N/K entries) + Swap + Unlookup(PhaseLookup)
+    internal function QroamCleanCost(lambda : Int, numData : Int, numBits : Int) : Int {
+        let addressBits = Ceiling(Lg(IntAsDouble(numData)));
+
+        // Select on padded data: 2^(addressBits-lambda) entries
+        let selectCost = 2^(addressBits - lambda) - 2;
+
+        // Swap cost: (K-1) * m controlled SWAPs
+        let swapCost = (2^lambda - 1) * numBits;
+
+        // Unlookup (PhaseLookup): paddedData has 2^(addressBits-lambda) entries
+        let unlookupAddrBits = addressBits - lambda;
+        let n1 = unlookupAddrBits / 2;
+        let n2 = unlookupAddrBits - n1;
+        let unlookupCost = MaxI(0, 2^n1 - n1 - 1) + MaxI(0, 2^n2 - n2 - 1);
+
+        return selectCost + swapCost + unlookupCost;
+    }
+
+    // ═══════════════════════════════════════════════════════════════════════════
+    //  Internal: 1D helpers
+    // ═══════════════════════════════════════════════════════════════════════════
+
+    internal operation WithSelectSwap(numSwapBits : Int, data : Bool[][], address : Qubit[], action : (Qubit[] => Unit is Adj + Ctl)) : Unit is Adj + Ctl {
+        body (...) {
+            let (n, nRequired) = DimensionsForSelect(data, address);
+            let addressFitted = address[...nRequired - 1];
+
+            Fact(numSwapBits <= nRequired, "Too many bits for SWAP network");
+            Fact(not IsEmpty(data), "data cannot be empty");
+            let m = Length(data[0]);
+
+            if numSwapBits == 0 {
+                use output = Qubit[m];
+                within {
+                    Select(data, address, output);
+                } apply {
+                    action(output);
+                }
+            } else {
+                let numSelectBits = nRequired - numSwapBits;
+                let addressParts = Partitioned([numSelectBits, numSwapBits], addressFitted);
+
+                use dataRegister = Qubit[m * 2^numSwapBits];
+
+                let dataArray = CreatePaddedData(data, nRequired, m, numSelectBits);
+                let chunkedDataRegister = Chunks(m, dataRegister);
+
+                within {
+                    WithSelectSwapSelectPart(dataArray, addressParts, dataRegister, chunkedDataRegister);
+                } apply {
+                    action(chunkedDataRegister[0]);
+                }
+            }
+        }
+
+        adjoint self;
+    }
+
+    internal operation WithSelectSwapSelectPart(data : Bool[][], addressParts : Qubit[][], target : Qubit[], chunkedTarget : Qubit[][]) : Unit {
+        body (...) {
+            Select(data, addressParts[0], target);
+            SwapDataOutputs(addressParts[1], chunkedTarget);
+        }
+
+        adjoint (...) {
+            Adjoint Select(data, addressParts[0] + addressParts[1], target);
+        }
+    }
+
+    internal function ComputeOptimalLambda1D(numData : Int, numBits : Int) : Int {
+        mutable best = 2^32;
+        mutable bestLambda = 0;
+
+        let addressBits = Ceiling(Lg(IntAsDouble(numData)));
+        for lambda in 0..addressBits - 1 {
+            let cost = SelectSwapCost1D(lambda, numData, numBits);
+            if cost < best {
+                set bestLambda = lambda;
+                set best = cost;
+            }
+        }
+
+        return bestLambda;
+    }
+
+    internal function SelectSwapCost1D(lambda : Int, numData : Int, numBits : Int) : Int {
+        if lambda == 0 {
+            return numData - 2;
+        } else {
+            let addressBits = Ceiling(Lg(IntAsDouble(numData)));
+            let split = MinI(Floor(Lg(IntAsDouble(2^lambda * numBits))), addressBits - 1);
+
+            let select_cost = 2^(addressBits - lambda) - 2;
+            let unselect_cost = MaxI(0, 2^split - 2) + 2^(addressBits - split) - 2;
+            let swap_cost = (2^lambda - 1) * numBits;
+
+            return select_cost + unselect_cost + swap_cost;
+        }
+    }
+
+    // ═══════════════════════════════════════════════════════════════════════════
+    //  Internal: shared helpers
+    // ═══════════════════════════════════════════════════════════════════════════
+
+    internal function DimensionsForSelect(data : Bool[][], address : Qubit[]) : (Int, Int) {
+        let N = Length(data);
+        Fact(N > 0, "data cannot be empty");
+
+        let n = Ceiling(Lg(IntAsDouble(N)));
+        Fact(Length(address) >= n, $"address register is too small, requires at least {n} qubits");
+
+        return (N, n);
+    }
+
+    internal function CreatePaddedData(data : Bool[][], nRequired : Int, m : Int, k : Int) : Bool[][] {
+        let dataPadded = Padded(-2^nRequired, [false, size = m], data);
+
+        MappedOverRange(i -> Flattened(dataPadded[i..2^k..2^nRequired - 1]), 0..2^k - 1)
+    }
+
+    internal operation SwapDataOutputs(address : Qubit[], outputs : Qubit[][]) : Unit is Adj {
+        let l = Length(address);
+        for (i, control) in Enumerated(address) {
+            let innerStepSize = 2^i;
+            let outerStepSize = 2^(i + 1);
+            let numSwaps = 2^l / 2^(i + 1);
+            for j in 0..numSwaps - 1 {
+                let targets1 = outputs[j * outerStepSize];
+                let targets2 = outputs[j * outerStepSize + innerStepSize];
+                ApplyToEachA(ts => Controlled SWAP([control], ts), Zipped(targets1, targets2));
+            }
+        }
+    }
+}
diff --git a/python/tests/test_mps_sequential_state_prep.py b/python/tests/test_mps_sequential_state_prep.py
new file mode 100644
index 000000000..0c3a0fe17
--- /dev/null
+++ b/python/tests/test_mps_sequential_state_prep.py
@@ -0,0 +1,834 @@
+"""Tests for MPS state preparation algorithm.
+
+Tests both the classical preprocessing (fidelity of the CSD decomposition)
+and the full Q# circuit (state preparation fidelity and gate counts).
+"""
+
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+import numpy as np
+import pytest
+from qdk import qsharp
+
+from qdk_chemistry.algorithms.state_preparation.mps_sequential import (
+    MPSSequentialStatePreparation,
+    compute_site_unitary_dense_data,
+    decompose_2d,
+    decompose_unitary_to_givens,
+    generate_mps_preparation_data,
+)
+from qdk_chemistry.data.mps_wavefunction import MPSWavefunction
+from qdk_chemistry.utils.qsharp import get_qsharp_utils
+
+
+class TestMPSWavefunction:
+    """Tests for the MPSWavefunction data container."""
+
+    def test_basic_construction(self):
+        """Test constructing an MPSWavefunction from tensors."""
+        rng = np.random.default_rng(42)
+        mps = MPSWavefunction.random(num_sites=3, bond_dim=4, rng=rng)
+        assert mps.num_sites == 3
+        assert mps.num_qubits == 6
+        assert mps.max_bond_dim == 4
+
+    def test_contract_normalized(self):
+        """Test that contracted state vector is normalized."""
+        rng = np.random.default_rng(42)
+        mps = MPSWavefunction.random(num_sites=3, bond_dim=4, rng=rng)
+        state = mps.contract()
+        assert abs(np.linalg.norm(state) - 1.0) < 1e-10
+
+    def test_validation_errors(self):
+        """Test that invalid inputs raise ValueError."""
+        with pytest.raises(ValueError, match="must not be empty"):
+            MPSWavefunction([])
+
+        with pytest.raises(ValueError, match="3-dimensional"):
+            MPSWavefunction([np.zeros((4, 4))])
+
+    def test_bond_dim_consistency(self):
+        """Test that inconsistent bond dimensions are caught."""
+        t1 = np.zeros((1, 4, 3))
+        t2 = np.zeros((2, 4, 1))  # chi_left=2 doesn't match t1's chi_right=3
+        with pytest.raises(ValueError, match="Bond dimension mismatch"):
+            MPSWavefunction([t1, t2])
+
+
+class TestDecomposition:
+    """Test that CSD reconstructs the target matrix."""
+
+    @pytest.mark.parametrize(
+        ("chi_left", "chi_right", "seed"),
+        [
+            (2, 2, 42),
+            (4, 2, 123),
+            (2, 4, 456),
+            (4, 4, 789),
+        ],
+    )
+    def test_reconstruction_matches_target(self, chi_left, chi_right, seed):
+        """Verify the 7-matrix decomposition reconstructs the target isometry."""
+        rng = np.random.default_rng(seed)
+        d = 4
+
+        # Generate a random isometry
+        raw = rng.standard_normal((d * chi_right, chi_left))
+        q_mat, _ = np.linalg.qr(raw, mode="reduced")
+        tensor = q_mat.reshape(d, chi_right, chi_left).transpose(2, 0, 1)
+
+        ancilla_bits = int(np.ceil(np.log2(max(chi_left, d * chi_right))))
+        ancilla_dim = 1 << ancilla_bits
+
+        data = compute_site_unitary_dense_data(tensor, v_from_next=None, ancilla_dim=ancilla_dim)
+
+        # Check that V is orthogonal
+        v = data["v"]
+        width = v.shape[0]
+        assert np.allclose(v @ v.T, np.eye(width), atol=1e-10)
+
+        # Check W matrices are orthogonal
+        dim = ancilla_dim
+        w_0 = data["w_0"]
+        w_1 = data["w_1"]
+        assert np.allclose(w_0 @ w_0.T, np.eye(dim), atol=1e-10)
+        assert np.allclose(w_1 @ w_1.T, np.eye(dim), atol=1e-10)
+
+        # Check U matrices are orthogonal
+        for u in data["u"]:
+            assert np.allclose(u @ u.T, np.eye(dim), atol=1e-10)
+
+    def test_decompose_2d_correctness(self):
+        """Verify decompose_2d produces valid decomposition."""
+        rng = np.random.default_rng(42)
+        dim = 4
+        raw = rng.standard_normal((2 * dim, dim))
+        q_mat, _ = np.linalg.qr(raw, mode="reduced")
+        a = q_mat[:dim, :]
+        b = q_mat[dim:, :]
+
+        u_1, u_2, d_1, _d_2, v = decompose_2d(a, b)
+
+        assert u_1.shape == (dim, dim)
+        assert u_2.shape == (dim, dim)
+        assert v.shape == (dim, dim)
+        assert np.allclose(u_1 @ u_1.T, np.eye(dim), atol=1e-10)
+        assert np.allclose(u_2 @ u_2.T, np.eye(dim), atol=1e-10)
+        assert np.allclose(v @ v.T, np.eye(dim), atol=1e-10)
+        assert np.allclose(a, u_1[:, :dim] * d_1 @ v, atol=1e-10)
+
+    def test_givens_decomposition_reconstructs_unitary(self):
+        """Verify Givens decomposition reconstructs the original unitary."""
+        rng = np.random.default_rng(42)
+        dim = 4
+        raw = rng.standard_normal((dim, dim))
+        q_mat, _ = np.linalg.qr(raw)
+
+        layer_angles, layer_shifted, phases = decompose_unitary_to_givens(q_mat)
+
+        # Reconstruct from layers using circuit convention:
+        # Circuit applies layers[0] first, ..., layers[l-1], then D.
+        # Resulting unitary = D · L_{l-1} · ... · L_0
+        # Build by left-multiplying each layer in order, then D.
+        reconstructed = np.eye(dim)
+        for angles, shifted in zip(layer_angles, layer_shifted, strict=False):
+            layer_mat = np.eye(dim)
+            if shifted:
+                pairs = [(2 * k + 1, 2 * k + 2) for k in range((dim - 1) // 2)]
+            else:
+                pairs = [(2 * k, 2 * k + 1) for k in range(dim // 2)]
+
+            for slot_idx, (i, j) in enumerate(pairs):
+                if slot_idx < len(angles) and abs(angles[slot_idx]) > 1e-15:
+                    c = np.cos(angles[slot_idx])
+                    s = np.sin(angles[slot_idx])
+                    rot = np.eye(dim)
+                    rot[i, i] = c
+                    rot[i, j] = -s
+                    rot[j, i] = s
+                    rot[j, j] = c
+                    layer_mat = layer_mat @ rot
+            reconstructed = layer_mat @ reconstructed
+
+        # Apply phase correction (D applied last = leftmost in product)
+        phase_diag = np.diag([(-1.0 if p else 1.0) for p in phases[:dim]])
+        reconstructed = phase_diag @ reconstructed
+
+        assert np.allclose(np.abs(reconstructed - q_mat), 0, atol=1e-8) or np.allclose(
+            np.abs(reconstructed + q_mat), 0, atol=1e-8
+        )
+
+
+class TestGenerateMPSPreparationData:
+    """Test the full preprocessing pipeline for MPSSequential."""
+
+    def test_two_site_data_structure(self):
+        """Verify generate_mps_preparation_data returns correct structure for 2 sites."""
+        rng = np.random.default_rng(42)
+        mps = MPSWavefunction.random(num_sites=2, bond_dim=2, rng=rng)
+        data = generate_mps_preparation_data(mps.tensors)
+
+        assert data.num_sites == 2
+        assert data.ancilla_bits >= 1
+        assert len(data.initial_state_vec) > 0
+        # One site unitary (num_sites - 1)
+        assert len(data.sites) == 1
+
+    def test_three_site_data_structure(self):
+        """Verify generate_mps_preparation_data returns correct structure for 3 sites."""
+        rng = np.random.default_rng(42)
+        mps = MPSWavefunction.random(num_sites=3, bond_dim=2, rng=rng)
+        data = generate_mps_preparation_data(mps.tensors)
+
+        assert data.num_sites == 3
+        assert len(data.sites) == 2
+
+    def test_initial_state_normalized(self):
+        """Verify the initial state vector is normalized."""
+        rng = np.random.default_rng(42)
+        mps = MPSWavefunction.random(num_sites=3, bond_dim=4, rng=rng)
+        data = generate_mps_preparation_data(mps.tensors)
+
+        init_vec = np.array(data.initial_state_vec)
+        assert abs(np.linalg.norm(init_vec) - 1.0) < 1e-10
+
+
+class TestMPSSequentialFidelity:
+    """Test that MPS Sequential state preparation produces high-fidelity states."""
+
+    @pytest.mark.parametrize(
+        ("num_sites", "bond_dim", "seed"),
+        [
+            (2, 4, 42),
+            (3, 4, 42),
+            (4, 2, 42),
+        ],
+    )
+    def test_fidelity_vs_exact(self, num_sites, bond_dim, seed):
+        """Test state preparation fidelity against the exact MPS state vector.
+
+        Uses single-shot statevector simulation via DumpMachine() instead of
+        statistical sampling (prohibitively slow due to the QROAM-based circuit
+        allocating many internal ancillas per shot).
+        """
+        # Ensure MPS Q# project is loaded into the global interpreter
+        get_qsharp_utils()
+
+        rng = np.random.default_rng(seed)
+        mps = MPSWavefunction.random(num_sites=num_sites, bond_dim=bond_dim, rng=rng)
+
+        # Classical reference state
+        target_state = mps.contract()
+
+        # Prepare gate-based data
+        prep_data = generate_mps_preparation_data(mps.tensors)
+        params = prep_data.to_qsharp_params(rotation_bits=10)
+        num_state_qubits = 2 * num_sites
+        num_ancilla_qubits = prep_data.ancilla_bits
+
+        qs_code = _build_mps_eval_code(params)
+        qsharp.eval(f"use state = Qubit[{num_state_qubits}];")
+        qsharp.eval(f"use ancilla = Qubit[{num_ancilla_qubits}];")
+        qsharp.eval(qs_code)
+        dump = qsharp.dump_machine()
+        amplitudes = np.array(dump.as_dense_state(), dtype=complex)
+        qsharp.eval("ResetAll(state + ancilla);")
+
+        # Extract amplitudes where ancilla = |0⟩.
+        # DumpMachine qubit ordering: state[0]...state[N-1], ancilla[0]...
+        # (MSB-first in bit string). Ancilla qubits are the rightmost bits.
+        num_total_qubits = num_state_qubits + num_ancilla_qubits
+        dim = 2**num_total_qubits
+        ancilla_mask = (1 << num_ancilla_qubits) - 1
+        state_dim = 2**num_state_qubits
+        state_amplitudes = np.zeros(state_dim, dtype=complex)
+        for idx in range(dim):
+            if (idx & ancilla_mask) == 0:
+                state_idx = idx >> num_ancilla_qubits
+                state_amplitudes[state_idx] = amplitudes[idx]
+
+        # P(ancilla = |0⟩) — measures how well the ancilla is disentangled
+        ancilla_zero_prob = np.sum(np.abs(state_amplitudes) ** 2)
+        assert ancilla_zero_prob > 0.90, f"P(ancilla=0) = {ancilla_zero_prob:.4f} too low — ancilla not clean"
+
+        # Normalize the post-selected state
+        state_amplitudes = state_amplitudes / np.sqrt(ancilla_zero_prob)
+
+        # Reindex: the Q# circuit uses little-endian ordering within each
+        # 2-qubit site, so DumpMachine's big-endian bits need to be reversed
+        # within each site to match the Python MPS convention.
+        site_bits = 2  # qubits per site
+        reordered = np.zeros_like(state_amplitudes)
+        for dm_idx in range(state_dim):
+            py_idx = 0
+            for site in range(num_sites):
+                shift = (num_sites - 1 - site) * site_bits
+                site_val = (dm_idx >> shift) & ((1 << site_bits) - 1)
+                # Reverse bits within this site
+                rev_val = 0
+                for b in range(site_bits):
+                    if site_val & (1 << b):
+                        rev_val |= 1 << (site_bits - 1 - b)
+                py_idx |= rev_val << shift
+            reordered[py_idx] = state_amplitudes[dm_idx]
+        state_amplitudes = reordered
+
+        # Compute quantum state fidelity |⟨target|prepared⟩|²
+        fidelity = np.abs(np.dot(np.conj(state_amplitudes[: len(target_state)]), target_state)) ** 2
+
+        # Should achieve high fidelity with rotation_bits=10
+        assert fidelity > 0.95, f"Fidelity {fidelity:.4f} too low for num_sites={num_sites}, bond_dim={bond_dim}"
+
+
+# Qualtran reference test data: MPS tensors and expected states
+
+# These tensors and expected states are from the Qualtran MPSPreparation tests
+# (Apache-2.0). They serve as regression fixtures for state preparation fidelity.
+
+_qualtran_mps_tensors = (
+    np.array(
+        [
+            [
+                [0.01650572, 0.0, 0.0, 0.0],
+                [0.0, -0.52929781, 0.0, 0.0],
+                [0.0, 0.0, -0.84462254, 0.0],
+                [0.0, 0.0, 0.0, -0.07863941],
+            ]
+        ]
+    ),
+    np.array(
+        [
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [-0.05969264, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.9973967, 0.04045497, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [-0.08381532, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.98376348, 0.15869598, 0.0],
+            ],
+            [
+                [-0.0421477, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.46961402, 0.0265522, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.41109095, 0.03268939, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.77904869],
+            ],
+        ]
+    ),
+    np.array(
+        [
+            [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [-0.19640516, 0.0, 0.0, 0.0], [0.0, -0.98052283, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [-0.98052283, 0.0, 0.0, 0.0], [0.0, 0.19640516, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [-0.02411236, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, -0.99970925, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [-0.99970925, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.02411236, 0.0]],
+            [
+                [-0.17695837, 0.0, 0.0, 0.0],
+                [0.0, -0.58052668, 0.0, 0.0],
+                [0.0, 0.0, -0.53176612, 0.0],
+                [0.0, 0.0, 0.0, -0.59067698],
+            ],
+        ]
+    ),
+    np.array(
+        [
+            [[0.0], [0.0], [0.0], [1.0]],
+            [[0.0], [0.0], [1.0], [0.0]],
+            [[0.0], [1.0], [0.0], [0.0]],
+            [[1.0], [0.0], [0.0], [0.0]],
+        ]
+    ),
+)
+
+_qualtran_mps_expected_state = np.array(
+    [0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.01650572, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.03159519, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.12468186, 0.        , 0.        ,
+     0.51343194, 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.07079231, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.15403441, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.82743524, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.00331447, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.00930066, 0.        , 0.        ,
+     0.03580077, 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.00334943, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.03225657, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.01084116, 0.        , 0.        ,
+     0.03556534, 0.        , 0.        , 0.03257808, 0.        , 0.        ,
+     0.03618719, 0.        , 0.        , 0.        ])  # fmt: skip
+
+# Qualtran resource estimates (QROM mode) for cross-validation.
+# From QubitCount and QECGatesCost (and_bloq + cswap).
+QUALTRAN_COST_DENSE = {"num_qubits": 26, "toffoli": 600}
+QUALTRAN_COST_SPARSE = {"num_qubits": 32, "toffoli": 321}
+
+# Non-zero spin MPS tensors — a 4-site system with chi_left=3 (singlet embedding).
+# From the Qualtran MPSPreparation tests (Apache-2.0).
+_qualtran_mps_tensors_non_zero_spin = (
+    np.array(
+        [
+            [
+                [-0.00110206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.00316609, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, -0.57734054, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.00110206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, -0.00223876, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, -0.00223876, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.57734054, 0.0],
+            ],
+            [
+                [0.0, 0.0, -0.00110206, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.00316609, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.57734054],
+            ],
+        ]
+    ),
+    np.array(
+        [
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [-1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [-0.70710678, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, -0.70710678, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [-0.55872176, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.82920795, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01562571, 0.0],
+            ],
+            [
+                [0.0, -0.55872176, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.82920795, -0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01562571],
+            ],
+            [
+                [0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.70710678, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.70710678],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            ],
+        ]
+    ),
+    np.array(
+        [
+            [
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [-1.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0],
+                [-1.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 1.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [-0.99960484, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, -0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.02810986],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, -0.70710678, 0.0, 0.0],
+                [0.0, 0.0, -0.70710678, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 1.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, -0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, -1.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, -0.0, 0.0],
+                [0.0, 0.0, 0.0, -1.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+        ]
+    ),
+    np.array(
+        [
+            [[0.0], [0.0], [0.0], [1.0]],
+            [[0.0], [0.0], [1.0], [0.0]],
+            [[0.0], [1.0], [0.0], [0.0]],
+            [[1.0], [0.0], [0.0], [0.0]],
+        ]
+    ),
+)
+
+_qualtran_mps_expected_state_non_zero_spin = np.array(
+    [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        , -0.00110206,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        , -0.00110206,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.00176896,  0.        ,  0.        ,  0.        ,
+       -0.00125085,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        , -0.00262431,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.00185567,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+       -0.00125085,  0.        ,  0.        ,  0.        ,  0.00176896,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.00185567,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        , -0.00262431,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.57734054,  0.        ,  0.        ,
+        0.        , -0.40824141,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        , -0.40824141,  0.        ,
+        0.        ,  0.        ,  0.57734054,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
+        0.        ])  # fmt: skip
+
+# Qualtran resource estimates for the non-zero spin case (chi_left=3).
+QUALTRAN_COST_NON_ZERO_SPIN_DENSE = {"num_qubits": 29, "toffoli": 734}
+QUALTRAN_COST_NON_ZERO_SPIN_SPARSE = {"num_qubits": 29, "toffoli": 258}
+
+
+class TestMPSSequentialQualtranFidelity:
+    """Test that MPSWavefunction contraction matches Qualtran expected states."""
+
+    @pytest.mark.parametrize(
+        ("tensors", "expected_state"),
+        [
+            (_qualtran_mps_tensors, _qualtran_mps_expected_state),
+            (_qualtran_mps_tensors_non_zero_spin, _qualtran_mps_expected_state_non_zero_spin),
+        ],
+        ids=["standard", "non_zero_spin"],
+    )
+    def test_contract_matches_expected_state(self, tensors, expected_state):
+        """Verify MPS contraction produces the Qualtran expected state vector."""
+        mps = MPSWavefunction(tensors)
+        contracted = mps.contract()
+        # Use atol=1e-3: non-zero spin tensors are not perfectly canonical,
+        # so raw contraction has O(1e-4) leakage. Fidelity is still > 0.9999.
+        assert np.allclose(contracted, expected_state, atol=1e-3)
+
+
+class TestMPSSequentialQualtranCostComparison:
+    """Test that Q# resource estimates are consistent with Qualtran."""
+
+    @pytest.mark.parametrize(
+        ("tensors", "expected_num_sites", "min_ancilla_bits"),
+        [
+            (_qualtran_mps_tensors, 4, 3),
+            (_qualtran_mps_tensors_non_zero_spin, 4, 3),
+        ],
+        ids=["standard", "non_zero_spin"],
+    )
+    def test_prepare_gate_based_data_produces_valid_output(self, tensors, expected_num_sites, min_ancilla_bits):
+        """Verify generate_mps_preparation_data succeeds on Qualtran tensors."""
+        mps = MPSWavefunction(tensors)
+        data = generate_mps_preparation_data(mps.tensors)
+
+        assert data.num_sites == expected_num_sites
+        assert data.ancilla_bits >= min_ancilla_bits
+        # num_sites - 1 site unitaries
+        assert len(data.sites) == expected_num_sites - 1
+
+        # Initial state should be normalized
+        init_vec = np.array(data.initial_state_vec)
+        assert abs(np.linalg.norm(init_vec) - 1.0) < 1e-10
+
+    @pytest.mark.parametrize(
+        ("tensors", "qualtran_cost"),
+        [
+            (_qualtran_mps_tensors, QUALTRAN_COST_DENSE),
+            (_qualtran_mps_tensors_non_zero_spin, QUALTRAN_COST_NON_ZERO_SPIN_DENSE),
+        ],
+        ids=["standard", "non_zero_spin"],
+    )
+    def test_resource_estimate_qubit_count(self, tensors, qualtran_cost):
+        """Verify Q# resource estimate qubit count is consistent with Qualtran."""
+        mps = MPSWavefunction(tensors)
+        algo = MPSSequentialStatePreparation()
+        circuit = algo.run(mps)
+        result = circuit.estimate()
+        counts = result.logical_counts
+
+        # Q# estimate should use a comparable number of qubits to Qualtran dense mode
+        assert counts["numQubits"] >= qualtran_cost["num_qubits"]
+        assert counts["numQubits"] <= qualtran_cost["num_qubits"] * 2
+
+    @pytest.mark.parametrize(
+        ("tensors", "qualtran_cost"),
+        [
+            (_qualtran_mps_tensors, QUALTRAN_COST_SPARSE),
+            (_qualtran_mps_tensors_non_zero_spin, QUALTRAN_COST_NON_ZERO_SPIN_SPARSE),
+        ],
+        ids=["standard", "non_zero_spin"],
+    )
+    def test_resource_estimate_toffoli_count(self, tensors, qualtran_cost):
+        """Verify Q# resource estimate Toffoli count is consistent with Qualtran."""
+        mps = MPSWavefunction(tensors)
+        algo = MPSSequentialStatePreparation()
+        circuit = algo.run(mps)
+        result = circuit.estimate()
+        counts = result.logical_counts
+
+        # Q# logical_counts reports all CCZ gates including internal QROAM/Select
+        # decompositions, so the count is larger than Qualtran's sparse Toffoli count.
+        assert counts["cczCount"] > 0
+        assert counts["cczCount"] <= qualtran_cost["toffoli"] * 10
+
+
+class TestMPSSequentialFastEstimation:
+    """Test that fast resource estimation mode produces similar results to normal mode."""
+
+    @pytest.mark.parametrize(
+        "tensors",
+        [_qualtran_mps_tensors, _qualtran_mps_tensors_non_zero_spin],
+        ids=["standard", "non_zero_spin"],
+    )
+    def test_fast_vs_normal_resource_estimates(self, tensors):
+        """Verify fast estimation produces similar qubit and Toffoli counts as normal mode."""
+        mps = MPSWavefunction(tensors)
+
+        # Normal mode (full CSD decomposition per site)
+        algo_normal = MPSSequentialStatePreparation()
+        circuit_normal = algo_normal.run(mps)
+        result_normal = circuit_normal.estimate()
+        counts_normal = result_normal.logical_counts
+
+        # Fast mode (one representative per shape group)
+        algo_fast = MPSSequentialStatePreparation()
+        algo_fast.settings().update("fast_resource_estimation", True)
+        circuit_fast = algo_fast.run(mps)
+        result_fast = circuit_fast.estimate()
+        counts_fast = result_fast.logical_counts
+
+        # Qubit counts should be identical (same register layout)
+        assert counts_fast["numQubits"] == counts_normal["numQubits"]
+
+        # Toffoli counts should be close — fast mode uses dummy angles of the
+        # correct array dimensions, so QROAM table sizes and rotation counts match.
+        # Allow up to 30% deviation since shape grouping may merge sites with
+        # slightly different effective dimensions.
+        normal_ccz = counts_normal["cczCount"]
+        fast_ccz = counts_fast["cczCount"]
+        assert fast_ccz > 0
+        ratio = fast_ccz / normal_ccz
+        assert 0.9 <= ratio <= 1.1, (
+            f"Fast/normal CCZ ratio {ratio:.3f} outside [0.9, 1.1]: fast={fast_ccz}, normal={normal_ccz}"
+        )
+
+    @pytest.mark.parametrize(
+        ("num_sites", "bond_dim", "seed"),
+        [
+            (3, 2, 42),
+            (3, 4, 99),
+            (4, 2, 7),
+        ],
+    )
+    def test_fast_vs_normal_small_random_mps(self, num_sites, bond_dim, seed):
+        """Verify fast estimation agrees with normal mode on small random MPS circuits."""
+        rng = np.random.default_rng(seed)
+        mps = MPSWavefunction.random(num_sites=num_sites, bond_dim=bond_dim, rng=rng)
+
+        algo_normal = MPSSequentialStatePreparation()
+        circuit_normal = algo_normal.run(mps)
+        result_normal = circuit_normal.estimate()
+        counts_normal = result_normal.logical_counts
+
+        algo_fast = MPSSequentialStatePreparation()
+        algo_fast.settings().update("fast_resource_estimation", True)
+        circuit_fast = algo_fast.run(mps)
+        result_fast = circuit_fast.estimate()
+        counts_fast = result_fast.logical_counts
+
+        assert counts_fast["numQubits"] == counts_normal["numQubits"]
+
+        normal_ccz = counts_normal["cczCount"]
+        fast_ccz = counts_fast["cczCount"]
+        assert fast_ccz > 0
+        ratio = fast_ccz / normal_ccz
+        assert 0.9 <= ratio <= 1.1, (
+            f"Fast/normal CCZ ratio {ratio:.3f} outside [0.9, 1.1]: fast={fast_ccz}, normal={normal_ccz}"
+        )
+
+
+# =============================================================================
+# Helper functions for Q# literal generation
+# =============================================================================
+
+
+def _float_to_qsharp(x: float) -> str:
+    """Format float for Q#."""
+    return f"{x:.15f}"
+
+
+def _build_mps_eval_code(params: dict) -> str:
+    """Build Q# eval code for MPSSequential from the params dict.
+
+    Assumes `state` and `ancilla` qubit registers are already allocated in scope.
+    """
+    initial_state_str = ", ".join(_float_to_qsharp(x) for x in params["initialStateVec"])
+    args = [
+        f"[{initial_state_str}]",
+        str(params["numSites"]),
+        str(params["rotationBits"]),
+        _nested_list_to_qsharp_3d(params["siteVLayerAngles"]),
+        _nested_list_to_qsharp_2d_bool(params["siteVLayerShifted"]),
+        _nested_list_to_qsharp_2d_bool(params["siteVPhases"]),
+        _nested_list_to_qsharp_2d_float(params["siteRot0Angles"]),
+        _nested_list_to_qsharp_2d_float(params["siteRot1Angles"]),
+        _nested_list_to_qsharp_2d_float(params["siteRot2Angles"]),
+        _nested_list_to_qsharp_3d(params["siteW0LayerAngles"]),
+        _nested_list_to_qsharp_2d_bool(params["siteW0LayerShifted"]),
+        _nested_list_to_qsharp_2d_bool(params["siteW0Phases"]),
+        _nested_list_to_qsharp_3d(params["siteW1LayerAngles"]),
+        _nested_list_to_qsharp_2d_bool(params["siteW1LayerShifted"]),
+        _nested_list_to_qsharp_2d_bool(params["siteW1Phases"]),
+        _nested_list_to_qsharp_3d(params["siteULayerAngles"]),
+        _nested_list_to_qsharp_2d_bool(params["siteULayerShifted"]),
+        _nested_list_to_qsharp_2d_bool(params["siteUPhases"]),
+        "state",
+        "ancilla",
+    ]
+    args_str = ",\n                ".join(args)
+    return f"MPSSequential.MPSSequential(\n                {args_str}\n    )"
+
+
+def _nested_list_to_qsharp_3d(data: list) -> str:
+    """Convert list[list[list[float]]] to Q# literal."""
+    site_strs = []
+    for site in data:
+        layer_strs = []
+        for layer in site:
+            angles = ", ".join(_float_to_qsharp(a) for a in layer)
+            layer_strs.append(f"[{angles}]")
+        site_strs.append(f"[{', '.join(layer_strs)}]")
+    return f"[{', '.join(site_strs)}]"
+
+
+def _nested_list_to_qsharp_2d_float(data: list) -> str:
+    """Convert list[list[float]] to Q# literal."""
+    site_strs = []
+    for site in data:
+        vals = ", ".join(_float_to_qsharp(v) for v in site)
+        site_strs.append(f"[{vals}]")
+    return f"[{', '.join(site_strs)}]"
+
+
+def _nested_list_to_qsharp_2d_bool(data: list) -> str:
+    """Convert list[list[bool]] to Q# literal."""
+    site_strs = []
+    for site in data:
+        vals = ", ".join("true" if b else "false" for b in site)
+        site_strs.append(f"[{vals}]")
+    return f"[{', '.join(site_strs)}]"
diff --git a/python/tests/test_mps_sparse_state_prep.py b/python/tests/test_mps_sparse_state_prep.py
new file mode 100644
index 000000000..0dbc9efd1
--- /dev/null
+++ b/python/tests/test_mps_sparse_state_prep.py
@@ -0,0 +1,656 @@
+"""Tests for MPS sparse state preparation algorithm.
+
+Tests both the classical preprocessing (decomposition correctness) and
+the full Q# circuit (state preparation fidelity via statevector simulation).
+"""
+
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+import numpy as np
+import pytest
+from qdk import qsharp
+from scipy.linalg import block_diag
+
+from qdk_chemistry.algorithms.state_preparation.mps_sparse import (
+    MPSSparseStatePreparation,
+    _expand_to_unitary,
+    _find_column_permutation,
+    _get_rectangles_and_row_permutation,
+    _invert_perm,
+    _order_blocks,
+    _pad_permutation,
+    _perm_to_bitstrings,
+    _tensor_to_target_matrix,
+    generate_mps_sparse_preparation_data,
+)
+from qdk_chemistry.data.mps_wavefunction import MPSWavefunction
+from qdk_chemistry.utils.qsharp import get_qsharp_utils
+
+# =============================================================================
+# Qualtran reference data (from test_mps_sequential_state_prep.py)
+# =============================================================================
+
+_qualtran_mps_tensors = (
+    np.array(
+        [
+            [
+                [0.01650572, 0.0, 0.0, 0.0],
+                [0.0, -0.52929781, 0.0, 0.0],
+                [0.0, 0.0, -0.84462254, 0.0],
+                [0.0, 0.0, 0.0, -0.07863941],
+            ]
+        ]
+    ),
+    np.array(
+        [
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [-0.05969264, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.9973967, 0.04045497, 0.0, 0.0, 0.0],
+            ],
+            [
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [-0.08381532, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.98376348, 0.15869598, 0.0],
+            ],
+            [
+                [-0.0421477, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.46961402, 0.0265522, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.41109095, 0.03268939, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.77904869],
+            ],
+        ]
+    ),
+    np.array(
+        [
+            [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [-0.19640516, 0.0, 0.0, 0.0], [0.0, -0.98052283, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [-0.98052283, 0.0, 0.0, 0.0], [0.0, 0.19640516, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [-0.02411236, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, -0.99970925, 0.0]],
+            [[0.0, 0.0, 0.0, 0.0], [-0.99970925, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.02411236, 0.0]],
+            [
+                [-0.17695837, 0.0, 0.0, 0.0],
+                [0.0, -0.58052668, 0.0, 0.0],
+                [0.0, 0.0, -0.53176612, 0.0],
+                [0.0, 0.0, 0.0, -0.59067698],
+            ],
+        ]
+    ),
+    np.array(
+        [
+            [[0.0], [0.0], [0.0], [1.0]],
+            [[0.0], [0.0], [1.0], [0.0]],
+            [[0.0], [1.0], [0.0], [0.0]],
+            [[1.0], [0.0], [0.0], [0.0]],
+        ]
+    ),
+)
+
+_qualtran_mps_expected_state = np.array(
+    [0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.01650572, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.03159519, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.12468186, 0.        , 0.        ,
+     0.51343194, 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.07079231, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.15403441, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.82743524, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.00331447, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.00930066, 0.        , 0.        ,
+     0.03580077, 0.        , 0.        , 0.        , 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.00334943, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.03225657, 0.        , 0.        ,
+     0.        , 0.        , 0.        , 0.01084116, 0.        , 0.        ,
+     0.03556534, 0.        , 0.        , 0.03257808, 0.        , 0.        ,
+     0.03618719, 0.        , 0.        , 0.        ])  # fmt: skip
+
+# Qualtran resource estimates for sparse mode.
+QUALTRAN_COST_SPARSE = {"num_qubits": 32, "toffoli": 321}
+
+
+# =============================================================================
+# Tests: Preprocessing correctness
+# =============================================================================
+
+
+class TestSparseDecompositionHelpers:
+    """Unit tests for the internal sparse decomposition helper functions."""
+
+    def test_tensor_to_target_matrix_shape(self):
+        """Verify target matrix has correct shape."""
+        tensor = _qualtran_mps_tensors[1]  # shape (4, 4, 6)
+        chi_left = tensor.shape[0]
+        ancilla_dim = 8
+        mat = _tensor_to_target_matrix(tensor, ancilla_dim)
+        assert mat.shape == (4 * ancilla_dim, chi_left)
+
+    def test_tensor_to_target_matrix_sparse(self):
+        """Verify target matrix is sparse (most entries zero)."""
+        tensor = _qualtran_mps_tensors[1]
+        mat = _tensor_to_target_matrix(tensor, 8)
+        dense = mat.toarray()
+        nnz_frac = np.count_nonzero(dense) / dense.size
+        assert nnz_frac < 0.15  # less than 15% non-zero
+
+    def test_rectangles_cover_nonzero_rows(self):
+        """Verify that extracted rectangles cover all nonzero structure."""
+        tensor = _qualtran_mps_tensors[1]
+        ancilla_dim = 8
+        mat = _tensor_to_target_matrix(tensor, ancilla_dim)
+        rectangles, row_perm = _get_rectangles_and_row_permutation(mat)
+
+        # All nonzero rows should be captured in row_perm
+        dense = mat.toarray()
+        nonzero_rows = set(np.where(np.any(dense != 0, axis=1))[0])
+        assert nonzero_rows.issubset(set(row_perm))
+
+        # Each rectangle should be non-empty
+        for rect in rectangles:
+            assert rect.size > 0
+
+    def test_column_permutation_is_valid(self):
+        """Verify column permutation is a valid permutation."""
+        tensor = _qualtran_mps_tensors[1]
+        ancilla_dim = 8
+        active_dim = 4 * ancilla_dim
+        mat = _tensor_to_target_matrix(tensor, ancilla_dim)
+        rectangles, _ = _get_rectangles_and_row_permutation(mat)
+        col_perm = _find_column_permutation(rectangles, active_dim)
+
+        # Should be a valid permutation of [0..active_dim-1]
+        assert sorted(col_perm) == list(range(active_dim))
+
+    def test_expand_to_unitary_produces_unitary(self):
+        """Verify expanded blocks are unitary."""
+        tensor = _qualtran_mps_tensors[1]
+        ancilla_dim = 8
+        mat = _tensor_to_target_matrix(tensor, ancilla_dim)
+        rectangles, _ = _get_rectangles_and_row_permutation(mat)
+
+        for rect in rectangles:
+            unitary = _expand_to_unitary(rect)
+            h = unitary.shape[0]
+            assert np.allclose(unitary @ unitary.T, np.eye(h), atol=1e-8)
+
+    def test_order_blocks_sorts_descending(self):
+        """Verify blocks are sorted by size largest first."""
+        blocks = [np.eye(2), np.eye(5), np.eye(3)]
+        dim = 10
+        _, sorted_blocks = _order_blocks(blocks, dim)
+        sizes = [b.shape[0] for b in sorted_blocks]
+        assert sizes == sorted(sizes, reverse=True)
+
+    def test_order_blocks_permutation_valid(self):
+        """Verify _order_blocks returns a valid permutation."""
+        blocks = [np.eye(2), np.eye(5), np.eye(3)]
+        dim = 10
+        perm, _ = _order_blocks(blocks, dim)
+        assert sorted(perm) == list(range(dim))
+
+    def test_invert_perm_roundtrip(self):
+        """Verify inverting a permutation round-trips."""
+        perm = [3, 0, 4, 1, 2]
+        inv = _invert_perm(perm)
+        # Applying perm then inv should be identity
+        for i in range(len(perm)):
+            assert inv[perm[i]] == i
+
+    def test_perm_to_bitstrings_roundtrip(self):
+        """Verify bitstring encoding round-trips correctly."""
+        perm = [5, 2, 7, 0, 3, 1, 6, 4]
+        num_bits = 3
+        bitstrings = _perm_to_bitstrings(perm, num_bits)
+        for i, bits in enumerate(bitstrings):
+            val = sum(int(b) << j for j, b in enumerate(bits))
+            assert val == perm[i]
+
+
+class TestSparseDecompositionCorrectness:
+    """End-to-end correctness tests for the sparse decomposition.
+
+    Verifies that V[invert(row_perm)][:, col_perm][:, :cols] == target.
+    """
+
+    @pytest.mark.parametrize("site_idx", [1, 2, 3])
+    def test_decomposition_reconstructs_target(self, site_idx):
+        """Verify decomposition reproduces the target matrix for each site."""
+        tensor = _qualtran_mps_tensors[site_idx]
+        chi_left = tensor.shape[0]
+        ancilla_dim = 8
+        active_dim = 4 * ancilla_dim
+
+        target_mat = _tensor_to_target_matrix(tensor, ancilla_dim)
+        target_dense = target_mat.toarray()
+        num_cols = chi_left
+
+        # Run the decomposition steps manually
+        rectangles, row_perm = _get_rectangles_and_row_permutation(target_mat)
+        row_perm = _pad_permutation(row_perm, active_dim)
+        col_perm = _find_column_permutation(rectangles, active_dim)
+
+        blocks = [_expand_to_unitary(r) for r in rectangles]
+        blocks += [np.eye(1)] * (active_dim - sum(b.shape[0] for b in blocks))
+
+        ordering_perm, blocks = _order_blocks(blocks, active_dim)
+
+        # Compose following Qualtran convention
+        col_composed = [col_perm[ordering_perm[i]] for i in range(active_dim)]
+        col_perm_final = _invert_perm(col_composed)
+        row_perm_final = [row_perm[ordering_perm[i]] for i in range(active_dim)]
+
+        row_inv = _invert_perm(row_perm_final)
+        block_diag_mat = block_diag(*blocks)
+        result = block_diag_mat[row_inv][:, col_perm_final][:, :num_cols]
+
+        assert np.allclose(result, target_dense, atol=1e-10), (
+            f"Decomposition failed for site {site_idx}: max error = {np.max(np.abs(result - target_dense)):.2e}"
+        )
+
+    def test_decomposition_all_blocks_unitary(self):
+        """Verify all blocks after ordering are unitary."""
+        tensor = _qualtran_mps_tensors[1]
+        ancilla_dim = 8
+        active_dim = 4 * ancilla_dim
+
+        mat = _tensor_to_target_matrix(tensor, ancilla_dim)
+        rectangles, _ = _get_rectangles_and_row_permutation(mat)
+        blocks = [_expand_to_unitary(r) for r in rectangles]
+        blocks += [np.eye(1)] * (active_dim - sum(b.shape[0] for b in blocks))
+        _, blocks = _order_blocks(blocks, active_dim)
+
+        for i, b in enumerate(blocks):
+            h = b.shape[0]
+            err = np.max(np.abs(b @ b.T - np.eye(h)))
+            assert err < 1e-8, f"Block {i} (size {h}) not unitary: error={err:.2e}"
+
+
+class TestGenerateMPSSparsePreparationData:
+    """Test the full sparse preprocessing pipeline."""
+
+    def test_data_structure_standard(self):
+        """Verify sparse data has correct structure for standard tensors."""
+        data = generate_mps_sparse_preparation_data(_qualtran_mps_tensors)
+
+        assert data.num_sites == 4
+        assert data.ancilla_bits >= 2
+        # One entry per site after the first
+        assert len(data.sites) == 3
+
+    def test_initial_state_normalized(self):
+        """Verify the initial state vector is normalized."""
+        data = generate_mps_sparse_preparation_data(_qualtran_mps_tensors)
+        init_vec = np.array(data.initial_state_vec)
+        assert abs(np.linalg.norm(init_vec) - 1.0) < 1e-10
+
+    def test_permutation_sizes_consistent(self):
+        """Verify permutation target arrays have consistent sizes."""
+        data = generate_mps_sparse_preparation_data(_qualtran_mps_tensors)
+        ancilla_dim = 1 << data.ancilla_bits
+        active_dim = 4 * ancilla_dim
+
+        for site in data.sites:
+            assert len(site.col_perm_targets) == active_dim
+            assert len(site.row_perm_targets) == active_dim
+            assert sorted(site.col_perm_targets) == list(range(active_dim))
+            assert sorted(site.row_perm_targets) == list(range(active_dim))
+            assert site.target_bits == int(np.log2(active_dim))
+
+    def test_to_qsharp_params_structure(self):
+        """Verify to_qsharp_params returns all expected keys."""
+        data = generate_mps_sparse_preparation_data(_qualtran_mps_tensors)
+        params = data.to_qsharp_params(rotation_bits=10)
+
+        expected_keys = {
+            "initialStateVec",
+            "numSites",
+            "rotationBits",
+            "numAncillaQubits",
+            "siteColPermTargets",
+            "siteColInvPermTargets",
+            "siteRowPermTargets",
+            "siteRowInvPermTargets",
+            "siteBlockLayerAngles",
+            "siteBlockLayerShifted",
+            "siteBlockPhases",
+        }
+        assert set(params.keys()) == expected_keys
+        assert params["numSites"] == 4
+        assert params["rotationBits"] == 10
+        assert len(params["siteColPermTargets"]) == 3
+        assert len(params["siteRowPermTargets"]) == 3
+        assert len(params["siteColInvPermTargets"]) == 3
+        assert len(params["siteRowInvPermTargets"]) == 3
+
+    @pytest.mark.parametrize(
+        ("num_sites", "bond_dim", "seed"),
+        [
+            (2, 4, 42),
+            (3, 2, 123),
+            (4, 2, 456),
+        ],
+    )
+    def test_random_mps_decomposition(self, num_sites, bond_dim, seed):
+        """Verify decomposition produces valid data for random MPS tensors."""
+        rng = np.random.default_rng(seed)
+        mps = MPSWavefunction.random(num_sites=num_sites, bond_dim=bond_dim, rng=rng)
+        data = generate_mps_sparse_preparation_data(mps.tensors)
+
+        assert data.num_sites == num_sites
+        assert len(data.sites) == num_sites - 1
+        init_vec = np.array(data.initial_state_vec)
+        assert abs(np.linalg.norm(init_vec) - 1.0) < 1e-10
+
+
+class TestMPSSparseQSharpFidelity:
+    """Test that the MPSSparse Q# circuit produces the correct state."""
+
+    @pytest.mark.parametrize(
+        ("num_sites", "bond_dim", "seed"),
+        [
+            (2, 4, 42),
+        ],
+    )
+    def test_fidelity_random_mps(self, num_sites, bond_dim, seed):
+        """Test sparse state preparation fidelity on random MPS.
+
+        Uses single-shot statevector simulation via DumpMachine().
+        """
+        get_qsharp_utils()
+
+        rng = np.random.default_rng(seed)
+        mps = MPSWavefunction.random(num_sites=num_sites, bond_dim=bond_dim, rng=rng)
+        target_state = mps.contract()
+
+        data = generate_mps_sparse_preparation_data(mps.tensors)
+        params = data.to_qsharp_params(rotation_bits=10)
+
+        num_state_qubits = 2 * num_sites
+        num_ancilla_qubits = data.ancilla_bits
+
+        qs_code = _build_mps_sparse_eval_code(params)
+        qsharp.eval(f"use state = Qubit[{num_state_qubits}];")
+        qsharp.eval(f"use ancilla = Qubit[{num_ancilla_qubits}];")
+        qsharp.eval(qs_code)
+        dump = qsharp.dump_machine()
+        amplitudes = np.array(dump.as_dense_state(), dtype=complex)
+        qsharp.eval("ResetAll(state + ancilla);")
+
+        state_amplitudes = _extract_state_amplitudes(amplitudes, num_state_qubits, num_ancilla_qubits)
+
+        # P(ancilla = |0>) should be high
+        ancilla_zero_prob = np.sum(np.abs(state_amplitudes) ** 2)
+        assert ancilla_zero_prob > 0.85, f"P(ancilla=0) = {ancilla_zero_prob:.4f} too low"
+
+        # Normalize and reindex
+        state_amplitudes = state_amplitudes / np.sqrt(ancilla_zero_prob)
+        state_amplitudes = _reindex_sites(state_amplitudes, num_sites)
+
+        # Compute probability fidelity (Bhattacharyya coefficient squared).
+        # The sparse circuit uses measurement-based QROAM uncomputation which
+        # introduces non-deterministic phase flips; probability fidelity is the
+        # appropriate metric.
+        p = np.abs(state_amplitudes[: len(target_state)]) ** 2
+        q = np.abs(target_state) ** 2
+        fidelity = np.sum(np.sqrt(p * q)) ** 2
+        assert fidelity > 0.90, f"Fidelity {fidelity:.4f} too low for num_sites={num_sites}, bond_dim={bond_dim}"
+
+    def test_fidelity_qualtran_tensors(self):
+        """Test sparse preparation fidelity on the Qualtran reference tensors."""
+        get_qsharp_utils()
+
+        mps = MPSWavefunction(_qualtran_mps_tensors)
+        target_state = _qualtran_mps_expected_state
+
+        data = generate_mps_sparse_preparation_data(mps.tensors)
+        params = data.to_qsharp_params(rotation_bits=10)
+
+        num_sites = 4
+        num_state_qubits = 2 * num_sites
+        num_ancilla_qubits = data.ancilla_bits
+
+        qs_code = _build_mps_sparse_eval_code(params)
+        qsharp.eval(f"use state = Qubit[{num_state_qubits}];")
+        qsharp.eval(f"use ancilla = Qubit[{num_ancilla_qubits}];")
+        qsharp.eval(qs_code)
+        dump = qsharp.dump_machine()
+        qsharp.eval("ResetAll(state + ancilla);")
+
+        # Use sparse extraction (dump may have many internal qubits from QROAM)
+        state_amplitudes = _extract_state_amplitudes_sparse(dump, num_state_qubits, num_ancilla_qubits)
+
+        ancilla_zero_prob = np.sum(np.abs(state_amplitudes) ** 2)
+        assert ancilla_zero_prob > 0.85, f"P(ancilla=0) = {ancilla_zero_prob:.4f} too low for Qualtran tensors"
+
+        state_amplitudes = state_amplitudes / np.sqrt(ancilla_zero_prob)
+        state_amplitudes = _reindex_sites(state_amplitudes, num_sites)
+
+        # Probability fidelity: measurement-based QROAM uncomputation
+        # introduces non-deterministic phase flips but preserves the correct
+        # probability distribution.
+        p = np.abs(state_amplitudes[: len(target_state)]) ** 2
+        q = np.abs(target_state) ** 2
+        fidelity = np.sum(np.sqrt(p * q)) ** 2
+        assert fidelity > 0.90, f"Qualtran tensor fidelity {fidelity:.4f} too low"
+
+
+class TestMPSSparseResourceEstimate:
+    """Test that resource estimates are consistent with Qualtran sparse mode."""
+
+    def test_resource_estimate_qubit_count(self):
+        """Verify Q# resource estimate qubit count is reasonable."""
+        mps = MPSWavefunction(_qualtran_mps_tensors)
+        algo = MPSSparseStatePreparation()
+        circuit = algo.run(mps)
+        result = circuit.estimate()
+        counts = result.logical_counts
+
+        # Sparse mode should use comparable qubits to Qualtran
+        assert counts["numQubits"] >= QUALTRAN_COST_SPARSE["num_qubits"]
+        assert counts["numQubits"] <= QUALTRAN_COST_SPARSE["num_qubits"] * 3
+
+    def test_resource_estimate_toffoli_count(self):
+        """Verify Q# Toffoli count is in a reasonable range."""
+        mps = MPSWavefunction(_qualtran_mps_tensors)
+        algo = MPSSparseStatePreparation()
+        circuit = algo.run(mps)
+        result = circuit.estimate()
+        counts = result.logical_counts
+
+        assert counts["cczCount"] > 0
+        # Allow up to 5x Qualtran (different decomposition choices)
+        assert counts["cczCount"] <= QUALTRAN_COST_SPARSE["toffoli"] * 5
+
+
+# =============================================================================
+# Helper functions
+# =============================================================================
+
+
+def _extract_state_amplitudes(
+    amplitudes: np.ndarray,
+    num_state_qubits: int,
+    num_ancilla_qubits: int,
+) -> np.ndarray:
+    """Extract amplitudes where ancilla = |0>.
+
+    DumpMachine qubit ordering: state[0]...state[N-1], ancilla[0]...
+    Ancilla qubits are the rightmost bits.
+    """
+    num_total_qubits = num_state_qubits + num_ancilla_qubits
+    dim = 2**num_total_qubits
+    ancilla_mask = (1 << num_ancilla_qubits) - 1
+    state_dim = 2**num_state_qubits
+    state_amplitudes = np.zeros(state_dim, dtype=complex)
+    for idx in range(dim):
+        if (idx & ancilla_mask) == 0:
+            state_idx = idx >> num_ancilla_qubits
+            state_amplitudes[state_idx] = amplitudes[idx]
+    return state_amplitudes
+
+
+def _extract_state_amplitudes_sparse(
+    dump,
+    num_state_qubits: int,
+    num_ancilla_qubits: int,
+) -> np.ndarray:
+    """Extract amplitudes where ancilla = |0> from a sparse DumpMachine result.
+
+    Works with large qubit counts where as_dense_state() would be infeasible.
+    Only considers basis states where all internal qubits (beyond state+ancilla)
+    are |0>, i.e., properly uncomputed.
+
+    Parameters
+    ----------
+    dump : StateDump
+        Sparse dump from qsharp.dump_machine().
+    num_state_qubits : int
+        Number of state qubits (lowest-addressed qubits).
+    num_ancilla_qubits : int
+        Number of ancilla qubits (next after state).
+
+    Returns
+    -------
+    np.ndarray
+        Amplitudes for the state register conditioned on ancilla = |0>.
+
+    """
+    num_relevant_qubits = num_state_qubits + num_ancilla_qubits
+    ancilla_mask = (1 << num_ancilla_qubits) - 1
+    state_dim = 2**num_state_qubits
+    state_amplitudes = np.zeros(state_dim, dtype=complex)
+
+    for idx in dump:
+        # Only consider states where internal qubits (above state+ancilla) are 0
+        if idx >> num_relevant_qubits != 0:
+            continue
+        # Only consider states where ancilla = |0>
+        if (idx & ancilla_mask) == 0:
+            state_idx = idx >> num_ancilla_qubits
+            if state_idx < state_dim:
+                state_amplitudes[state_idx] = dump[idx]
+
+    return state_amplitudes
+
+
+def _reindex_sites(state_amplitudes: np.ndarray, num_sites: int) -> np.ndarray:
+    """Reindex from Q# big-endian to Python MPS convention.
+
+    The Q# circuit uses little-endian within each 2-qubit site, so
+    DumpMachine's big-endian bits need to be reversed within each site.
+    """
+    site_bits = 2
+    state_dim = len(state_amplitudes)
+    reordered = np.zeros_like(state_amplitudes)
+    for dm_idx in range(state_dim):
+        py_idx = 0
+        for site in range(num_sites):
+            shift = (num_sites - 1 - site) * site_bits
+            site_val = (dm_idx >> shift) & ((1 << site_bits) - 1)
+            # Reverse bits within this site
+            rev_val = 0
+            for b in range(site_bits):
+                if site_val & (1 << b):
+                    rev_val |= 1 << (site_bits - 1 - b)
+            py_idx |= rev_val << shift
+        reordered[py_idx] = state_amplitudes[dm_idx]
+    return reordered
+
+
+def _float_to_qsharp(x: float) -> str:
+    """Format float for Q#."""
+    return f"{x:.15f}"
+
+
+def _build_mps_sparse_eval_code(params: dict) -> str:
+    """Build Q# eval code for MPSSparse from the params dict.
+
+    Assumes `state` and `ancilla` qubit registers are already allocated in scope.
+    """
+    initial_state_str = ", ".join(_float_to_qsharp(x) for x in params["initialStateVec"])
+    args = [
+        f"[{initial_state_str}]",
+        str(params["numSites"]),
+        str(params["rotationBits"]),
+        _nested_list_to_qsharp_3d_bool(params["siteColPermTargets"]),
+        _nested_list_to_qsharp_3d_bool(params["siteColInvPermTargets"]),
+        _nested_list_to_qsharp_3d_bool(params["siteRowPermTargets"]),
+        _nested_list_to_qsharp_3d_bool(params["siteRowInvPermTargets"]),
+        _nested_list_to_qsharp_3d(params["siteBlockLayerAngles"]),
+        _nested_list_to_qsharp_2d_bool(params["siteBlockLayerShifted"]),
+        _nested_list_to_qsharp_2d_bool(params["siteBlockPhases"]),
+        "state",
+        "ancilla",
+    ]
+    args_str = ",\n                ".join(args)
+    return f"MPSSparse.MPSSparse(\n                {args_str}\n    )"
+
+
+def _nested_list_to_qsharp_3d(data: list) -> str:
+    """Convert list[list[list[float]]] to Q# literal."""
+    site_strs = []
+    for site in data:
+        layer_strs = []
+        for layer in site:
+            angles = ", ".join(_float_to_qsharp(a) for a in layer)
+            layer_strs.append(f"[{angles}]")
+        site_strs.append(f"[{', '.join(layer_strs)}]")
+    return f"[{', '.join(site_strs)}]"
+
+
+def _nested_list_to_qsharp_3d_bool(data: list) -> str:
+    """Convert list[list[list[bool]]] to Q# literal."""
+    site_strs = []
+    for site in data:
+        row_strs = []
+        for row in site:
+            vals = ", ".join("true" if b else "false" for b in row)
+            row_strs.append(f"[{vals}]")
+        site_strs.append(f"[{', '.join(row_strs)}]")
+    return f"[{', '.join(site_strs)}]"
+
+
+def _nested_list_to_qsharp_2d_bool(data: list) -> str:
+    """Convert list[list[bool]] to Q# literal."""
+    site_strs = []
+    for site in data:
+        vals = ", ".join("true" if b else "false" for b in site)
+        site_strs.append(f"[{vals}]")
+    return f"[{', '.join(site_strs)}]"
diff --git a/python/tests/test_state_preparation.py b/python/tests/test_state_preparation.py
index 54a4478a2..3a3ba0be7 100644
--- a/python/tests/test_state_preparation.py
+++ b/python/tests/test_state_preparation.py
@@ -348,7 +348,12 @@ def size(self):
             return len(self.get_active_determinants())
 
     mock_wfn = MockWavefunction()
+    # Skip MPS algorithms — they require MPSWavefunction and raise TypeError
+    # before reaching the asymmetric active space validation.
+    mps_keys = {"mps_sequential", "mps_sparse"}
     for sp_key in available("state_prep"):
+        if sp_key in mps_keys:
+            continue
         prep = create("state_prep", sp_key)
         with pytest.raises(
             ValueError,