From feab3fdfe22c86b8160b74b18e1720b0e26c9640 Mon Sep 17 00:00:00 2001
From: Alban Puech <nabla.hceup@gmail.com>
Date: Fri, 16 Jan 2026 18:49:52 +0100
Subject: [PATCH 1/3] fix of seed + gen cost permutation + pv to pq

---
 README.md                                     |   2 +
 gridfm_datakit/generate.py                    | 124 ++++++++------
 gridfm_datakit/interactive.py                 |   1 +
 gridfm_datakit/network.py                     |  87 +++++++++-
 .../perturbations/generator_perturbation.py   |  56 +++++--
 .../perturbations/load_perturbation.py        |  44 ++---
 gridfm_datakit/process/process_network.py     | 110 +++++++++----
 gridfm_datakit/utils/random_seed.py           |  47 ++++++
 scripts/compare_parquet_files.py              |  25 +++
 .../config/Texas2k_case1_2016summerpeak.yaml  |   1 +
 scripts/config/default.yaml                   |   5 +-
 scripts/user_config.yaml                      |  39 +++++
 tests/config/consistency_test.yaml            |   1 +
 tests/config/default_opf_mode_test.yaml       |   1 +
 tests/config/default_pf_mode_test.yaml        |   1 +
 .../default_without_perturbation_test.yaml    |   1 +
 tests/test_generate.py                        | 118 ++++++++++++--
 tests/test_generator_perturbation.py          | 153 ++++++++++++++----
 tests/test_load_perturbation.py               |  16 +-
 tests/test_random_seed.py                     | 105 ++++++++++++
 tests/test_verify_network.py                  |  47 ++++++
 21 files changed, 823 insertions(+), 161 deletions(-)
 create mode 100644 gridfm_datakit/utils/random_seed.py
 create mode 100644 scripts/user_config.yaml
 create mode 100644 tests/test_random_seed.py
diff --git a/README.md b/README.md
index 61c3edb6b..ed919c5cd 100644
--- a/README.md
+++ b/README.md
@@ -187,6 +187,8 @@ settings:
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
+
 ```
 
 <br>
diff --git a/gridfm_datakit/generate.py b/gridfm_datakit/generate.py
index 157a23e3d..37c82a3be 100644
--- a/gridfm_datakit/generate.py
+++ b/gridfm_datakit/generate.py
@@ -36,11 +36,12 @@
 import sys
 from gridfm_datakit.network import Network
 from gridfm_datakit.process.process_network import init_julia
+from gridfm_datakit.utils.random_seed import custom_seed
 
 
 def _setup_environment(
     config: Union[str, Dict[str, Any], NestedNamespace],
-) -> Tuple[NestedNamespace, str, Dict[str, str]]:
+) -> Tuple[NestedNamespace, str, Dict[str, str], int]:
     """Setup the environment for data generation.
 
     Args:
@@ -50,7 +51,7 @@ def _setup_environment(
             3. NestedNamespace object (NestedNamespace)
 
     Returns:
-        Tuple of (args, base_path, file_paths)
+        Tuple of (args, base_path, file_paths, seed)
     """
     # Load config from file if a path is provided
     if isinstance(config, str):
@@ -63,6 +64,25 @@ def _setup_environment(
     else:
         args = config
 
+        # Set global seed if provided, otherwise generate a unique seed for this generation
+    if (
+        hasattr(args.settings, "seed")
+        and args.settings.seed is not None
+        and args.settings.seed != ""
+    ):
+        seed = args.settings.seed
+        print(f"Global random seed set to: {seed}")
+
+    else:
+        # Generate a unique seed for non-reproducible but independent scenarios
+        # This ensures scenarios are i.i.d. within a run, but different across runs
+        import secrets
+
+        seed = secrets.randbelow(50_000)
+        # chunk_seed = seed * 20000 + start_idx + 1 < 2^31 - 1
+        # seed < (2,147,483,647 - n_scenarios) / 20,000 ~= 100_000 so taking 50_000 to be safe
+        print(f"No seed provided. Using seed={seed}")
+
     # Setup output directory
     base_path = os.path.join(args.settings.data_dir, args.network.name, "raw")
     if os.path.exists(base_path) and args.settings.overwrite:
@@ -115,18 +135,20 @@ def _setup_environment(
             if log_file == file_paths["args_log"]:
                 yaml.safe_dump(args.to_dict(), f)
 
-    return args, base_path, file_paths
+    return args, base_path, file_paths, seed
 
 
 def _prepare_network_and_scenarios(
     args: NestedNamespace,
     file_paths: Dict[str, str],
+    seed: int,
 ) -> Tuple[Network, np.ndarray]:
     """Prepare the network and generate load scenarios.
 
     Args:
         args: Configuration object
         file_paths: Dictionary of file paths
+        seed: Global random seed for reproducibility.
 
     Returns:
         Tuple of (network, scenarios)
@@ -147,6 +169,7 @@ def _prepare_network_and_scenarios(
         args.load.scenarios,
         file_paths["scenarios_log"],
         max_iter=args.settings.max_iter,
+        seed=seed,
     )
     scenarios_df = load_scenarios_to_df(scenarios)
     scenarios_df.to_parquet(file_paths["scenarios"], index=False, engine="pyarrow")
@@ -230,10 +253,10 @@ def generate_power_flow_data(
     """
 
     # Setup environment
-    args, base_path, file_paths = _setup_environment(config)
+    args, base_path, file_paths, seed = _setup_environment(config)
 
     # Prepare network and scenarios
-    net, scenarios = _prepare_network_and_scenarios(args, file_paths)
+    net, scenarios = _prepare_network_and_scenarios(args, file_paths, seed)
 
     # Initialize topology generator
     topology_generator = initialize_topology_generator(args.topology_perturbation, net)
@@ -254,48 +277,50 @@ def generate_power_flow_data(
 
     processed_data = []
 
-    # Process scenarios sequentially
-    with open(file_paths["tqdm_log"], "a") as f:
-        with tqdm(
-            total=args.load.scenarios,
-            desc="Processing scenarios",
-            file=Tee(sys.stdout, f),
-            miniters=5,
-        ) as pbar:
-            for scenario_index in range(args.load.scenarios):
-                # Process the scenario
-                if args.settings.mode == "opf":
-                    processed_data = process_scenario_opf_mode(
-                        net,
-                        scenarios,
-                        scenario_index,
-                        topology_generator,
-                        generation_generator,
-                        admittance_generator,
-                        processed_data,
-                        file_paths["error_log"],
-                        args.settings.include_dc_res,
-                        jl,
-                    )
-                elif args.settings.mode == "pf":
-                    processed_data = process_scenario_pf_mode(
-                        net,
-                        scenarios,
-                        scenario_index,
-                        topology_generator,
-                        generation_generator,
-                        admittance_generator,
-                        processed_data,
-                        file_paths["error_log"],
-                        args.settings.include_dc_res,
-                        args.settings.pf_fast,
-                        args.settings.dcpf_fast,
-                        jl,
-                    )
-                else:
-                    raise ValueError("Invalid mode!")
-
-                pbar.update(1)
+    # Process scenarios sequentially with deterministic seed
+    # Use custom_seed to control randomness for reproducibility
+    with custom_seed(seed + 1):
+        with open(file_paths["tqdm_log"], "a") as f:
+            with tqdm(
+                total=args.load.scenarios,
+                desc="Processing scenarios",
+                file=Tee(sys.stdout, f),
+                miniters=5,
+            ) as pbar:
+                for scenario_index in range(args.load.scenarios):
+                    # Process the scenario
+                    if args.settings.mode == "opf":
+                        processed_data = process_scenario_opf_mode(
+                            net,
+                            scenarios,
+                            scenario_index,
+                            topology_generator,
+                            generation_generator,
+                            admittance_generator,
+                            processed_data,
+                            file_paths["error_log"],
+                            args.settings.include_dc_res,
+                            jl,
+                        )
+                    elif args.settings.mode == "pf":
+                        processed_data = process_scenario_pf_mode(
+                            net,
+                            scenarios,
+                            scenario_index,
+                            topology_generator,
+                            generation_generator,
+                            admittance_generator,
+                            processed_data,
+                            file_paths["error_log"],
+                            args.settings.include_dc_res,
+                            args.settings.pf_fast,
+                            args.settings.dcpf_fast,
+                            jl,
+                        )
+                    else:
+                        raise ValueError("Invalid mode!")
+
+                    pbar.update(1)
 
     # Save final data
     _save_generated_data(
@@ -339,14 +364,14 @@ def generate_power_flow_data_distributed(
         - scenarios_{generator}.log: Load scenario generation notes
     """
     # Setup environment
-    args, base_path, file_paths = _setup_environment(config)
+    args, base_path, file_paths, seed = _setup_environment(config)
 
     # check if mode is valid
     if args.settings.mode not in ["opf", "pf"]:
         raise ValueError("Invalid mode!")
 
     # Prepare network and scenarios
-    net, scenarios = _prepare_network_and_scenarios(args, file_paths)
+    net, scenarios = _prepare_network_and_scenarios(args, file_paths, seed)
 
     # Initialize topology generator
     topology_generator = initialize_topology_generator(args.topology_perturbation, net)
@@ -405,6 +430,7 @@ def generate_power_flow_data_distributed(
                         args.settings.dcpf_fast,
                         file_paths["solver_log_dir"],
                         args.settings.max_iter,
+                        seed,
                     )
                     for chunk in scenario_chunks
                 ]
diff --git a/gridfm_datakit/interactive.py b/gridfm_datakit/interactive.py
index 17e9f9db0..96d21b996 100644
--- a/gridfm_datakit/interactive.py
+++ b/gridfm_datakit/interactive.py
@@ -97,6 +97,7 @@ def create_config() -> Dict[str, Any]:
             "dcpf_fast": dcpf_fast.value,
             "enable_solver_logs": enable_solver_logs.value,
             "max_iter": max_iter.value,
+            "seed": None,  # seed is not used in the interactive interface
         },
     }
     return config
diff --git a/gridfm_datakit/network.py b/gridfm_datakit/network.py
index 2881aaaea..b2220e1c5 100644
--- a/gridfm_datakit/network.py
+++ b/gridfm_datakit/network.py
@@ -22,6 +22,8 @@
     VM,
     VA,
     REF,
+    PV,
+    PQ,
 )
 from gridfm_datakit.utils.idx_gen import GEN_BUS, GEN_STATUS, PG, QG
 from gridfm_datakit.utils.idx_brch import (
@@ -36,12 +38,71 @@
     BR_R_ASYM,
     BR_X_ASYM,
 )
-from gridfm_datakit.utils.idx_cost import NCOST
+from gridfm_datakit.utils.idx_cost import NCOST, MODEL, POLYNOMIAL
 import warnings
 import networkx as nx
 import numpy as np
 import copy
 from typing import Dict, Tuple, Any
+import tempfile
+from juliapkg.state import STATE
+from juliapkg.deps import run_julia, executable
+
+
+def correct_network(network_path: str, force: bool = False) -> str:
+    """
+    Load a MATPOWER network using PowerModels via run_julia
+    and save a corrected version.
+
+    Args:
+        network_path: Path to the original MATPOWER .m file.
+        force: If True, regenerate the corrected file even if it exists.
+
+    Returns:
+        Path to the corrected network file.
+
+    Raises:
+        FileNotFoundError: If input file does not exist.
+        RuntimeError: If PowerModels fails.
+    """
+    if not os.path.exists(network_path):
+        raise FileNotFoundError(f"Network file not found: {network_path}")
+
+    base_path, ext = os.path.splitext(network_path)
+    corrected_path = f"{base_path}_corrected{ext}"
+
+    if os.path.exists(corrected_path) and not force:
+        return corrected_path
+
+    # Use temporary file for atomic replace
+    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".m")
+    os.close(tmp_fd)
+
+    try:
+        project = STATE["project"]
+        jl_exe = executable()
+
+        # Julia script as a list of lines
+        julia_code = [
+            "using PowerModels",
+            f'data = PowerModels.parse_file("{network_path}")',
+            f'PowerModels.export_matpower("{tmp_path}", data)',
+        ]
+
+        # Run Julia
+        run_julia(julia_code, project=project, executable=jl_exe)
+
+        # Sanity check
+        if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
+            raise RuntimeError("Julia produced empty MATPOWER file")
+
+        # Atomically replace target file
+        os.replace(tmp_path, corrected_path)
+        return corrected_path
+
+    finally:
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
 
 
 def numpy_to_matlab_matrix(array: np.ndarray, name: str) -> str:
@@ -132,6 +193,11 @@ def __init__(self, mpc: Dict[str, Any]) -> None:
         assert np.all(np.isin(self.gens[:, GEN_BUS], self.buses[:, BUS_I])), (
             "All generator buses should be in bus IDs"
         )
+
+        assert np.all(self.gencosts[:, MODEL] == POLYNOMIAL), (
+            "MODEL should be POLYNOMIAL"
+        )
+
         # assert all generators have the same number of cost coefficients
         assert np.all(self.gencosts[:, NCOST] == self.gencosts[:, NCOST][0]), (
             "All generators must have the same number of cost coefficients"
@@ -345,6 +411,21 @@ def deactivate_gens(self, idx_gens: np.ndarray) -> None:
             )
         self.gens[idx_gens, GEN_STATUS] = 0
 
+        # -----------------------------
+        # Update PV buses that lost all generators → PQ
+        # -----------------------------
+        n_buses = self.buses.shape[0]
+
+        # Count in-service generators per bus
+        gens_on = self.gens[self.idx_gens_in_service]
+        gen_count = np.bincount(gens_on[:, GEN_BUS].astype(int), minlength=n_buses)
+
+        # Boolean mask: PV buses with no in-service generator
+        pv_no_gen = (self.buses[:, BUS_TYPE] == PV) & (gen_count == 0)
+
+        # Set them to PQ
+        self.buses[pv_no_gen, BUS_TYPE] = PQ
+
     def check_single_connected_component(self) -> bool:
         """
         Check that the network forms a single connected component.
@@ -541,6 +622,7 @@ def load_net_from_file(network_path: str) -> Network:
         ValueError: If the file format is invalid.
     """
     # Load network using matpowercaseframes
+    network_path = correct_network(network_path)
     mpc_frames = CaseFrames(network_path)
     mpc = {
         key: mpc_frames.__getattribute__(key)
@@ -569,6 +651,7 @@ def load_net_from_pglib(grid_name: str) -> Network:
         FileNotFoundError: If the file cannot be found after download.
         ValueError: If the file format is invalid.
     """
+
     # Construct file paths
     file_path = str(
         resources.files("gridfm_datakit.grids").joinpath(f"pglib_opf_{grid_name}.m"),
@@ -586,6 +669,8 @@ def load_net_from_pglib(grid_name: str) -> Network:
         with open(file_path, "wb") as f:
             f.write(response.content)
 
+    file_path = correct_network(file_path)
+
     # Load network using matpowercaseframes
     mpc_frames = CaseFrames(file_path)
     mpc = {
diff --git a/gridfm_datakit/perturbations/generator_perturbation.py b/gridfm_datakit/perturbations/generator_perturbation.py
index 5492bff66..b1a079e63 100644
--- a/gridfm_datakit/perturbations/generator_perturbation.py
+++ b/gridfm_datakit/perturbations/generator_perturbation.py
@@ -58,8 +58,10 @@ class PermuteGenCostGenerator(GenerationGenerator):
     """Class for permuting generator costs.
 
     This class is for generating different generation scenarios
-    by permuting all the coeffiecient costs between and among
-    generators of power grid networks.
+    by permuting cost coefficients between and among generators
+    of power grid networks. Only generators with non-zero linear (c1)
+    or quadratic (c2) cost coefficients are permuted, preserving
+    zero-cost and constant-only generators.
     """
 
     def __init__(self, base_net: Network) -> None:
@@ -75,6 +77,16 @@ def __init__(self, base_net: Network) -> None:
             "All generators must have the same number of cost coefficients"
         )
 
+        # Identify generators to permute (skip zero-cost and constant-only cost)
+        costs = base_net.gencosts[
+            :,
+            COST:,
+        ]  # all cost coefficients [c2, c1, c0] for NCOST=3
+        # Keep only generators with non-constant terms (c1, c2, ...) non-zero
+        # costs[:, :-1] excludes the last coefficient (c0, the constant term)
+        self.permutable_mask = np.any(costs[:, :-1] != 0, axis=1)
+        self.permutable_indices = np.where(self.permutable_mask)[0]
+
     def generate(
         self,
         example_generator: Generator[Network, None, None],
@@ -88,12 +100,16 @@ def generate(
 
         Yields:
             An example scenario with cost coeffiecients in the
-            poly_cost table permuted
+            poly_cost table permuted (only for dispatchable generators)
         """
         for scenario in example_generator:
-            new_idx = np.random.permutation(self.num_gens)
+            # Only permute the selected generators
+            new_idx = np.random.permutation(self.permutable_indices)
             # Permute the rows (generators) of the cost coefficients (and NCOST, although we assume it is the same for all generators)
-            scenario.gencosts[:, NCOST:] = scenario.gencosts[:, NCOST:][new_idx]
+            scenario.gencosts[self.permutable_indices, NCOST:] = scenario.gencosts[
+                new_idx,
+                NCOST:,
+            ]
             yield scenario
 
 
@@ -101,9 +117,11 @@ class PerturbGenCostGenerator(GenerationGenerator):
     """Class for perturbing generator cost.
 
     This class is for generating different generation scenarios
-    by randomly perturbing all the cost coeffiecient of generators
-    in a power network by multiplying with a scaling factor sampled
-    from a uniform distribution.
+    by randomly perturbing cost coefficients of generators in a
+    power network by multiplying with a scaling factor sampled
+    from a uniform distribution. Only generators with non-zero
+    linear (c1) or quadratic (c2) cost coefficients are perturbed,
+    preserving zero-cost and constant-only generators.
     """
 
     def __init__(self, base_net: Network, sigma: float) -> None:
@@ -126,7 +144,19 @@ def __init__(self, base_net: Network, sigma: float) -> None:
         n_costs = int(base_net.gencosts[:, NCOST][0])
         self.lower = np.max([0.0, 1.0 - sigma])
         self.upper = 1.0 + sigma
-        self.sample_size = [self.num_gens, n_costs]
+
+        # Mask generators to perturb (skip zero-cost or constant-only)
+        costs = base_net.gencosts[
+            :,
+            COST:,
+        ]  # all cost coefficients [c2, c1, c0] for NCOST=3
+        # Keep only generators with non-constant terms (c1, c2, ...) non-zero
+        # costs[:, :-1] excludes the last coefficient (c0, the constant term)
+        self.perturb_mask = np.any(costs[:, :-1] != 0, axis=1)
+        self.n_perturbable = np.sum(self.perturb_mask)
+
+        # Sample size for only the perturbable generators
+        self.sample_size = [self.n_perturbable, n_costs]
 
     def generate(
         self,
@@ -140,13 +170,17 @@ def generate(
 
         Yields:
             An example scenario with cost coeffiecients in the poly_cost
-            table perturbed by multiplying with a scaling factor.
+            table perturbed by multiplying with a scaling factor (only for dispatchable generators).
         """
         for example in example_generator:
+            # Generate scaling factors only for generators that can be perturbed
             scale_fact = np.random.uniform(
                 low=self.lower,
                 high=self.upper,
                 size=self.sample_size,
             )
-            example.gencosts[:, COST:] = example.gencosts[:, COST:] * scale_fact
+            # Apply scaling only to selected generators
+            example.gencosts[self.perturb_mask, COST:] = (
+                example.gencosts[self.perturb_mask, COST:] * scale_fact
+            )
             yield example
diff --git a/gridfm_datakit/perturbations/load_perturbation.py b/gridfm_datakit/perturbations/load_perturbation.py
index b25a73c3c..0d2b2e335 100644
--- a/gridfm_datakit/perturbations/load_perturbation.py
+++ b/gridfm_datakit/perturbations/load_perturbation.py
@@ -9,6 +9,7 @@
 from multiprocessing import Pool
 from gridfm_datakit.network import Network
 from gridfm_datakit.process.process_network import init_julia
+from gridfm_datakit.utils.random_seed import custom_seed
 from typing import Tuple, Any
 
 
@@ -172,6 +173,7 @@ def __call__(
         n_scenarios: int,
         scenario_log: str,
         max_iter: int,
+        seed: int,
     ) -> np.ndarray:
         """Generates load scenarios for a power network.
 
@@ -180,7 +182,7 @@ def __call__(
             n_scenarios: Number of scenarios to generate.
             scenario_log: Path to log file for scenario generation details.
             max_iter: Maximum iterations for the OPF solver.
-
+            seed: Global random seed for reproducibility.
         Returns:
             numpy.ndarray: Array of shape (n_loads, n_scenarios, 2) containing p_mw and q_mvar values.
         """
@@ -352,6 +354,7 @@ def __call__(
         n_scenarios: int,
         scenarios_log: str,
         max_iter: int,
+        seed: int,
     ) -> np.ndarray:
         """Generates load profiles based on aggregated load data.
 
@@ -360,7 +363,7 @@ def __call__(
             n_scenarios: Number of scenarios to generate.
             scenarios_log: Path to log file for scenario generation details.
             max_iter: Maximum iterations for the OPF solver.
-
+            seed: Global random seed for reproducibility.
         Returns:
             numpy.ndarray: Array of shape (n_loads, n_scenarios, 2) containing p_mw and q_mvar values.
 
@@ -430,25 +433,29 @@ def __call__(
             )
             ref_curve = self.interpolate_row(ref_curve, data_points=n_scenarios)
 
-        load_profile_pmw = p_mw_array[:, np.newaxis] * ref_curve
-        noise = np.random.uniform(
-            1 - self.sigma,
-            1 + self.sigma,
-            size=load_profile_pmw.shape,
-        )  # Add uniform noise
-        load_profile_pmw *= noise
-
-        if self.change_reactive_power:
-            load_profile_qmvar = q_mvar_array[:, np.newaxis] * ref_curve
+        # Use custom_seed context manager to temporarily set seed for noise generation
+        with custom_seed(seed):
+            load_profile_pmw = p_mw_array[:, np.newaxis] * ref_curve
             noise = np.random.uniform(
                 1 - self.sigma,
                 1 + self.sigma,
-                size=load_profile_qmvar.shape,
+                size=load_profile_pmw.shape,
             )  # Add uniform noise
-            load_profile_qmvar *= noise
-        else:
-            load_profile_qmvar = q_mvar_array[:, np.newaxis] * np.ones_like(ref_curve)
-            print("No change in reactive power across scenarios")
+            load_profile_pmw *= noise
+
+            if self.change_reactive_power:
+                load_profile_qmvar = q_mvar_array[:, np.newaxis] * ref_curve
+                noise = np.random.uniform(
+                    1 - self.sigma,
+                    1 + self.sigma,
+                    size=load_profile_qmvar.shape,
+                )  # Add uniform noise
+                load_profile_qmvar *= noise
+            else:
+                load_profile_qmvar = q_mvar_array[:, np.newaxis] * np.ones_like(
+                    ref_curve,
+                )
+                print("No change in reactive power across scenarios")
 
         # Stack profiles along the last dimension
         load_profiles = np.stack((load_profile_pmw, load_profile_qmvar), axis=-1)
@@ -508,6 +515,7 @@ def __call__(
         n_scenarios: int,
         scenario_log: str,
         max_iter: int,
+        seed: int,
     ) -> np.ndarray:
         """Generates load profiles based on aggregated load data.
 
@@ -516,7 +524,7 @@ def __call__(
             n_scenarios: Number of scenarios to generate.
             scenario_log: Path to log file for scenario generation details.
             max_iter: Maximum iterations for the OPF solver (unused for Powergraph).
-
+            seed: Global random seed for reproducibility.
         Returns:
             numpy.ndarray: Array of shape (n_loads, n_scenarios, 2) containing p_mw and q_mvar values.
         """
diff --git a/gridfm_datakit/process/process_network.py b/gridfm_datakit/process/process_network.py
index 1dcd6f0c6..3b52dc028 100644
--- a/gridfm_datakit/process/process_network.py
+++ b/gridfm_datakit/process/process_network.py
@@ -23,6 +23,7 @@
 from gridfm_datakit.network import makeYbus, branch_vectors
 import copy
 from gridfm_datakit.process.solvers import run_opf, run_pf, run_dcpf, run_dcopf
+from gridfm_datakit.utils.random_seed import custom_seed
 from gridfm_datakit.utils.idx_bus import (
     GS,
     BS,
@@ -694,6 +695,7 @@ def pf_post_processing(
     assert np.all(np.isin(net.buses[:, BUS_TYPE], [PQ, PV, REF])), (
         "Bus type should be PQ, PV, or REF, no disconnected buses (4)"
     )
+
     X_bus[np.arange(n_buses), 8 + net.buses[:, BUS_TYPE].astype(int) - 1] = (
         1  # because type is 1, 2, 3, not 0, 1, 2
     )
@@ -725,7 +727,11 @@ def pf_post_processing(
             X_bus[:, 17] = np.nan
 
     # --- Generator data ---
-    assert np.all(net.gencosts[:, NCOST] == 3), "NCOST should be 3"
+
+    n_cost = net.gencosts[0, NCOST]
+    assert np.all(net.gencosts[:, NCOST] == n_cost), (
+        "NCOST should be the same for all generators"
+    )
     n_gens = net.gens.shape[0]
     n_cols = (
         len(GEN_COLUMNS) + len(DC_GEN_COLUMNS) if include_dc_res else len(GEN_COLUMNS)
@@ -741,9 +747,22 @@ def pf_post_processing(
     X_gen[:, 6] = net.gens[:, PMAX]
     X_gen[:, 7] = net.gens[:, QMIN]
     X_gen[:, 8] = net.gens[:, QMAX]
-    X_gen[:, 9] = net.gencosts[:, COST + 2]
-    X_gen[:, 10] = net.gencosts[:, COST + 1]
-    X_gen[:, 11] = net.gencosts[:, COST]
+
+    if n_cost == 3:  # order in .m file is c2, c1, c0
+        X_gen[:, 9] = net.gencosts[:, COST + 2]
+        X_gen[:, 10] = net.gencosts[:, COST + 1]
+        X_gen[:, 11] = net.gencosts[:, COST]
+
+    if n_cost == 2:  # order in .m file is c1, c0, and there is no cp2 cost
+        X_gen[:, 9] = net.gencosts[:, COST + 1]
+        X_gen[:, 10] = net.gencosts[:, COST]
+        X_gen[:, 11] = 0  # no cp2 cost for linear cost function
+
+    if n_cost == 1:  # order in .m file is c0, and there is no cp1 or cp2 cost
+        X_gen[:, 9] = net.gencosts[:, COST]
+        X_gen[:, 10] = 0  # no cp1 cost for constant cost function
+        X_gen[:, 11] = 0  # no cp2 cost for constant cost function
+
     X_gen[net.idx_gens_in_service, 12] = 1
 
     # slack gen (can be any generator connected to the ref node)
@@ -833,6 +852,9 @@ def process_scenario_pf_mode(
 
     Returns:
         Updated list of processed data (bus, gen, branch, Y_bus arrays)
+
+    Note:
+        Random seed is controlled by the calling context (process_scenario_chunk).
     """
     net = copy.deepcopy(net)
 
@@ -923,6 +945,7 @@ def process_scenario_chunk(
     dcpf_fast: bool,
     solver_log_dir: str,
     max_iter: int,
+    seed: int,
 ) -> Tuple[
     Union[None, Exception],
     Union[None, str],
@@ -945,6 +968,11 @@ def process_scenario_chunk(
         admittance_generator: Generator for line admittance perturbations.
         error_log_path: Path to error log file for recording failures.
         include_dc_res: Whether to include DC power flow results in output.
+        pf_fast: Whether to use fast AC PF solver.
+        dcpf_fast: Whether to use fast DC PF solver.
+        solver_log_dir: Directory for solver logs.
+        max_iter: Maximum iterations for the solver.
+        seed: Global random seed for reproducibility.
 
     Returns:
         Tuple containing:
@@ -956,37 +984,47 @@ def process_scenario_chunk(
     try:
         jl = init_julia(max_iter, solver_log_dir)
         local_processed_data = []
-        for scenario_index in range(start_idx, end_idx):
-            if mode == "opf":
-                local_processed_data = process_scenario_opf_mode(
-                    net,
-                    scenarios,
-                    scenario_index,
-                    topology_generator,
-                    generation_generator,
-                    admittance_generator,
-                    local_processed_data,
-                    error_log_path,
-                    include_dc_res,
-                    jl,
-                )
-            elif mode == "pf":
-                local_processed_data = process_scenario_pf_mode(
-                    net,
-                    scenarios,
-                    scenario_index,
-                    topology_generator,
-                    generation_generator,
-                    admittance_generator,
-                    local_processed_data,
-                    error_log_path,
-                    include_dc_res,
-                    pf_fast,
-                    dcpf_fast,
-                    jl,
-                )
 
-            progress_queue.put(1)  # update queue
+        # Use custom_seed to set seed based on start_idx for this chunk
+        # This ensures each chunk gets a unique but deterministic seed
+        # we multiply by 20_000 to ensure there is no collision with other runs where the seed would be close to each other
+        # example (assuming we have chunks of length 1, hence an increment of 1 between start indices)
+        # Run A: base seed = 42 → scenario seeds = 42, 43, 44, …, 10041 (for 10,000 scenarios)
+        # Run B: base seed = 120 → scenario seeds = 120, 121, 122, …, 10119
+        # These sets overlap on seeds 120..10041 (so 9,922 overlapping seeds).
+        # we also add 1 in case the seed is 0, to not have collision witht he seed used for the load perturbations
+        with custom_seed(seed * 20_000 + start_idx + 1):
+            for scenario_index in range(start_idx, end_idx):
+                if mode == "opf":
+                    local_processed_data = process_scenario_opf_mode(
+                        net,
+                        scenarios,
+                        scenario_index,
+                        topology_generator,
+                        generation_generator,
+                        admittance_generator,
+                        local_processed_data,
+                        error_log_path,
+                        include_dc_res,
+                        jl,
+                    )
+                elif mode == "pf":
+                    local_processed_data = process_scenario_pf_mode(
+                        net,
+                        scenarios,
+                        scenario_index,
+                        topology_generator,
+                        generation_generator,
+                        admittance_generator,
+                        local_processed_data,
+                        error_log_path,
+                        include_dc_res,
+                        pf_fast,
+                        dcpf_fast,
+                        jl,
+                    )
+
+                progress_queue.put(1)  # update queue
 
         return (
             None,
@@ -1031,9 +1069,13 @@ def process_scenario_opf_mode(
         local_processed_data: List to accumulate processed data tuples.
         error_log_file: Path to error log file for recording failures.
         include_dc_res: Whether to include DC power flow results in output.
+        jl: Julia interface object for running power flow calculations.
 
     Returns:
         Updated list of processed data (bus, gen, branch, Y_bus arrays)
+
+    Note:
+        Random seed is controlled by the calling context (process_scenario_chunk).
     """
 
     # apply the load scenario to the network
diff --git a/gridfm_datakit/utils/random_seed.py b/gridfm_datakit/utils/random_seed.py
new file mode 100644
index 000000000..84b30977a
--- /dev/null
+++ b/gridfm_datakit/utils/random_seed.py
@@ -0,0 +1,47 @@
+"""Context manager for temporary random seed management."""
+
+import numpy as np
+from typing import Optional
+
+
+class custom_seed:
+    """Context manager to temporarily set a custom random seed.
+
+    This context manager saves the current numpy random state, sets a new seed,
+    and restores the previous state upon exit. This is useful for ensuring
+    reproducibility in specific code blocks while maintaining the overall
+    random state flow.
+
+    Example:
+        >>> import numpy as np
+        >>> np.random.seed(42)
+        >>> print(np.random.rand())  # Will use seed 42
+        >>> with custom_seed(100):
+        ...     print(np.random.rand())  # Will use seed 100
+        >>> print(np.random.rand())  # Will continue from seed 42's sequence
+
+    Args:
+        seed: The seed value to use within the context. If None, no seed is set.
+    """
+
+    def __init__(self, seed: Optional[int] = None):
+        """Initialize the context manager with a custom seed.
+
+        Args:
+            seed: The seed value to use. If None, state is saved but no new seed is set.
+        """
+        self.seed = seed
+        self.saved_state = None
+
+    def __enter__(self):
+        """Save current random state and set the custom seed."""
+        self.saved_state = np.random.get_state()
+        if self.seed is not None:
+            np.random.seed(self.seed)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Restore the previously saved random state."""
+        if self.saved_state is not None:
+            np.random.set_state(self.saved_state)
+        return False  # Don't suppress exceptions
diff --git a/scripts/compare_parquet_files.py b/scripts/compare_parquet_files.py
index 7d87d9467..821ddf0e0 100644
--- a/scripts/compare_parquet_files.py
+++ b/scripts/compare_parquet_files.py
@@ -55,6 +55,8 @@
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
+
 
 
 """
@@ -178,6 +180,29 @@ def compare_parquet_files(dir1: str, dir2: str, verbose: bool = False) -> None:
             df1 = pd.read_parquet(files_dir1[filename])
             df2 = pd.read_parquet(files_dir2[filename])
 
+            # sort by load_scenario_idx if exists
+            if "load_scenario_idx" in df1.columns:
+                print(f"Sorting {filename} by load_scenario_idx")
+                df1 = df1.sort_values(
+                    by="load_scenario_idx",
+                    kind="stable",
+                ).reset_index(drop=True)
+                df2 = df2.sort_values(
+                    by="load_scenario_idx",
+                    kind="stable",
+                ).reset_index(drop=True)
+
+            else:
+                print(
+                    f"No load_scenario_idx column found in {filename}, using sort by scenario",
+                )
+                df1 = df1.sort_values(by="scenario", kind="stable").reset_index(
+                    drop=True,
+                )
+                df2 = df2.sort_values(by="scenario", kind="stable").reset_index(
+                    drop=True,
+                )
+
             cols1 = set(df1.columns)
             cols2 = set(df2.columns)
 
diff --git a/scripts/config/Texas2k_case1_2016summerpeak.yaml b/scripts/config/Texas2k_case1_2016summerpeak.yaml
index a0cfef664..308f07588 100644
--- a/scripts/config/Texas2k_case1_2016summerpeak.yaml
+++ b/scripts/config/Texas2k_case1_2016summerpeak.yaml
@@ -45,3 +45,4 @@ settings:
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
diff --git a/scripts/config/default.yaml b/scripts/config/default.yaml
index 17dbaa680..37e8d56c0 100644
--- a/scripts/config/default.yaml
+++ b/scripts/config/default.yaml
@@ -7,7 +7,7 @@ network:
 load:
   generator: "agg_load_profile" # Name of the load generator; options: agg_load_profile, powergraph
   agg_profile: "default" # Name of the aggregated load profile
-  scenarios: 10000 # Number of different load scenarios to generate
+  scenarios: 100 # Number of different load scenarios to generate
   # WARNING: the following parameters are only used if generator is "agg_load_profile"
   # if using generator "powergraph", these parameters are ignored
   sigma: 0.2 # max local noise
@@ -36,7 +36,7 @@ admittance_perturbation:
 
 settings:
   num_processes: 16 # Number of parallel processes to use
-  data_dir: "./data_out" # Directory to save generated data relative to the project root
+  data_dir: "./data_out_seed_none2" # Directory to save generated data relative to the project root
   large_chunk_size: 1000 # Number of load scenarios processed before saving
   overwrite: true # If true, overwrites existing files, if false, appends to files
   mode: "pf" # Mode of the script; options: pf, opf. pf: power flow data where one or more operating limits – the inequality constraints defined in OPF, e.g., voltage magnitude or branch limits – may be violated. opf:  generates datapoints for training OPF solvers, with cost-optimal dispatches that satisfy all operating limits (OPF-feasible)
@@ -45,3 +45,4 @@ settings:
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
diff --git a/scripts/user_config.yaml b/scripts/user_config.yaml
new file mode 100644
index 000000000..037bc6100
--- /dev/null
+++ b/scripts/user_config.yaml
@@ -0,0 +1,39 @@
+admittance_perturbation:
+  sigma: 0.2
+  type: random_perturbation
+generation_perturbation:
+  sigma: 1.0
+  type: cost_permutation
+load:
+  agg_profile: default
+  change_reactive_power: true
+  generator: agg_load_profile
+  global_range: 0.4
+  max_scaling_factor: 4.0
+  scenarios: 10000
+  sigma: 0.2
+  start_scaling_factor: 1.0
+  step_size: 0.1
+network:
+  name: case24_ieee_rts
+  network_dir: none
+  source: pglib
+settings:
+  data_dir: ./data_out
+  dcpf_fast: true
+  enable_solver_logs: true
+  include_dc_res: true
+  large_chunk_size: 1000
+  max_iter: 200
+  mode: pf
+  num_processes: 16
+  overwrite: true
+  pf_fast: true
+  seed: null
+topology_perturbation:
+  elements:
+  - branch
+  - gen
+  k: 1
+  n_topology_variants: 20
+  type: random
diff --git a/tests/config/consistency_test.yaml b/tests/config/consistency_test.yaml
index 351edaf28..95d9f3be8 100644
--- a/tests/config/consistency_test.yaml
+++ b/tests/config/consistency_test.yaml
@@ -45,3 +45,4 @@ settings:
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
diff --git a/tests/config/default_opf_mode_test.yaml b/tests/config/default_opf_mode_test.yaml
index 87318b41f..75fe384bf 100644
--- a/tests/config/default_opf_mode_test.yaml
+++ b/tests/config/default_opf_mode_test.yaml
@@ -45,3 +45,4 @@ settings:
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
diff --git a/tests/config/default_pf_mode_test.yaml b/tests/config/default_pf_mode_test.yaml
index c84864f45..dfc057784 100644
--- a/tests/config/default_pf_mode_test.yaml
+++ b/tests/config/default_pf_mode_test.yaml
@@ -45,3 +45,4 @@ settings:
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
diff --git a/tests/config/default_without_perturbation_test.yaml b/tests/config/default_without_perturbation_test.yaml
index a3bee11c3..989f1fcbe 100644
--- a/tests/config/default_without_perturbation_test.yaml
+++ b/tests/config/default_without_perturbation_test.yaml
@@ -45,3 +45,4 @@ settings:
   pf_fast: true # Whether to use fast PF solver by default (compute_ac_pf from powermodels.jl); if false, uses Ipopt-based PF. Some networks (typically large ones e.g. case10000_goc) do not work with pf_fast: true. pf_fast is faster and more accurate than the Ipopt-based PF.
   dcpf_fast: true # Whether to use fast DCPF solver by default (compute_dc_pf from PowerModels.jl)
   max_iter: 200 # Max iterations for Ipopt-based solvers
+  seed: null # Seed for random number generation. If null, a random seed is generated (RECOMMENDED). To get the same data across runs, set the seed and note that ALL OTHER PARAMETERS IN THE CONFIG FILE MUST BE THE SAME.
diff --git a/tests/test_generate.py b/tests/test_generate.py
index c2a44d884..4a1bebcb5 100644
--- a/tests/test_generate.py
+++ b/tests/test_generate.py
@@ -45,7 +45,7 @@ def test_setup_environment(conf):
     """
     Tests if environment setup works correctly
     """
-    args, base_path, file_paths = _setup_environment(conf)
+    args, base_path, file_paths, seed = _setup_environment(conf)
     assert isinstance(file_paths, dict), "File paths should be a dictionary"
     assert "y_bus_data" in file_paths, (
         "Y-bus data file path should be in the dictionary"
@@ -66,7 +66,7 @@ def test_fail_setup_environment():
     """
     # Test with a non-existent configuration file
     with pytest.raises(FileNotFoundError):
-        args, base_path, file_paths = _setup_environment(
+        args, base_path, file_paths, seed = _setup_environment(
             "scripts/config/non_existent_config.yaml",
         )
 
@@ -77,8 +77,8 @@ def test_prepare_network_and_scenarios(conf):
     Tests if network and scenarios are prepared correctly
     """
     # Ensure the configuration is valid
-    args, base_path, file_paths = _setup_environment(conf)
-    net, scenarios = _prepare_network_and_scenarios(args, file_paths)
+    args, base_path, file_paths, seed = _setup_environment(conf)
+    net, scenarios = _prepare_network_and_scenarios(args, file_paths, seed)
 
     assert isinstance(net, Network), "Network should be a Network object"
     assert len(scenarios) > 0, "There should be at least one scenario"
@@ -91,18 +91,18 @@ def test_fail_prepare_network_and_scenarios():
     # Test with a non-existent configuration file
     config = "scripts/config/non_existent_config.yaml"
     with pytest.raises(FileNotFoundError):
-        args, base_path, file_paths = _setup_environment(config)
-        net, scenarios = _prepare_network_and_scenarios(args, file_paths)
+        args, base_path, file_paths, seed = _setup_environment(config)
+        net, scenarios = _prepare_network_and_scenarios(args, file_paths, seed)
 
 
 def test_fail_prepare_network_and_scenarios_config(conf):
     """
     Tests if preparing network and scenarios fails with an invalid grid source in the configuration file
     """
-    args, base_path, file_paths = _setup_environment(conf)
+    args, base_path, file_paths, seed = _setup_environment(conf)
     conf.network.source = "invalid_source"  # Set invalid source
     with pytest.raises(ValueError, match="Invalid grid source!"):
-        net, scenarios = _prepare_network_and_scenarios(conf, file_paths)
+        net, scenarios = _prepare_network_and_scenarios(conf, file_paths, seed)
 
 
 # Test save network function
@@ -221,7 +221,7 @@ def test_setup_environment_overwrite_behavior():
         }
 
         # First setup creates directories
-        args, base_path, file_paths = _setup_environment(config)
+        args, base_path, file_paths, seed = _setup_environment(config)
         marker = os.path.join(base_path, "marker.txt")
         os.makedirs(base_path, exist_ok=True)
         with open(marker, "w") as f:
@@ -229,13 +229,13 @@ def test_setup_environment_overwrite_behavior():
 
         # overwrite=False should keep existing contents
         config["settings"]["overwrite"] = False
-        _args2, base_path2, _ = _setup_environment(config)
+        _args2, base_path2, _, _ = _setup_environment(config)
         assert base_path2 == base_path
         assert os.path.exists(marker), "Marker should persist when overwrite is False"
 
         # overwrite=True should remove and recreate directory
         config["settings"]["overwrite"] = True
-        _args3, base_path3, _ = _setup_environment(config)
+        _args3, base_path3, _, _ = _setup_environment(config)
         assert base_path3 == base_path
         assert not os.path.exists(marker), (
             "Marker should be removed when overwrite is True"
@@ -287,5 +287,99 @@ def test_parquet_append_vs_overwrite():
         )
 
 
+def test_reproducibility_with_same_seed():
+    """
+    Test that generating data with the same seed produces identical results.
+    Uses case14_ieee network with 10 scenarios and 2 processes.
+    """
+    # Load base config
+    config_path = "tests/config/default_pf_mode_test.yaml"
+    with open(config_path, "r") as f:
+        config = yaml.safe_load(f)
+
+    # Modify config for this test
+    config["network"]["name"] = "case14_ieee"
+    config["load"]["scenarios"] = 6
+    config["settings"]["num_processes"] = 2
+    config["settings"]["seed"] = 42  # Fixed seed for reproducibility
+    config["settings"]["overwrite"] = True
+
+    # Use temporary directories for both runs
+    with (
+        tempfile.TemporaryDirectory() as tmpdir1,
+        tempfile.TemporaryDirectory() as tmpdir2,
+    ):
+        # First run
+        config["settings"]["data_dir"] = tmpdir1
+        config1 = NestedNamespace(**config)
+        file_paths_1 = generate_power_flow_data_distributed(config1)
+
+        # Second run with same seed
+        config["settings"]["data_dir"] = tmpdir2
+        config2 = NestedNamespace(**config)
+        file_paths_2 = generate_power_flow_data_distributed(config2)
+
+        # Load all generated parquet files and compare
+        data_files = ["bus_data", "gen_data", "branch_data", "y_bus_data"]
+
+        for data_type in data_files:
+            df1 = pd.read_parquet(file_paths_1[data_type], engine="pyarrow")
+            df2 = pd.read_parquet(file_paths_2[data_type], engine="pyarrow")
+
+            # sort by load_scenario_idx
+            df1 = df1.sort_values(by="load_scenario_idx", kind="stable").reset_index(
+                drop=True,
+            )
+            df2 = df2.sort_values(by="load_scenario_idx", kind="stable").reset_index(
+                drop=True,
+            )
+
+            # Check that shapes match
+            assert df1.shape == df2.shape, (
+                f"{data_type}: Shape mismatch - Run 1: {df1.shape}, Run 2: {df2.shape}"
+            )
+
+            # Check that columns match
+            assert list(df1.columns) == list(df2.columns), (
+                f"{data_type}: Column mismatch - Run 1: {df1.columns}, Run 2: {df2.columns}"
+            )
+
+            # Check that all values are identical
+            # Use pandas testing to handle floating point comparison properly
+            pd.testing.assert_frame_equal(
+                df1,
+                df2,
+                check_dtype=True,
+                check_exact=False,
+                rtol=1e-10,
+                atol=1e-10,
+                obj=f"{data_type}",
+            )
+
+            print(
+                f"   ✓ {data_type}: Both runs produced identical data ({df1.shape[0]} rows)",
+            )
+
+        # Also check the scenarios file
+        scenarios_1 = pd.read_parquet(file_paths_1["scenarios"], engine="pyarrow")
+        scenarios_2 = pd.read_parquet(file_paths_2["scenarios"], engine="pyarrow")
+
+        os.makedirs("tests/test_data", exist_ok=True)
+
+        pd.testing.assert_frame_equal(
+            scenarios_1,
+            scenarios_2,
+            check_dtype=True,
+            check_exact=False,
+            rtol=1e-10,
+            atol=1e-10,
+        )
+
+        print(
+            f"   ✓ scenarios: Both runs produced identical data ({scenarios_1.shape[0]} rows)",
+        )
+        print("\n✓ All data files are identical across runs with the same seed!")
+
+
 if __name__ == "__main__":
-    test_parquet_append_vs_overwrite()
+    test_reproducibility_with_same_seed()
diff --git a/tests/test_generator_perturbation.py b/tests/test_generator_perturbation.py
index 67d5bf7eb..6e5b21604 100644
--- a/tests/test_generator_perturbation.py
+++ b/tests/test_generator_perturbation.py
@@ -50,53 +50,52 @@ def example_generator():
         )
 
     def test_generator_cost_perturbation_changes_values(self):
-        """Test that PerturbGenCostGenerator changes cost values"""
+        """Test that PerturbGenCostGenerator changes ALL cost coefficients for non-constant generators"""
         # Load the original network
         original_network = load_net_from_pglib("case24_ieee_rts")
 
         # Store original cost values
         original_gencosts = original_network.gencosts.copy()
 
-        # Create perturbation generator with sigma=0.1
+        # Identify generators with non-constant costs (c1 or c2 != 0)
+        # MATPOWER cost order: [c2, c1, c0] for NCOST=3
+        costs = original_gencosts[:, COST:]
+        non_constant_mask = np.any(costs[:, :-1] != 0, axis=1)
+
+        # Ensure there are actually non-constant generators in the test
+        assert np.sum(non_constant_mask) > 0, (
+            "Test requires generators with non-constant costs"
+        )
+
+        # Create perturbation generator with significant sigma
         perturb_generator = PerturbGenCostGenerator(
             base_net=original_network,
             sigma=0.1,
         )
 
-        # Create a simple generator that yields a copy of the original network
-        def example_generator():
-            yield copy.deepcopy(original_network)
+        # Generate one perturbation
+        test_network = copy.deepcopy(original_network)
 
-        # Generate perturbed networks
-        perturbed_networks = list(perturb_generator.generate(example_generator()))
+        def example_gen():
+            yield test_network
 
-        # Verify we got exactly one network
-        assert len(perturbed_networks) == 1, (
-            f"Expected 1 network, got {len(perturbed_networks)}"
-        )
+        perturbed_networks = list(perturb_generator.generate(example_gen()))
+        perturbed_gencosts = perturbed_networks[0].gencosts
 
-        perturbed_network = perturbed_networks[0]
-        perturbed_gencosts = perturbed_network.gencosts
+        # Check that ALL cost coefficients have changed for each non-constant generator
+        for gen_idx in np.where(non_constant_mask)[0]:
+            original_costs = original_gencosts[gen_idx, COST:]
+            perturbed_costs = perturbed_gencosts[gen_idx, COST:]
 
-        # Check that cost values are actually different
-        # This might not always be true due to randomness, so we'll check multiple times
-        different_perturbations = 0
-        for _ in range(10):  # Try multiple perturbations
-            test_network = copy.deepcopy(original_network)
-
-            def example_gen():
-                yield test_network
-
-            perturbed_networks = list(perturb_generator.generate(example_gen()))
-            perturbed_gencosts = perturbed_networks[0].gencosts
-
-            if not np.array_equal(original_gencosts, perturbed_gencosts):
-                different_perturbations += 1
-
-        # We expect at least some perturbations to be different
-        assert different_perturbations > 0, (
-            "Perturbation should change the generator cost values"
-        )
+            # Check each coefficient individually (c2, c1, c0)
+            for coeff_idx in range(len(original_costs)):
+                if original_costs[coeff_idx] != 0:
+                    assert original_costs[coeff_idx] != perturbed_costs[coeff_idx], (
+                        f"Generator {gen_idx}, coefficient {coeff_idx}: "
+                        f"Expected value to change from {original_costs[coeff_idx]}, "
+                        f"but it remained the same. All non-constant generators should have "
+                        f"all their cost coefficients changed by perturbation."
+                    )
 
     def test_generator_cost_perturbation_preserves_structure(self):
         """Test that PerturbGenCostGenerator preserves the structure of cost coefficients"""
@@ -301,3 +300,93 @@ def gen_net():
     assert np.allclose(net_out.gencosts, costs0, rtol=0.0, atol=0.0), (
         "gencosts should be unchanged when sigma=0"
     )
+
+
+def test_permute_skips_zero_and_constant_cost_generators():
+    """PermuteGenCostGenerator should skip generators with zero or constant-only costs."""
+    net = load_net_from_pglib("case24_ieee_rts")
+
+    # Modify some generators to have zero or constant-only costs
+    # MATPOWER cost order: [c2, c1, c0] for NCOST=3
+
+    # Generator 0: zero cost (all coefficients = 0)
+    net.gencosts[0, COST:] = 0
+
+    # Generator 1: constant only (c2=0, c1=0, c0=100)
+    net.gencosts[1, COST : COST + 2] = 0  # Set c2 and c1 to 0
+    net.gencosts[1, COST + 2] = 100  # Set c0 to 100
+
+    # Save original costs for these generators
+    original_zero_cost = net.gencosts[0, COST:].copy()
+    original_constant_cost = net.gencosts[1, COST:].copy()
+
+    # Create permutation generator
+    permute_gen = PermuteGenCostGenerator(base_net=net)
+
+    # Test multiple permutations
+    for _ in range(10):
+        test_net = copy.deepcopy(net)
+
+        def gen_net():
+            yield test_net
+
+        [perturbed_net] = list(permute_gen.generate(gen_net()))
+
+        # Verify zero-cost generator is unchanged
+        np.testing.assert_array_equal(
+            perturbed_net.gencosts[0, COST:],
+            original_zero_cost,
+            "Zero-cost generator should not be permuted",
+        )
+
+        # Verify constant-only generator is unchanged
+        np.testing.assert_array_equal(
+            perturbed_net.gencosts[1, COST:],
+            original_constant_cost,
+            "Constant-only generator should not be permuted",
+        )
+
+
+def test_perturb_skips_zero_and_constant_cost_generators():
+    """PerturbGenCostGenerator should skip generators with zero or constant-only costs."""
+    net = load_net_from_pglib("case24_ieee_rts")
+
+    # Modify some generators to have zero or constant-only costs
+    # MATPOWER cost order: [c2, c1, c0] for NCOST=3
+
+    # Generator 0: zero cost (all coefficients = 0)
+    net.gencosts[0, COST:] = 0
+
+    # Generator 1: constant only (c2=0, c1=0, c0=100)
+    net.gencosts[1, COST : COST + 2] = 0  # Set c2 and c1 to 0
+    net.gencosts[1, COST + 2] = 100  # Set c0 to 100
+
+    # Save original costs for these generators
+    original_zero_cost = net.gencosts[0, COST:].copy()
+    original_constant_cost = net.gencosts[1, COST:].copy()
+
+    # Create perturbation generator with significant sigma
+    perturb_gen = PerturbGenCostGenerator(base_net=net, sigma=0.5)
+
+    # Test multiple perturbations
+    for _ in range(10):
+        test_net = copy.deepcopy(net)
+
+        def gen_net():
+            yield test_net
+
+        [perturbed_net] = list(perturb_gen.generate(gen_net()))
+
+        # Verify zero-cost generator is unchanged
+        np.testing.assert_array_equal(
+            perturbed_net.gencosts[0, COST:],
+            original_zero_cost,
+            "Zero-cost generator should not be perturbed",
+        )
+
+        # Verify constant-only generator is unchanged
+        np.testing.assert_array_equal(
+            perturbed_net.gencosts[1, COST:],
+            original_constant_cost,
+            "Constant-only generator should not be perturbed",
+        )
diff --git a/tests/test_load_perturbation.py b/tests/test_load_perturbation.py
index ff3f8620d..03a010273 100644
--- a/tests/test_load_perturbation.py
+++ b/tests/test_load_perturbation.py
@@ -23,7 +23,13 @@ def test_agg_load_no_variation_when_sigma_and_range_zero_change_q_true():
             step_size=0.1,
             start_scaling_factor=1.0,
         )
-        scenarios = gen(net, n_scenarios=5, scenarios_log=scenario_log, max_iter=200)
+        scenarios = gen(
+            net,
+            n_scenarios=5,
+            scenarios_log=scenario_log,
+            max_iter=200,
+            seed=0,
+        )
         # shape: (n_loads, n_scenarios, 2)
         # All scenarios across axis=1 should be equal to the first scenario
         p_first = scenarios[:, 0, 0]
@@ -47,7 +53,13 @@ def test_agg_load_no_variation_when_sigma_and_range_zero_change_q_false():
             step_size=0.1,
             start_scaling_factor=1.0,
         )
-        scenarios = gen(net, n_scenarios=4, scenarios_log=scenario_log, max_iter=200)
+        scenarios = gen(
+            net,
+            n_scenarios=4,
+            scenarios_log=scenario_log,
+            max_iter=200,
+            seed=0,
+        )
         # p columns identical across scenarios
         p_first = scenarios[:, 0, 0]
         assert np.allclose(scenarios[:, :, 0], p_first[:, None])
diff --git a/tests/test_random_seed.py b/tests/test_random_seed.py
new file mode 100644
index 000000000..c7f663d2c
--- /dev/null
+++ b/tests/test_random_seed.py
@@ -0,0 +1,105 @@
+"""Test suite for custom_seed context manager."""
+
+import numpy as np
+import pytest
+from gridfm_datakit.utils.random_seed import custom_seed
+
+
+def test_seed_set_inside_with_block():
+    """Verify that the seed is properly set inside the with block."""
+    # Get expected value with seed=100
+    np.random.seed(100)
+    expected_value = np.random.rand()
+
+    # Now test with context manager from a different seed
+    np.random.seed(42)
+    with custom_seed(100):
+        actual_value = np.random.rand()
+
+    assert np.isclose(actual_value, expected_value), (
+        "Seed not set correctly inside with block"
+    )
+
+
+def test_state_restored_after_with_block():
+    """Verify that the random state is restored after exiting the with block."""
+    # Set up: seed=42, consume one value
+    np.random.seed(42)
+    _ = np.random.rand()  # Skip first value
+
+    # Use different seed temporarily
+    with custom_seed(999):
+        _ = np.random.rand()  # Use seed=999 inside
+
+    # After exiting, should continue from seed=42 sequence (2nd value)
+    second_value = np.random.rand()
+
+    # Verify it matches expected 2nd value from seed=42 sequence
+    np.random.seed(42)
+    _ = np.random.rand()  # skip first
+    expected_second = np.random.rand()
+
+    assert np.isclose(second_value, expected_second), (
+        "State not restored correctly after exiting with block"
+    )
+
+
+def test_reproducibility_with_same_seed():
+    """Verify that using the same seed produces identical results."""
+
+    def generate_data_with_seed(seed):
+        with custom_seed(seed):
+            return np.random.randn(10)
+
+    # Generate data twice with same seed but different outer states
+    np.random.seed(999)
+    data1 = generate_data_with_seed(100)
+
+    np.random.seed(777)
+    data2 = generate_data_with_seed(100)
+
+    assert np.allclose(data1, data2), "Same seed didn't produce identical results"
+
+
+def test_none_seed_preserves_state():
+    """Verify that passing None as seed just saves/restores state without setting a new seed."""
+    np.random.seed(42)
+    values_before = [np.random.rand() for _ in range(3)]
+
+    np.random.seed(42)
+    values_with_none = []
+    with custom_seed(None):
+        values_with_none = [np.random.rand() for _ in range(3)]
+
+    assert np.allclose(values_before, values_with_none), (
+        "None seed should not change the random sequence"
+    )
+
+
+def test_exception_still_restores_state():
+    """Verify that state is restored even if an exception occurs inside the with block."""
+    np.random.seed(42)
+    _ = np.random.rand()  # Skip first value
+
+    try:
+        with custom_seed(999):
+            _ = np.random.rand()
+            raise ValueError("Test exception")
+    except ValueError:
+        pass
+
+    # State should still be restored
+    value_after = np.random.rand()
+
+    # Verify it matches expected sequence
+    np.random.seed(42)
+    _ = np.random.rand()  # skip first
+    expected_value = np.random.rand()
+
+    assert np.isclose(value_after, expected_value), (
+        "State not restored after exception in with block"
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/test_verify_network.py b/tests/test_verify_network.py
index 7bb53d970..a1b19ce10 100644
--- a/tests/test_verify_network.py
+++ b/tests/test_verify_network.py
@@ -12,7 +12,9 @@
     T_BUS,
     GEN_BUS,
 )
+from gridfm_datakit.utils.idx_bus import BUS_TYPE, PV, PQ
 from gridfm_datakit.process.process_network import init_julia
+import numpy as np
 
 
 def verify_deactivated_branches():
@@ -167,6 +169,48 @@ def verify_generator_bus_assignment():
     print(f"   ✓ All {net.gens.shape[0]} generators have matching bus assignments")
 
 
+def verify_pv_to_pq_on_gen_deactivation():
+    """Check that PV buses become PQ when all their generators are deactivated."""
+    print("\n5. Testing PV→PQ bus type change on generator deactivation...")
+
+    case_name = "case24_ieee_rts"
+    net = load_net_from_pglib(case_name)
+
+    # Find the continuous index for original bus 21
+    original_bus_21 = 21
+    if original_bus_21 not in net.bus_index_mapping:
+        raise ValueError(f"Bus {original_bus_21} not found in network")
+
+    continuous_bus_21_idx = net.bus_index_mapping[original_bus_21]
+
+    # Assert bus 21 is initially PV type
+    initial_bus_type = net.buses[continuous_bus_21_idx, BUS_TYPE]
+    assert initial_bus_type == PV, (
+        f"Bus {original_bus_21} (continuous index {continuous_bus_21_idx}) should be PV type, "
+        f"but has type {initial_bus_type}"
+    )
+    print(f"   ✓ Bus {original_bus_21} is initially PV type")
+
+    # Find all generators connected to bus 21
+    gens_at_bus_21 = np.where(net.gens[:, GEN_BUS] == continuous_bus_21_idx)[0]
+    assert len(gens_at_bus_21) > 0, f"No generators found at bus {original_bus_21}"
+    print(
+        f"   ✓ Found {len(gens_at_bus_21)} generator(s) at bus {original_bus_21}: {gens_at_bus_21}",
+    )
+
+    # Deactivate all generators at bus 21
+    net.deactivate_gens(gens_at_bus_21)
+    print(f"   ✓ Deactivated generator(s): {gens_at_bus_21}")
+
+    # Check that bus 21 has changed to PQ type
+    final_bus_type = net.buses[continuous_bus_21_idx, BUS_TYPE]
+    assert final_bus_type == PQ, (
+        f"Bus {original_bus_21} should have changed to PQ type after generator deactivation, "
+        f"but has type {final_bus_type}"
+    )
+    print(f"   ✓ Bus {original_bus_21} successfully changed from PV to PQ type")
+
+
 class TestVerifyNetwork:
     @classmethod
     def setup_class(cls):
@@ -184,3 +228,6 @@ def test_deactivated_generators(self):
 
     def test_generator_bus_assignment(self):
         verify_generator_bus_assignment()
+
+    def test_pv_to_pq_on_gen_deactivation(self):
+        verify_pv_to_pq_on_gen_deactivation()

From b03b63110268a7cf8860b0304815c3b3c30eda06 Mon Sep 17 00:00:00 2001
From: Alban Puech <alban.puech2@ibm.com>
Date: Fri, 16 Jan 2026 13:37:11 -0500
Subject: [PATCH 2/3] fixed default

---
 gridfm_datakit/network.py   | 5 +++--
 scripts/config/default.yaml | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/gridfm_datakit/network.py b/gridfm_datakit/network.py
index b2220e1c5..9364e9de7 100644
--- a/gridfm_datakit/network.py
+++ b/gridfm_datakit/network.py
@@ -6,6 +6,7 @@
 """
 
 import os
+import shutil
 import requests
 from importlib import resources
 import pandas as pd
@@ -96,8 +97,8 @@ def correct_network(network_path: str, force: bool = False) -> str:
         if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
             raise RuntimeError("Julia produced empty MATPOWER file")
 
-        # Atomically replace target file
-        os.replace(tmp_path, corrected_path)
+        # Atomically replace target file (use shutil.move to allow cross-device)
+        shutil.move(tmp_path, corrected_path)
         return corrected_path
 
     finally:
diff --git a/scripts/config/default.yaml b/scripts/config/default.yaml
index 37e8d56c0..cecfd31c6 100644
--- a/scripts/config/default.yaml
+++ b/scripts/config/default.yaml
@@ -7,7 +7,7 @@ network:
 load:
   generator: "agg_load_profile" # Name of the load generator; options: agg_load_profile, powergraph
   agg_profile: "default" # Name of the aggregated load profile
-  scenarios: 100 # Number of different load scenarios to generate
+  scenarios: 10000 # Number of different load scenarios to generate
   # WARNING: the following parameters are only used if generator is "agg_load_profile"
   # if using generator "powergraph", these parameters are ignored
   sigma: 0.2 # max local noise
@@ -36,7 +36,7 @@ admittance_perturbation:
 
 settings:
   num_processes: 16 # Number of parallel processes to use
-  data_dir: "./data_out_seed_none2" # Directory to save generated data relative to the project root
+  data_dir: "./data_out" # Directory to save generated data relative to the project root
   large_chunk_size: 1000 # Number of load scenarios processed before saving
   overwrite: true # If true, overwrites existing files, if false, appends to files
   mode: "pf" # Mode of the script; options: pf, opf. pf: power flow data where one or more operating limits – the inequality constraints defined in OPF, e.g., voltage magnitude or branch limits – may be violated. opf:  generates datapoints for training OPF solvers, with cost-optimal dispatches that satisfy all operating limits (OPF-feasible)

From 9aa34ba60d1821674d2951b61b21dcc999e05e59 Mon Sep 17 00:00:00 2001
From: PUECH Alban <72336171+albanpuech@users.noreply.github.com>
Date: Fri, 16 Jan 2026 19:47:31 +0100
Subject: [PATCH 3/3] Delete scripts/user_config.yaml

Signed-off-by: PUECH Alban <72336171+albanpuech@users.noreply.github.com>
---
 scripts/user_config.yaml | 39 ---------------------------------------
 1 file changed, 39 deletions(-)
 delete mode 100644 scripts/user_config.yaml

diff --git a/scripts/user_config.yaml b/scripts/user_config.yaml
deleted file mode 100644
index 037bc6100..000000000
--- a/scripts/user_config.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-admittance_perturbation:
-  sigma: 0.2
-  type: random_perturbation
-generation_perturbation:
-  sigma: 1.0
-  type: cost_permutation
-load:
-  agg_profile: default
-  change_reactive_power: true
-  generator: agg_load_profile
-  global_range: 0.4
-  max_scaling_factor: 4.0
-  scenarios: 10000
-  sigma: 0.2
-  start_scaling_factor: 1.0
-  step_size: 0.1
-network:
-  name: case24_ieee_rts
-  network_dir: none
-  source: pglib
-settings:
-  data_dir: ./data_out
-  dcpf_fast: true
-  enable_solver_logs: true
-  include_dc_res: true
-  large_chunk_size: 1000
-  max_iter: 200
-  mode: pf
-  num_processes: 16
-  overwrite: true
-  pf_fast: true
-  seed: null
-topology_perturbation:
-  elements:
-  - branch
-  - gen
-  k: 1
-  n_topology_variants: 20
-  type: random