diff --git a/.gitignore b/.gitignore
index 3bbd875a..44c7f5a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -154,4 +154,12 @@ checkpoints/*
 # Ignore Python packaging build directories
 dist/
 build/
-*.egg-info/
\ No newline at end of file
+*.egg-info/
+
+# USD files
+*.usd
+*.usda
+*.usdc
+*.usd.gz
+*.usd.zip
+*.usd.bz2
\ No newline at end of file
diff --git a/examples/ibm/uav.py b/examples/ibm/uav.py
new file mode 100644
index 00000000..d9ebbe32
--- /dev/null
+++ b/examples/ibm/uav.py
@@ -0,0 +1,662 @@
+import os
+import xlb
+import trimesh
+import time
+from tqdm import tqdm
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.operator.stepper import IBMStepper
+from xlb.operator.boundary_condition import (
+    FullwayBounceBackBC,
+    RegularizedBC,
+    ExtrapolationOutflowBC,
+)
+from xlb.operator.macroscopic import Macroscopic
+from xlb.utils import save_fields_vtk, save_image
+import warp as wp
+import numpy as np
+import jax.numpy as jnp
+import matplotlib.pyplot as plt
+from xlb.helper.ibm_helper import (
+    prepare_immersed_boundary,
+    transform_mesh,
+    reconstruct_mesh_from_vertices_and_faces,
+)
+from xlb.grid import grid_factory
+
+from pxr import Usd, UsdGeom, Vt, Gf
+
+from xlb.operator.postprocess import QCriterion, Vorticity, GridToPoint
+
+
+@wp.kernel
+def get_color(
+    low: float,
+    high: float,
+    values: wp.array(dtype=float),
+    out_color: wp.array(dtype=wp.vec3),
+) -> wp.vec3:
+    tid = wp.tid()
+
+    v = values[tid]
+
+    r = 1.0
+    g = 1.0
+    b = 1.0
+
+    if v < low:
+        v = low
+    if v > high:
+        v = high
+
+    dv = high - low
+
+    if v < (low + 0.25 * dv):
+        r = 0.0
+        g = 4.0 * (v - low) / dv
+    elif v < (low + 0.5 * dv):
+        r = 0.0
+        b = 1.0 + 4.0 * (low + 0.25 * dv - v) / dv
+    elif v < (low + 0.75 * dv):
+        r = 4.0 * (v - low - 0.5 * dv) / dv
+        b = 0.0
+    else:
+        g = 1.0 + 4.0 * (low + 0.75 * dv - v) / dv
+        b = 0.0
+
+    out_color[tid] = wp.vec3(r, g, b)
+
+
+# -------------------------- Simple grid_to_point Helper --------------------------
+def grid_to_point(field, verts, out):
+    """
+    A simple nearest-neighbor sampler.
+    field: NumPy array of shape (nx, ny, nz)
+    verts: Warp array of shape (n, 3) containing vertex positions (assumed to be in grid indices)
+    out: Warp array of shape (n,) to store the sampled scalar values.
+    """
+    verts_np = verts.numpy()  # Convert to NumPy array
+    nx, ny, nz = field.shape
+    out_np = out.numpy()  # Make a NumPy copy to allow assignment
+    for i, v in enumerate(verts_np):
+        x = int(round(v[0]))
+        y = int(round(v[1]))
+        z = int(round(v[2]))
+        x = min(max(x, 0), nx - 1)
+        y = min(max(y, 0), ny - 1)
+        z = min(max(z, 0), nz - 1)
+        out_np[i] = field[x, y, z]
+    return wp.array(out_np, dtype=wp.float32, device=out.device)
+
+
+# -------------------------- USD Output Helper using Q Criterion --------------------------
+def save_usd_vorticity(
+    timestep, post_process_interval, bc_mask, f_current, grid_shape, usd_mesh, vorticity_operator, precision_policy, vorticity_threshold, usd_stage
+):
+    # Compute macroscopic quantities using WARP backend for vorticity
+    device = "cuda:0"  # Use the same device as the simulation
+    macro_wp = Macroscopic(
+        compute_backend=ComputeBackend.WARP,
+        precision_policy=precision_policy,
+        velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP),
+    )
+    rho = wp.zeros((1, *grid_shape), dtype=wp.float32, device=device)
+    u = wp.zeros((3, *grid_shape), dtype=wp.float32, device=device)
+    rho, u = macro_wp(f_current, rho, u)
+    u = u[:, 20:-20, 20:-20, 20:-20]  # Remove bc
+
+    # Allocate arrays for vorticity and its magnitude
+    vorticity = wp.zeros((3, *u.shape[1:]), dtype=wp.float32, device=device)
+    vorticity_magnitude = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+
+    # Compute vorticity
+    vorticity, vorticity_magnitude = vorticity_operator(u, bc_mask, vorticity, vorticity_magnitude)
+
+    # Setup marching cubes on vorticity_magnitude[0]
+    max_verts = grid_shape[0] * grid_shape[1] * grid_shape[2] * 5
+    max_tris = grid_shape[0] * grid_shape[1] * grid_shape[2] * 3
+    mc = wp.MarchingCubes(
+        nx=u.shape[1],
+        ny=u.shape[2],
+        nz=u.shape[3],
+        max_verts=max_verts,
+        max_tris=max_tris,
+        device=device,
+    )
+
+    # Extract isosurface at vorticity_threshold
+    mc.surface(vorticity_magnitude[0], vorticity_threshold)
+
+    if mc.verts.shape[0] == 0:
+        print(f"Warning: No vertices found for vorticity at timestep {timestep}. Try adjusting vorticity_threshold.")
+        return
+
+    grid_to_point_op = GridToPoint(
+        precision_policy=precision_policy,
+        compute_backend=ComputeBackend.WARP,
+    )
+
+    scalars = wp.zeros(mc.verts.shape[0], dtype=wp.float32, device=device)
+    scalars = grid_to_point_op(vorticity_magnitude, mc.verts, scalars)
+
+    colors = wp.empty(mc.verts.shape[0], dtype=wp.vec3, device=device)
+    # Compute dynamic range based on actual vorticity values
+    scalars_np = scalars.numpy()
+    vorticity_min = float(np.percentile(scalars_np, 5))  # 5th percentile to exclude outliers
+    vorticity_max = float(np.percentile(scalars_np, 95))  # 95th percentile to exclude outliers
+
+    if abs(vorticity_max - vorticity_min) < 1e-6:
+        vorticity_max = vorticity_min + 1e-6
+
+    wp.launch(
+        get_color,
+        dim=mc.verts.shape[0],
+        inputs=(vorticity_min, vorticity_max, scalars),
+        outputs=(colors,),
+        device=device,
+    )
+
+    vertices = mc.verts.numpy()
+    indices = mc.indices.numpy()
+    tri_count = len(indices) // 3
+
+    usd_mesh.GetPointsAttr().Set(vertices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexCountsAttr().Set([3] * tri_count, time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexIndicesAttr().Set(indices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetDisplayColorAttr().Set(colors.numpy().tolist(), time=timestep // post_process_interval)
+    UsdGeom.Primvar(usd_mesh.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Vorticity visualization at timestep {timestep}:")
+    print(f"  Number of vertices: {len(vertices)}")
+    print(f"  Number of triangles: {tri_count}")
+    print(f"  Vorticity range: [{vorticity_min:.6f}, {vorticity_max:.6f}]")
+
+
+def save_usd_q_criterion(timestep, bc_mask, f_current, grid_shape, usd_mesh, q_criterion_operator, precision_policy, q_threshold, usd_stage):
+    # Compute macroscopic quantities using WARP backend for Q-criterion
+    device = "cuda:0"  # Use the same device as the simulation
+    macro_wp = Macroscopic(
+        compute_backend=ComputeBackend.WARP,
+        precision_policy=precision_policy,
+        velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP),
+    )
+    rho = wp.zeros((1, *grid_shape), dtype=wp.float32, device=device)
+    u = wp.zeros((3, *grid_shape), dtype=wp.float32, device=device)
+    rho, u = macro_wp(f_current, rho, u)
+    u = u[:, 1:-1, 1:-1, 1:-1]  # Remove bc
+
+    # Allocate arrays for norm_mu and q_field on the same device as input
+    norm_mu = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+    q_field = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+
+    # Compute Q criterion
+    norm_mu, q_field = q_criterion_operator(u, bc_mask, norm_mu, q_field)
+
+    # Setup marching cubes on q_field[0] (use grid_shape minus ghost layers)
+    max_verts = grid_shape[0] * grid_shape[1] * grid_shape[2] * 5
+    max_tris = grid_shape[0] * grid_shape[1] * grid_shape[2] * 3
+    mc = wp.MarchingCubes(
+        nx=u.shape[1],
+        ny=u.shape[2],
+        nz=u.shape[3],
+        max_verts=max_verts,
+        max_tris=max_tris,
+        device=device,
+    )
+
+    # Extract isosurface at q_threshold
+    mc.surface(q_field[0], q_threshold)
+
+    if mc.verts.shape[0] == 0:
+        print(f"Warning: No vertices found for Q-criterion at timestep {timestep}. Try adjusting q_threshold.")
+        return
+
+    grid_to_point_op = GridToPoint(
+        precision_policy=precision_policy,
+        compute_backend=ComputeBackend.WARP,
+    )
+
+    scalars = wp.zeros(mc.verts.shape[0], dtype=wp.float32, device=device)
+    scalars = grid_to_point_op(norm_mu, mc.verts, scalars)
+
+    colors = wp.empty(mc.verts.shape[0], dtype=wp.vec3, device=device)
+    vorticity_min = 0.0
+    vorticity_max = 0.1
+    wp.launch(
+        get_color,
+        dim=mc.verts.shape[0],
+        inputs=(vorticity_min, vorticity_max, scalars),
+        outputs=(colors,),
+        device=device,
+    )
+
+    vertices = mc.verts.numpy()
+    indices = mc.indices.numpy()
+    tri_count = len(indices) // 3
+
+    usd_mesh.GetPointsAttr().Set(vertices.tolist(), time=timestep)
+    usd_mesh.GetFaceVertexCountsAttr().Set([3] * tri_count, time=timestep)
+    usd_mesh.GetFaceVertexIndicesAttr().Set(indices.tolist(), time=timestep)
+    usd_mesh.GetDisplayColorAttr().Set(colors.numpy().tolist(), time=timestep)
+    UsdGeom.Primvar(usd_mesh.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Q-criterion visualization at timestep {timestep}:")
+    print(f"  Number of vertices: {len(vertices)}")
+    print(f"  Number of triangles: {tri_count}")
+    print(f"  Vorticity range: [{vorticity_min:.6f}, {vorticity_max:.6f}]")
+
+
+# -------------------------- Function to Save UAV Body and Blades to USD --------------------------
+def save_usd_uav_and_blades(
+    timestep, post_process_interval, vertices_wp, num_body_vertices, body_faces_np, blades_faces_np, usd_uav_body, usd_uav_blades, usd_stage
+):
+    # Get vertices and adjust for boundary condition offset
+    body_vertices = vertices_wp.numpy()[:num_body_vertices]
+    blades_vertices = vertices_wp.numpy()[num_body_vertices:]
+
+    # Adjust vertices by subtracting the boundary condition offset (20 cells)
+    body_vertices = body_vertices - 20
+    blades_vertices = blades_vertices - 20
+
+    # Process body mesh
+    uav_body_tri_count = len(body_faces_np)
+    usd_uav_body.GetPointsAttr().Set(body_vertices.tolist(), time=timestep // post_process_interval)
+    usd_uav_body.GetFaceVertexCountsAttr().Set(Vt.IntArray([3] * uav_body_tri_count), time=timestep // post_process_interval)
+    usd_uav_body.GetFaceVertexIndicesAttr().Set(Vt.IntArray(body_faces_np.flatten().tolist()), time=timestep // post_process_interval)
+
+    # Process blades mesh
+    # Rebase blade face indices to the local blade vertex array
+    blades_faces_np_corrected = blades_faces_np - num_body_vertices
+    uav_blades_tri_count = len(blades_faces_np_corrected)
+    usd_uav_blades.GetPointsAttr().Set(blades_vertices.tolist(), time=timestep // post_process_interval)
+    usd_uav_blades.GetFaceVertexCountsAttr().Set(Vt.IntArray([3] * uav_blades_tri_count), time=timestep // post_process_interval)
+    usd_uav_blades.GetFaceVertexIndicesAttr().Set(Vt.IntArray(blades_faces_np_corrected.flatten().tolist()), time=timestep // post_process_interval)
+
+    print(f"UAV body and blades USD updated at timestep {timestep}")
+
+
+# -------------------------- Mesh Loading and IBM Setup --------------------------
+def load_and_prepare_meshes(grid_shape, stl_dir="../stl-files/X8_new"):
+    if not os.path.exists(stl_dir):
+        raise FileNotFoundError(f"STL directory {stl_dir} does not exist.")
+    main_body_stl = os.path.join(stl_dir, "X8_Main_Body.stl")
+    if not os.path.isfile(main_body_stl):
+        raise FileNotFoundError(f"Main body STL file {main_body_stl} not found.")
+    main_body_mesh = trimesh.load_mesh(main_body_stl, process=False)
+    num_blades = 8
+    blade_ranges = []
+    cumulative_vertex_count = 0
+    target_size = 50.0
+    body_scale = target_size / max(main_body_mesh.extents)
+    domain_center = np.array([(s - 1) / 2 for s in grid_shape])
+    main_body_mesh = transform_mesh(main_body_mesh, translation=domain_center, scale=body_scale)
+    body_vertices_wp, body_vertex_areas_wp, body_faces = prepare_immersed_boundary(
+        main_body_mesh, max_lbm_length=target_size, translation=None, scale=None
+    )
+    all_vertices = body_vertices_wp.numpy()
+    all_vertex_areas = body_vertex_areas_wp.numpy()
+    all_faces = body_faces.copy()
+    transformed_centers = []
+    for blade_id in range(1, num_blades + 1):
+        blade_stl = os.path.join(stl_dir, f"Blade{blade_id}.stl")
+        if not os.path.isfile(blade_stl):
+            raise FileNotFoundError(f"Blade STL file {blade_stl} not found.")
+        blade_mesh = trimesh.load_mesh(blade_stl, process=False)
+        blade_com = blade_mesh.center_mass
+        blade_mesh = transform_mesh(blade_mesh, translation=domain_center, scale=body_scale)
+        blade_vertices_wp, blade_vertex_areas_wp, blade_faces = prepare_immersed_boundary(
+            blade_mesh, max_lbm_length=target_size, translation=None, scale=None
+        )
+        blade_vertices_np = blade_vertices_wp.numpy()
+        blade_vertex_areas_np = blade_vertex_areas_wp.numpy()
+        blade_faces_offset = blade_faces + cumulative_vertex_count + len(body_vertices_wp)
+        all_faces = np.vstack([all_faces, blade_faces_offset])
+        all_vertices = np.vstack([all_vertices, blade_vertices_np])
+        all_vertex_areas = np.hstack([all_vertex_areas, blade_vertex_areas_np])
+        start_idx = cumulative_vertex_count + len(body_vertices_wp)
+        end_idx = start_idx + len(blade_vertices_np)
+        blade_ranges.append((start_idx, end_idx))
+        cumulative_vertex_count += len(blade_vertices_np)
+        transformed_com = (blade_com * body_scale) + domain_center
+        transformed_centers.append(transformed_com)
+        print(f"Blade {blade_id} processed:")
+        print(f"  Number of vertices: {len(blade_vertices_np)}")
+        print(f"  Center of mass (transformed): {transformed_com}")
+    vertices_wp = wp.array(all_vertices, dtype=wp.vec3)
+    vertex_areas_wp = wp.array(all_vertex_areas, dtype=wp.float32)
+    faces_np = all_faces
+    num_body_vertices = len(body_vertices_wp)
+    num_blade_vertices = len(all_vertices) - num_body_vertices
+
+    # Separate body and blade faces based on the original ordering
+    body_faces_np = body_faces
+    blades_faces_np = all_faces[len(body_faces_np) :]
+
+    print("\nBlade Centers (Transformed COM):")
+    for i, center in enumerate(transformed_centers):
+        print(f"Blade {i + 1}: {center}")
+    print(f"\nTotal number of vertices: {len(all_vertices)}")
+    print(f"Number of body vertices: {num_body_vertices}")
+    print(f"Number of blade vertices: {num_blade_vertices}")
+    return {
+        "vertices_wp": vertices_wp,
+        "vertex_areas_wp": vertex_areas_wp,
+        "faces_np": faces_np,
+        "blade_ranges": blade_ranges,
+        "blade_centers": np.array(transformed_centers),
+        "num_body_vertices": num_body_vertices,
+        "num_blade_vertices": num_blade_vertices,
+        "body_faces_np": body_faces_np,
+        "blades_faces_np": blades_faces_np,
+    }
+
+
+# -------------------------- Boundary Conditions --------------------------
+def define_boundary_indices(grid, velocity_set):
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    inlet = box_no_edge["top"]
+    outlet = box_no_edge["bottom"]
+    walls = [box["front"][i] + box["back"][i] + box["left"][i] + box["right"][i] for i in range(velocity_set.d)]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+    return inlet, outlet, walls
+
+
+def bc_profile(precision_policy, grid_shape, u_max):
+    _dtype = precision_policy.store_precision.wp_dtype
+    u_max_d = _dtype(u_max)
+    L_x = _dtype(grid_shape[0] - 1)
+    L_y = _dtype(grid_shape[1] - 1)
+
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i):
+        x = _dtype(index[0])
+        y = _dtype(index[1])
+        x_center = x - (L_x / _dtype(2.0))
+        y_center = y - (L_y / _dtype(2.0))
+        r_squared = (_dtype(2.0) * x_center / L_x) ** _dtype(2.0) + (_dtype(2.0) * y_center / L_y) ** _dtype(2.0)
+        velocity_z = u_max_d * wp.max(_dtype(0.0), _dtype(1.0) - r_squared)
+        return wp.vec(-velocity_z, length=1)
+
+    return bc_profile_warp
+
+
+def setup_boundary_conditions(grid, velocity_set, precision_policy, grid_shape, u_max):
+    inlet, outlet, walls = define_boundary_indices(grid, velocity_set)
+    bc_inlet = RegularizedBC("velocity", indices=inlet, profile=bc_profile(precision_policy, grid_shape, -u_max))
+    bc_outlet = ExtrapolationOutflowBC(indices=outlet)
+    bc_walls = FullwayBounceBackBC(indices=walls)
+    return [bc_walls, bc_inlet, bc_outlet]
+
+
+# -------------------------- Stepper Setup --------------------------
+def setup_stepper(grid, boundary_conditions, lbm_omega):
+    return IBMStepper(
+        grid=grid,
+        boundary_conditions=boundary_conditions,
+        collision_type="KBC",
+    )
+
+
+def post_process(
+    i,
+    post_process_interval,
+    f_current,
+    bc_mask,
+    grid,
+    faces_np,
+    vertices_wp,
+    precision_policy,
+    grid_shape,
+    blade_ranges,
+    num_blades,
+    usd_mesh,
+    vorticity_operator,
+    usd_stage,
+):
+    # Compute macroscopic quantities using the JAX backend for visualization
+    if not isinstance(f_current, jnp.ndarray):
+        f_jax = wp.to_jax(f_current)
+    macro_jax = Macroscopic(
+        compute_backend=ComputeBackend.JAX,
+        precision_policy=precision_policy,
+        velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
+    )
+    rho, u = macro_jax(f_jax)
+    u = u[:, 1:-1, 1:-1, 1:-1]  # Remove ghost layers if present
+
+    # Save standard visualization outputs
+    fields = {
+        "u_magnitude": (u[0] ** 2 + u[1] ** 2 + u[2] ** 2) ** 0.5,
+        "u_x": u[0],
+        "u_y": u[1],
+        "u_z": u[2],
+    }
+    save_fields_vtk(fields, timestep=i)
+    reconstruct_mesh_from_vertices_and_faces(vertices_wp=vertices_wp, faces_np=faces_np, save_path=f"mesh_{i:06d}.stl")
+    save_image(fields["u_magnitude"][grid_shape[0] // 2, :, :], timestep=i)
+
+    # Plot blade positions
+    plt.figure(figsize=(10, 10))
+    padding = 20
+    x_min = -padding
+    x_max = grid_shape[0] + padding
+    y_min = -padding
+    y_max = grid_shape[1] + padding
+    plt.xlim(x_min, x_max)
+    plt.ylim(y_min, y_max)
+    plt.autoscale(False)
+    for blade_id in range(num_blades):
+        start, end = blade_ranges[blade_id]
+        blade_vertices = vertices_wp[start:end].numpy()
+        plt.scatter(blade_vertices[:, 0], blade_vertices[:, 1], s=20, label=f"Blade {blade_id + 1}")
+    plt.gca().set_aspect("equal", adjustable="box")
+    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize="small", markerscale=0.8)
+    plt.title(f"Blade Positions at Step {i}")
+    plt.xlabel("X")
+    plt.ylabel("Y")
+    plt.savefig(f"blade_positions_{i:06d}.png", bbox_inches="tight", dpi=150, pad_inches=0.2)
+    plt.close()
+
+    save_usd_vorticity(
+        i,
+        post_process_interval,
+        bc_mask,
+        f_current,
+        grid_shape,
+        usd_mesh,
+        vorticity_operator,
+        precision_policy=precision_policy,
+        vorticity_threshold=9e-3,
+        usd_stage=usd_stage,
+    )
+
+
+# -------------------------- Simulation Parameters --------------------------
+grid_shape = (512, 512, 512)
+u_max = 0.01
+num_steps = 100000
+post_process_interval = 100
+print_interval = 1000
+num_blades = 8
+angular_velocity = 0.001
+
+Re = 50000.0
+clength = grid_shape[2] - 1
+visc = u_max * clength / Re
+omega = 1.0 / (3.0 * visc + 0.5)
+compute_backend = ComputeBackend.WARP
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+
+# -------------------------- Initialize Simulation --------------------------
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+grid = grid_factory(grid_shape, compute_backend=compute_backend)
+
+print("\n" + "=" * 50 + "\n")
+print("Simulation Configuration:")
+print(f"Grid size: {grid_shape[0]} x {grid_shape[1]} x {grid_shape[2]}")
+print(f"Omega: {omega}")
+print(f"Backend: {compute_backend}")
+print(f"Velocity set: {velocity_set}")
+print(f"Precision policy: {precision_policy}")
+print(f"Prescribed velocity: {-u_max}")
+print(f"Reynolds number: {Re}")
+print(f"Max iterations: {num_steps}")
+print("\n" + "=" * 50 + "\n")
+
+# -------------------------- Create USD stage and meshes for the outputs --------------------------
+usd_output_directory = "usd_output"
+os.makedirs(usd_output_directory, exist_ok=True)
+usd_file = os.path.join(usd_output_directory, "output.usd")
+usd_stage = Usd.Stage.CreateNew(usd_file)
+usd_mesh = UsdGeom.Mesh.Define(usd_stage, "/World/FluidField")
+usd_uav_body = UsdGeom.Mesh.Define(usd_stage, "/World/UAVBody")
+usd_uav_blades = UsdGeom.Mesh.Define(usd_stage, "/World/UAVBlades")
+
+mesh_data = load_and_prepare_meshes(grid_shape)
+vertices_wp = mesh_data["vertices_wp"]
+vertex_areas_wp = mesh_data["vertex_areas_wp"]
+faces_np = mesh_data["faces_np"]
+blade_ranges = mesh_data["blade_ranges"]
+blade_centers = mesh_data["blade_centers"]
+num_body_vertices = mesh_data["num_body_vertices"]
+num_blade_vertices = mesh_data["num_blade_vertices"]
+body_faces_np = mesh_data["body_faces_np"]
+blades_faces_np = mesh_data["blades_faces_np"]
+
+bc_list = setup_boundary_conditions(grid, velocity_set, precision_policy, grid_shape, u_max)
+stepper = setup_stepper(grid, bc_list, omega)
+f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
+
+q_criterion_operator = QCriterion(
+    velocity_set=velocity_set,
+    precision_policy=precision_policy,
+    compute_backend=compute_backend,
+)
+
+vorticity_operator = Vorticity(
+    velocity_set=velocity_set,
+    precision_policy=precision_policy,
+    compute_backend=compute_backend,
+)
+_num_body_vertices = wp.constant(num_body_vertices)
+_num_blades = wp.constant(num_blades)
+velocities_wp = wp.zeros(shape=vertices_wp.shape[0], dtype=wp.vec3)
+blade_ranges_start = [start for (start, end) in blade_ranges]
+blade_ranges_end = [end for (start, end) in blade_ranges]
+
+
+blade_ranges_start_wp = wp.constant(wp.vec(len(blade_ranges_start), dtype=int)(blade_ranges_start))
+blade_ranges_end_wp = wp.constant(wp.vec(len(blade_ranges_end), dtype=int)(blade_ranges_end))
+
+# Flatten blade centers into separate x, y, z components
+blade_centers_x = [center[0] for center in blade_centers]
+blade_centers_y = [center[1] for center in blade_centers]
+blade_centers_z = [center[2] for center in blade_centers]
+
+blade_centers_x_wp = wp.constant(wp.vec(len(blade_centers_x), dtype=float)(blade_centers_x))
+blade_centers_y_wp = wp.constant(wp.vec(len(blade_centers_y), dtype=float)(blade_centers_y))
+blade_centers_z_wp = wp.constant(wp.vec(len(blade_centers_z), dtype=float)(blade_centers_z))
+
+
+# -------------------------- Blade Rotation Kernel --------------------------
+@wp.kernel
+def rotate_blades(
+    timestep: int,
+    forces: wp.array(dtype=wp.vec3),
+    vertices: wp.array(dtype=wp.vec3),
+    velocities: wp.array(dtype=wp.vec3),
+):
+    idx = wp.tid()
+    if idx < _num_body_vertices:
+        velocities[idx] = wp.vec3(0.0, 0.0, 0.0)
+        return
+    blade_id = wp.int32(0)
+    for b in range(_num_blades):
+        start = blade_ranges_start_wp[b]
+        end = blade_ranges_end_wp[b]
+        if start <= idx < end:
+            blade_id = b
+            break
+    else:
+        velocities[idx] = wp.vec3(0.0, 0.0, 0.0)
+        return
+
+    # Reconstruct center from flattened components
+    center = wp.vec3(blade_centers_x_wp[blade_id], blade_centers_y_wp[blade_id], blade_centers_z_wp[blade_id])
+
+    rel_pos = vertices[idx] - center
+    direction = wp.float32(1.0)
+    if blade_id == 1 or blade_id == 5 or blade_id == 3 or blade_id == 7:
+        direction = wp.float32(-1.0)
+    theta = direction * angular_velocity
+    cos_theta = wp.cos(theta)
+    sin_theta = wp.sin(theta)
+    new_rel_pos = wp.vec3(cos_theta * rel_pos[0] - sin_theta * rel_pos[1], sin_theta * rel_pos[0] + cos_theta * rel_pos[1], rel_pos[2])
+    vertices[idx] = new_rel_pos + center
+    velocities[idx] = wp.vec3(-direction * angular_velocity * rel_pos[1], direction * angular_velocity * rel_pos[0], 0.0)
+
+
+start_time = time.time()
+
+for i in range(num_steps):
+    f_0, f_1, lag_forces = stepper(
+        f_0,
+        f_1,
+        vertices_wp,
+        vertex_areas_wp,
+        velocities_wp,
+        bc_mask,
+        missing_mask,
+        omega,
+        i,
+    )
+    f_0, f_1 = f_1, f_0
+
+    # Update solid velocities and positions
+    wp.launch(
+        kernel=rotate_blades,
+        dim=vertices_wp.shape[0],
+        inputs=[i, lag_forces, vertices_wp, velocities_wp],
+    )
+
+    if i % post_process_interval == 0 or i == num_steps - 1:
+        post_process(
+            i,
+            post_process_interval,
+            f_0,
+            bc_mask,
+            grid,
+            faces_np,
+            vertices_wp,
+            precision_policy,
+            grid_shape,
+            blade_ranges,
+            num_blades,
+            usd_mesh,
+            vorticity_operator,
+            usd_stage,
+        )
+        save_usd_uav_and_blades(
+            i,
+            post_process_interval,
+            vertices_wp,
+            num_body_vertices,
+            body_faces_np,
+            blades_faces_np,
+            usd_uav_body,
+            usd_uav_blades,
+            usd_stage,
+        )
+
+usd_stage.SetStartTimeCode(0)
+usd_stage.SetEndTimeCode(num_steps // post_process_interval)
+usd_stage.SetTimeCodesPerSecond(30)
+
+usd_stage.Save()
diff --git a/examples/ibm/wind_turbine/wind_turbine.py b/examples/ibm/wind_turbine/wind_turbine.py
new file mode 100644
index 00000000..cf09f321
--- /dev/null
+++ b/examples/ibm/wind_turbine/wind_turbine.py
@@ -0,0 +1,708 @@
+import os
+import xlb
+import trimesh
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.operator.stepper import IBMStepper
+from xlb.operator.boundary_condition import (
+    FullwayBounceBackBC,
+    RegularizedBC,
+    ExtrapolationOutflowBC,
+)
+from xlb.operator.macroscopic import Macroscopic
+from xlb.utils import save_fields_vtk, save_image
+import warp as wp
+import numpy as np
+import matplotlib.pyplot as plt
+from xlb.helper.ibm_helper import prepare_immersed_boundary
+from xlb.grid import grid_factory
+from pxr import Usd, UsdGeom, Vt
+from xlb.operator.postprocess import QCriterion, Vorticity, GridToPoint
+
+
+@wp.kernel
+def get_color(
+    low: float,
+    high: float,
+    values: wp.array(dtype=float),
+    out_color: wp.array(dtype=wp.vec3),
+):
+    tid = wp.tid()
+    v = values[tid]
+    r = 1.0
+    g = 1.0
+    b = 1.0
+
+    if v < low:
+        v = low
+    if v > high:
+        v = high
+
+    dv = high - low
+    if v < (low + 0.25 * dv):
+        r = 0.0
+        g = 4.0 * (v - low) / dv
+    elif v < (low + 0.5 * dv):
+        r = 0.0
+        b = 1.0 + 4.0 * (low + 0.25 * dv - v) / dv
+    elif v < (low + 0.75 * dv):
+        r = 4.0 * (v - low - 0.5 * dv) / dv
+        b = 0.0
+    else:
+        g = 1.0 + 4.0 * (low + 0.75 * dv - v) / dv
+        b = 0.0
+
+    out_color[tid] = wp.vec3(r, g, b)
+
+
+def grid_to_point(field, verts, out):
+    verts_np = verts.numpy()
+    nx, ny, nz = field.shape
+    out_np = out.numpy()
+    for i, v in enumerate(verts_np):
+        x = int(round(v[0]))
+        y = int(round(v[1]))
+        z = int(round(v[2]))
+        x = min(max(x, 0), nx - 1)
+        y = min(max(y, 0), ny - 1)
+        z = min(max(z, 0), nz - 1)
+        out_np[i] = field[x, y, z]
+    return wp.array(out_np, dtype=wp.float32, device=out.device)
+
+
+def save_usd_vorticity(
+    timestep,
+    post_process_interval,
+    bc_mask,
+    f_current,
+    grid_shape,
+    usd_mesh,
+    vorticity_operator,
+    precision_policy,
+    vorticity_threshold,
+    usd_stage,
+):
+    device = "cuda:1"
+    f_current_new = wp.clone(f_current, device=device)
+    bc_mask_new = wp.clone(bc_mask, device=device)
+    with wp.ScopedDevice(device):
+        macro_wp = Macroscopic(
+            compute_backend=ComputeBackend.WARP,
+            precision_policy=precision_policy,
+            velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP),
+        )
+        rho = wp.zeros((1, *grid_shape), dtype=wp.float32, device=device)
+        u = wp.zeros((3, *grid_shape), dtype=wp.float32, device=device)
+        rho, u = macro_wp(f_current_new, rho, u)
+        # Clip a boundary slice just for clarity in visualization
+        u = u[:, 20:-20, 20:-20, 5:-20]
+
+        vorticity = wp.zeros((3, *u.shape[1:]), dtype=wp.float32, device=device)
+        vorticity_magnitude = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+        vorticity, vorticity_magnitude = vorticity_operator(u, bc_mask_new, vorticity, vorticity_magnitude)
+
+        max_verts = grid_shape[0] * grid_shape[1] * grid_shape[2] * 5
+        max_tris = grid_shape[0] * grid_shape[1] * grid_shape[2] * 3
+        mc = wp.MarchingCubes(nx=u.shape[1], ny=u.shape[2], nz=u.shape[3], max_verts=max_verts, max_tris=max_tris, device=device)
+        mc.surface(vorticity_magnitude[0], vorticity_threshold)
+        if mc.verts.shape[0] == 0:
+            print(f"Warning: No vertices found for vorticity at timestep {timestep}.")
+            return
+
+        grid_to_point_op = GridToPoint(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP)
+        scalars = wp.zeros(mc.verts.shape[0], dtype=wp.float32, device=device)
+        scalars = grid_to_point_op(vorticity_magnitude, mc.verts, scalars)
+
+        colors = wp.empty(mc.verts.shape[0], dtype=wp.vec3, device=device)
+        scalars_np = scalars.numpy()
+        vorticity_min = float(np.percentile(scalars_np, 5))
+        vorticity_max = float(np.percentile(scalars_np, 95))
+        if abs(vorticity_max - vorticity_min) < 1e-6:
+            vorticity_max = vorticity_min + 1e-6
+
+        wp.launch(
+            get_color,
+            dim=mc.verts.shape[0],
+            inputs=(vorticity_min, vorticity_max, scalars),
+            outputs=(colors,),
+            device=device,
+        )
+
+        vertices = mc.verts.numpy()
+        indices = mc.indices.numpy()
+        tri_count = len(indices) // 3
+
+    usd_mesh.GetPointsAttr().Set(vertices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexCountsAttr().Set([3] * tri_count, time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexIndicesAttr().Set(indices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetDisplayColorAttr().Set(colors.numpy().tolist(), time=timestep // post_process_interval)
+    UsdGeom.Primvar(usd_mesh.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Vorticity visualization at timestep {timestep}:")
+    print(f"  Number of vertices: {len(vertices)}")
+    print(f"  Number of triangles: {tri_count}")
+    print(f"  Vorticity range: [{vorticity_min:.6f}, {vorticity_max:.6f}]")
+
+
+def save_usd_q_criterion(
+    timestep,
+    post_process_interval,
+    bc_mask,
+    f_current,
+    grid_shape,
+    usd_mesh,
+    q_criterion_operator,
+    precision_policy,
+    q_threshold,
+    usd_stage,
+):
+    device = "cuda:1"
+    f_current_new = wp.clone(f_current, device=device)
+    bc_mask_new = wp.clone(bc_mask, device=device)
+    with wp.ScopedDevice(device):
+        macro_wp = Macroscopic(
+            compute_backend=ComputeBackend.WARP,
+            precision_policy=precision_policy,
+            velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP),
+        )
+        rho = wp.zeros((1, *grid_shape), dtype=wp.float32, device=device)
+        u = wp.zeros((3, *grid_shape), dtype=wp.float32, device=device)
+        rho, u = macro_wp(f_current_new, rho, u)
+        # Clip a boundary slice just for clarity
+        u = u[:, 20:-20, 20:-20, 5:-20]
+
+        norm_mu = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+        q_field = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+        norm_mu, q_field = q_criterion_operator(u, bc_mask_new, norm_mu, q_field)
+
+        max_verts = grid_shape[0] * grid_shape[1] * grid_shape[2] * 5
+        max_tris = grid_shape[0] * grid_shape[1] * grid_shape[2] * 3
+        mc = wp.MarchingCubes(nx=u.shape[1], ny=u.shape[2], nz=u.shape[3], max_verts=max_verts, max_tris=max_tris, device=device)
+        mc.surface(q_field[0], q_threshold)
+        if mc.verts.shape[0] == 0:
+            print(f"Warning: No vertices found for Q-criterion at timestep {timestep}.")
+            return
+
+        grid_to_point_op = GridToPoint(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP)
+        scalars = wp.zeros(mc.verts.shape[0], dtype=wp.float32, device=device)
+        scalars = grid_to_point_op(norm_mu, mc.verts, scalars)
+
+        colors = wp.empty(mc.verts.shape[0], dtype=wp.vec3, device=device)
+        vorticity_min = 0.0
+        vorticity_max = 0.1
+        wp.launch(
+            get_color,
+            dim=mc.verts.shape[0],
+            inputs=(vorticity_min, vorticity_max, scalars),
+            outputs=(colors,),
+            device=device,
+        )
+
+    vertices = mc.verts.numpy()
+    indices = mc.indices.numpy()
+    tri_count = len(indices) // 3
+
+    usd_mesh.GetPointsAttr().Set(vertices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexCountsAttr().Set([3] * tri_count, time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexIndicesAttr().Set(indices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetDisplayColorAttr().Set(colors.numpy().tolist(), time=timestep // post_process_interval)
+    UsdGeom.Primvar(usd_mesh.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Q-criterion visualization at timestep {timestep}:")
+    print(f"  Number of vertices: {len(vertices)}")
+    print(f"  Number of triangles: {tri_count}")
+    print(f"  Vorticity range: [{vorticity_min:.6f}, {vorticity_max:.6f}]")
+
+
+def save_usd_turbine_parts(
+    timestep,
+    post_process_interval,
+    vertices_wp,
+    num_body_vertices,
+    body_faces_np,
+    rotor_faces_np,
+    usd_turbine_body,
+    usd_turbine_rotor,
+    usd_stage,
+    lag_forces=None,
+):
+    offset = np.array([20, 20, 5])
+    body_vertices = vertices_wp.numpy()[:num_body_vertices] - offset
+    rotor_vertices = vertices_wp.numpy()[num_body_vertices:] - offset
+    body_tri_count = len(body_faces_np)
+
+    usd_turbine_body.GetPointsAttr().Set(body_vertices.tolist(), time=timestep // post_process_interval)
+    usd_turbine_body.GetFaceVertexCountsAttr().Set(Vt.IntArray([3] * body_tri_count), time=timestep // post_process_interval)
+    usd_turbine_body.GetFaceVertexIndicesAttr().Set(Vt.IntArray(body_faces_np.flatten().tolist()), time=timestep // post_process_interval)
+
+    rotor_faces_np_corrected = rotor_faces_np - num_body_vertices
+    rotor_tri_count = len(rotor_faces_np_corrected)
+
+    usd_turbine_rotor.GetPointsAttr().Set(rotor_vertices.tolist(), time=timestep // post_process_interval)
+    usd_turbine_rotor.GetFaceVertexCountsAttr().Set(Vt.IntArray([3] * rotor_tri_count), time=timestep // post_process_interval)
+    usd_turbine_rotor.GetFaceVertexIndicesAttr().Set(Vt.IntArray(rotor_faces_np_corrected.flatten().tolist()), time=timestep // post_process_interval)
+
+    if lag_forces is not None:
+        body_force_np = lag_forces.numpy()[:num_body_vertices, 0]
+        rotor_force_np = lag_forces.numpy()[num_body_vertices:, 0]
+
+        min_body_force = np.min(body_force_np)
+        max_body_force = np.max(body_force_np)
+        min_rotor_force = np.min(rotor_force_np)
+        max_rotor_force = np.max(rotor_force_np)
+
+        body_colors = wp.zeros(num_body_vertices, dtype=wp.vec3)
+        rotor_colors = wp.zeros(vertices_wp.shape[0] - num_body_vertices, dtype=wp.vec3)
+
+        wp.launch(
+            kernel=get_color,
+            dim=num_body_vertices,
+            inputs=[min_body_force, max_body_force, wp.from_numpy(body_force_np), body_colors],
+        )
+
+        wp.launch(
+            kernel=get_color,
+            dim=vertices_wp.shape[0] - num_body_vertices,
+            inputs=[min_rotor_force, max_rotor_force, wp.from_numpy(rotor_force_np), rotor_colors],
+        )
+
+        body_colors_np = body_colors.numpy()
+        rotor_colors_np = rotor_colors.numpy()
+
+        usd_turbine_body.GetDisplayColorAttr().Set(body_colors_np.tolist(), time=timestep // post_process_interval)
+        UsdGeom.Primvar(usd_turbine_body.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+        usd_turbine_rotor.GetDisplayColorAttr().Set(rotor_colors_np.tolist(), time=timestep // post_process_interval)
+        UsdGeom.Primvar(usd_turbine_rotor.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Turbine mesh updated at timestep {timestep}")
+
+
+def define_boundary_indices(grid, velocity_set):
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    inlet = box_no_edge["front"]
+    outlet = box_no_edge["back"]
+    walls = [box["right"][i] + box["left"][i] + box["top"][i] + box["bottom"][i] for i in range(velocity_set.d)]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+    return inlet, outlet, walls
+
+
+def bc_profile(precision_policy, grid_shape, u_max):
+    _dtype = precision_policy.store_precision.wp_dtype
+    u_max_d = _dtype(u_max)
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i):
+        return wp.vec(u_max_d, length=1)
+
+    return bc_profile_warp
+
+def setup_boundary_conditions(grid, velocity_set, precision_policy, grid_shape, inlet_speed):
+    inlet, outlet, walls = define_boundary_indices(grid, velocity_set)
+    bc_inlet = RegularizedBC("velocity", indices=inlet, profile=bc_profile(precision_policy, grid_shape, inlet_speed))
+    bc_outlet = ExtrapolationOutflowBC(indices=outlet)
+    bc_walls = FullwayBounceBackBC(indices=walls)
+    return [bc_inlet, bc_outlet, bc_walls]
+
+
+def setup_stepper(grid, boundary_conditions, lbm_omega):
+    return IBMStepper(
+        grid=grid,
+        boundary_conditions=boundary_conditions,
+        collision_type="KBC",
+    )
+
+
+def visualize_turbine_placement(vertices_wp, grid_shape):
+    verts = vertices_wp.numpy()
+    turbine_min = verts.min(axis=0)
+    turbine_max = verts.max(axis=0)
+
+    plt.figure(figsize=(10, 5))
+    domain_x = [0, grid_shape[0], grid_shape[0], 0, 0]
+    domain_y = [0, 0, grid_shape[1], grid_shape[1], 0]
+    plt.plot(domain_x, domain_y, "k-", label="Domain", linewidth=1)
+
+    # Show the bounding box of the turbine in X-Y plane (ignore Z)
+    tx = [turbine_min[0], turbine_max[0], turbine_max[0], turbine_min[0], turbine_min[0]]
+    ty = [turbine_min[1], turbine_min[1], turbine_max[1], turbine_max[1], turbine_min[1]]
+    plt.plot(tx, ty, "r-", linewidth=2, label="Turbine bounding box")
+
+    plt.title("Turbine Placement (X-Y Top View)")
+    plt.xlabel("X (left→right)")
+    plt.ylabel("Y (front→back)")
+    plt.grid(True, linestyle="--", alpha=0.3)
+    plt.axis("equal")
+    plt.legend()
+
+    plt.savefig("turbine_placement.png", dpi=150, bbox_inches="tight")
+    plt.close()
+
+
+def load_and_prepare_meshes_turbine(grid_shape, stl_dir="./examples/ibm/wind_turbine/"):
+    rotor_stl = os.path.join(stl_dir, "turbine_wind_turbine.stl")
+    body_stl = os.path.join(stl_dir, "body_wind_turbine.stl")
+    if not os.path.isfile(rotor_stl):
+        raise FileNotFoundError(f"Cannot find {rotor_stl}")
+    if not os.path.isfile(body_stl):
+        raise FileNotFoundError(f"Cannot find {body_stl}")
+
+    rotor_mesh = trimesh.load_mesh(rotor_stl, process=False)
+    body_mesh = trimesh.load_mesh(body_stl, process=False)
+
+    # Identify rotor bounding box dimension that dictates scaling
+    rotor_bounds = rotor_mesh.bounds
+    rotor_size = rotor_bounds[1] - rotor_bounds[0]
+    rotor_diameter = max(rotor_size)  # largest dimension in the rotor
+    desired_diameter = 150.0
+
+    scale_factor = desired_diameter / rotor_diameter
+    print(f"Scale factor: {scale_factor:.4f}")
+
+    # We apply the scale to both rotor and body
+    rotor_mesh.apply_scale(scale_factor)
+    body_mesh.apply_scale(scale_factor)
+
+    R_dummy = trimesh.transformations.rotation_matrix(np.radians(0), [1, 0, 0])
+    rotor_mesh.apply_transform(R_dummy)
+    body_mesh.apply_transform(R_dummy)
+
+    combined = trimesh.util.concatenate([rotor_mesh, body_mesh])
+    bnds = combined.bounds
+    min_x, min_y, min_z = bnds[0]
+    max_x, max_y, max_z = bnds[1]
+    center_x = 0.5 * grid_shape[0]
+    center_y = 0.5 * grid_shape[1]
+
+    shift_x = center_x - 0.5 * (min_x + max_x)
+    shift_y = center_y - 0.5 * (min_y + max_y)
+    shift_z = -min_z
+
+    combined.apply_translation([shift_x, shift_y, shift_z])
+    rotor_mesh = trimesh.load_mesh(rotor_stl, process=False)
+    body_mesh = trimesh.load_mesh(body_stl, process=False)
+    rotor_mesh.apply_scale(scale_factor)
+    body_mesh.apply_scale(scale_factor)
+    rotor_mesh.apply_transform(R_dummy)
+    body_mesh.apply_transform(R_dummy)
+    rotor_mesh.apply_translation([shift_x, shift_y, shift_z])
+    body_mesh.apply_translation([shift_x, shift_y, shift_z])
+
+    Nx, Ny, Nz = grid_shape
+    rotor_v_wp, rotor_a_wp, rotor_faces = prepare_immersed_boundary(rotor_mesh, max_lbm_length=max(Nx, Ny, Nz))
+    body_v_wp, body_a_wp, body_faces = prepare_immersed_boundary(body_mesh, max_lbm_length=max(Nx, Ny, Nz))
+
+    all_vertices = body_v_wp.numpy()
+    all_areas = body_a_wp.numpy()
+    all_faces = body_faces.copy()
+    current_offset = len(body_v_wp)
+
+    rotor_v_np = rotor_v_wp.numpy()
+    rotor_a_np = rotor_a_wp.numpy()
+    rotor_f_offset = rotor_faces + current_offset
+    all_vertices = np.vstack([all_vertices, rotor_v_np])
+    all_areas = np.hstack([all_areas, rotor_a_np])
+    all_faces = np.vstack([all_faces, rotor_f_offset])
+
+    vertices_wp = wp.array(all_vertices, dtype=wp.vec3)
+    areas_wp = wp.array(all_areas, dtype=wp.float32)
+    faces_np = all_faces
+    num_body_vertices = len(body_v_wp)
+    num_rotor_vertices = len(rotor_v_wp)
+    body_faces_np = body_faces
+    rotor_faces_np = faces_np[len(body_faces_np):]
+
+    print("\nTurbine mesh preparation summary:")
+    print(f"  Scale factor: {scale_factor:.4f}")
+    print(f"  Rotor diameter (desired ~200): {desired_diameter}")
+    print(f"  Total vertices: {len(all_vertices)}")
+    print(f"  Body vertices: {num_body_vertices}")
+    print(f"  Rotor vertices: {num_rotor_vertices}")
+
+    return {
+        "vertices_wp": vertices_wp,
+        "areas_wp": areas_wp,
+        "faces_np": faces_np,
+        "num_body_vertices": num_body_vertices,
+        "num_rotor_vertices": num_rotor_vertices,
+        "body_faces_np": body_faces_np,
+        "rotor_faces_np": rotor_faces_np,
+    }
+
+
+@wp.kernel
+def rotate_rotor(
+    timestep: int,
+    forces: wp.array(dtype=wp.vec3),
+    vertices: wp.array(dtype=wp.vec3),
+    velocities: wp.array(dtype=wp.vec3),
+):
+    idx = wp.tid()
+
+    if idx < _num_body_vertices:
+        velocities[idx] = wp.vec3(0.0, 0.0, 0.0)
+        return
+
+    # For rotor vertices, rotate about the negative Y axis (axis = -Y).
+    center = wp.vec3(_rotor_center_x, _rotor_center_y, _rotor_center_z)
+    pos = vertices[idx]
+    rel_pos = pos - center
+
+    radius_x = rel_pos[0]
+    radius_z = rel_pos[2]
+    r = wp.sqrt(radius_x * radius_x + radius_z * radius_z)
+    if r < 1e-6:
+        velocities[idx] = wp.vec3(0.0, 0.0, 0.0)
+        return
+
+    nx = radius_x / r
+    nz = radius_z / r
+    theta = _rotor_speed
+    c = wp.cos(theta)
+    s = wp.sin(theta)
+
+    x_new = r * (c * nx - s * nz)
+    z_new = r * (s * nx + c * nz)
+
+    # Update position
+    new_rel = wp.vec3(x_new, rel_pos[1], z_new)
+    vertices[idx] = new_rel + center
+
+    # Tangential velocity from rotation
+    velocities[idx] = wp.vec3(z_new * _rotor_speed, 0.0, -x_new * _rotor_speed)
+
+
+def post_process(
+    i,
+    post_process_interval,
+    f_current,
+    bc_mask,
+    grid,
+    faces_np,
+    vertices_wp,
+    precision_policy,
+    grid_shape,
+    usd_mesh_vorticity,
+    usd_mesh_q_criterion,
+    vorticity_operator,
+    q_criterion_operator,
+    usd_stage,
+    turbine_body_mesh,
+    turbine_rotor_mesh,
+    num_body_vertices,
+    body_faces_np,
+    rotor_faces_np,
+    lag_forces=None,
+):
+    # if not isinstance(f_current, jnp.ndarray):
+    #     f_jax = wp.to_jax(f_current)
+    # else:
+    #     f_jax = f_current
+
+    # macro_jax = Macroscopic(
+    #     compute_backend=ComputeBackend.JAX,
+    #     precision_policy=precision_policy,
+    #     velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
+    # )
+    # rho, u = macro_jax(f_jax)
+    # u = u[:, 20:-20, 20:-20, 5:-20]
+
+    # fields = {
+    #     "u_magnitude": (u[0] ** 2.0 + u[1] ** 2.0 + u[2] ** 2.0) ** 0.5,
+    #     "u_x": u[0],
+    #     "u_y": u[1],
+    #     "u_z": u[2],
+    # }
+
+    # slice_idx = grid_shape[0] // 2
+    # slice_idy = grid_shape[1] // 2
+    # save_image(fields["u_magnitude"][slice_idx, :, :], timestep=i, prefix="slice_idx")
+    # save_image(fields["u_magnitude"][:, slice_idy, :], timestep=i, prefix="slice_idy")
+
+    # save_fields_vtk(fields, i)
+
+    save_usd_vorticity(
+        i,
+        post_process_interval,
+        bc_mask,
+        f_current,
+        grid_shape,
+        usd_mesh_vorticity,
+        vorticity_operator,
+        precision_policy=precision_policy,
+        vorticity_threshold=1e-2,
+        usd_stage=usd_stage,
+    )
+
+    save_usd_q_criterion(
+        i,
+        post_process_interval,
+        bc_mask,
+        f_current,
+        grid_shape,
+        usd_mesh_q_criterion,
+        q_criterion_operator,
+        precision_policy=precision_policy,
+        q_threshold=5e-6,
+        usd_stage=usd_stage,
+    )
+
+    save_usd_turbine_parts(
+        i,
+        post_process_interval,
+        vertices_wp,
+        num_body_vertices,
+        body_faces_np,
+        rotor_faces_np,
+        turbine_body_mesh,
+        turbine_rotor_mesh,
+        usd_stage,
+        lag_forces=lag_forces,
+    )
+
+
+#
+# Main simulation
+#
+
+grid_shape = (256, 450, 256)            # example domain size (Nx, Ny, Nz)
+u_inlet = 0.05                          # inlet flow speed
+num_steps = 25000
+post_process_interval = 100
+print_interval = 100
+turbine_rotation_speed = -0.0005         # user-controlled rotor speed (radians per timestep)
+Re = 5e5
+
+clength = grid_shape[0] - 1
+visc = u_inlet * clength / Re
+omega = 1.0 / (3.0 * visc + 0.5)
+
+compute_backend = ComputeBackend.WARP
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+xlb.init(velocity_set=velocity_set, default_backend=compute_backend, default_precision_policy=precision_policy)
+grid = grid_factory(grid_shape, compute_backend=compute_backend)
+
+print("Wind Turbine Simulation Configuration:")
+print(f"  Grid size: {grid_shape}")
+print(f"  Omega: {omega:.6f}")
+print(f"  Backend: {compute_backend}")
+print(f"  Velocity set: {velocity_set}")
+print(f"  Precision policy: {precision_policy}")
+print(f"  Inlet velocity: {u_inlet}")
+print(f"  Reynolds number: {Re}")
+print(f"  Turbine rotation speed (rad/step): {turbine_rotation_speed}")
+print(f"  Max steps: {num_steps}")
+
+usd_output_directory = "usd_output_turbine"
+os.makedirs(usd_output_directory, exist_ok=True)
+usd_file = os.path.join(usd_output_directory, "turbine_output.usd")
+usd_stage = Usd.Stage.CreateNew(usd_file)
+usd_mesh_vorticity = UsdGeom.Mesh.Define(usd_stage, "/World/Vorticity")
+usd_mesh_q_criterion = UsdGeom.Mesh.Define(usd_stage, "/World/QCriterion")
+usd_turbine_body = UsdGeom.Mesh.Define(usd_stage, "/World/TurbineBody")
+usd_turbine_rotor = UsdGeom.Mesh.Define(usd_stage, "/World/TurbineRotor")
+
+mesh_data = load_and_prepare_meshes_turbine(grid_shape)
+vertices_wp = mesh_data["vertices_wp"]
+areas_wp = mesh_data["areas_wp"]
+faces_np = mesh_data["faces_np"]
+num_body_vertices = mesh_data["num_body_vertices"]
+num_rotor_vertices = mesh_data["num_rotor_vertices"]
+body_faces_np = mesh_data["body_faces_np"]
+rotor_faces_np = mesh_data["rotor_faces_np"]
+
+visualize_turbine_placement(vertices_wp, grid_shape)
+
+# Calculate rotor center (simple approach: average rotor vertex positions)
+rotor_center_np = vertices_wp.numpy()[num_body_vertices:].mean(axis=0)
+_num_body_vertices = wp.constant(int(num_body_vertices))
+_rotor_speed = wp.constant(float(turbine_rotation_speed))
+_rotor_center_x = wp.constant(float(rotor_center_np[0]))
+_rotor_center_y = wp.constant(float(rotor_center_np[1]))
+_rotor_center_z = wp.constant(float(rotor_center_np[2]))
+
+bc_list = setup_boundary_conditions(grid, velocity_set, precision_policy, grid_shape, u_inlet)
+stepper = setup_stepper(grid, bc_list, omega)
+f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
+
+device = "cuda:1"
+with wp.ScopedDevice(device):
+    q_criterion_operator = QCriterion(
+        velocity_set=velocity_set,
+        precision_policy=precision_policy,
+        compute_backend=compute_backend,
+    )
+    vorticity_operator = Vorticity(
+        velocity_set=velocity_set,
+        precision_policy=precision_policy,
+        compute_backend=compute_backend,
+    )
+
+velocities_wp = wp.zeros(shape=vertices_wp.shape[0], dtype=wp.vec3)
+
+try:
+    for i in range(num_steps):
+        f_0, f_1, lag_forces = stepper(
+            f_0,
+            f_1,
+            vertices_wp,
+            areas_wp,
+            velocities_wp,
+            bc_mask,
+            missing_mask,
+            omega,
+            i,
+        )
+        f_0, f_1 = f_1, f_0
+
+        wp.launch(
+            kernel=rotate_rotor,
+            dim=vertices_wp.shape[0],
+            inputs=[
+                i,
+                lag_forces,
+                vertices_wp,
+                velocities_wp,
+            ],
+        )
+
+        if i % post_process_interval == 0 or i == num_steps - 1:
+            post_process(
+                i,
+                post_process_interval,
+                f_0,
+                bc_mask,
+                grid,
+                faces_np,
+                vertices_wp,
+                precision_policy,
+                grid_shape,
+                usd_mesh_vorticity,
+                usd_mesh_q_criterion,
+                vorticity_operator,
+                q_criterion_operator,
+                usd_stage,
+                usd_turbine_body,
+                usd_turbine_rotor,
+                num_body_vertices,
+                body_faces_np,
+                rotor_faces_np,
+                lag_forces,
+            )
+except KeyboardInterrupt:
+    print("\nSimulation interrupted by user. Saving current USD state...")
+    current_time_code = i // post_process_interval
+    usd_stage.SetStartTimeCode(0)
+    usd_stage.SetEndTimeCode(current_time_code)
+    usd_stage.SetTimeCodesPerSecond(30)
+    usd_stage.Save()
+    print(f"USD file saved with {current_time_code+1} frames. Exiting.")
+    import sys
+    sys.exit(0)
+
+usd_stage.SetStartTimeCode(0)
+usd_stage.SetEndTimeCode(num_steps // post_process_interval)
+usd_stage.SetTimeCodesPerSecond(30)
+usd_stage.Save()
+print("Simulation finished. USD file saved.")
diff --git a/examples/ibm/windtunnel.py b/examples/ibm/windtunnel.py
new file mode 100644
index 00000000..3fd3e982
--- /dev/null
+++ b/examples/ibm/windtunnel.py
@@ -0,0 +1,925 @@
+import os
+import xlb
+import trimesh
+import time
+from tqdm import tqdm
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.operator.stepper import IBMStepper
+from xlb.operator.boundary_condition import (
+    FullwayBounceBackBC,
+    RegularizedBC,
+    ExtrapolationOutflowBC,
+)
+from xlb.operator.macroscopic import Macroscopic
+from xlb.utils import save_fields_vtk, save_image
+import warp as wp
+import numpy as np
+import jax.numpy as jnp
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+from xlb.helper.ibm_helper import prepare_immersed_boundary
+from xlb.grid import grid_factory
+from pxr import Usd, UsdGeom, Sdf, Vt
+from xlb.operator.postprocess import QCriterion, Vorticity, GridToPoint
+
+
+@wp.kernel
+def get_color(
+    low: float,
+    high: float,
+    values: wp.array(dtype=float),
+    out_color: wp.array(dtype=wp.vec3),
+):
+    tid = wp.tid()
+    v = values[tid]
+    r = 1.0
+    g = 1.0
+    b = 1.0
+
+    if v < low:
+        v = low
+    if v > high:
+        v = high
+
+    dv = high - low
+    if v < (low + 0.25 * dv):
+        r = 0.0
+        g = 4.0 * (v - low) / dv
+    elif v < (low + 0.5 * dv):
+        r = 0.0
+        b = 1.0 + 4.0 * (low + 0.25 * dv - v) / dv
+    elif v < (low + 0.75 * dv):
+        r = 4.0 * (v - low - 0.5 * dv) / dv
+        b = 0.0
+    else:
+        g = 1.0 + 4.0 * (low + 0.75 * dv - v) / dv
+        b = 0.0
+
+    out_color[tid] = wp.vec3(r, g, b)
+
+
+def grid_to_point(field, verts, out):
+    verts_np = verts.numpy()
+    nx, ny, nz = field.shape
+    out_np = out.numpy()
+    for i, v in enumerate(verts_np):
+        x = int(round(v[0]))
+        y = int(round(v[1]))
+        z = int(round(v[2]))
+        x = min(max(x, 0), nx - 1)
+        y = min(max(y, 0), ny - 1)
+        z = min(max(z, 0), nz - 1)
+        out_np[i] = field[x, y, z]
+    return wp.array(out_np, dtype=wp.float32, device=out.device)
+
+
+def save_usd_vorticity(
+    timestep,
+    post_process_interval,
+    bc_mask,
+    f_current,
+    grid_shape,
+    usd_mesh,
+    vorticity_operator,
+    precision_policy,
+    vorticity_threshold,
+    usd_stage,
+):
+    device = "cuda:1"
+    f_current_new = wp.clone(f_current, device=device)
+    bc_mask_new = wp.clone(bc_mask, device=device)
+    with wp.ScopedDevice(device):
+        macro_wp = Macroscopic(
+            compute_backend=ComputeBackend.WARP,
+            precision_policy=precision_policy,
+            velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP),
+        )
+        rho = wp.zeros((1, *grid_shape), dtype=wp.float32, device=device)
+        u = wp.zeros((3, *grid_shape), dtype=wp.float32, device=device)
+        rho, u = macro_wp(f_current_new, rho, u)
+        u = u[:, 20:-20, 20:-20, 0:-20]
+
+        vorticity = wp.zeros((3, *u.shape[1:]), dtype=wp.float32, device=device)
+        vorticity_magnitude = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+        vorticity, vorticity_magnitude = vorticity_operator(u, bc_mask_new, vorticity, vorticity_magnitude)
+
+        max_verts = grid_shape[0] * grid_shape[1] * grid_shape[2] * 5
+        max_tris = grid_shape[0] * grid_shape[1] * grid_shape[2] * 3
+        mc = wp.MarchingCubes(nx=u.shape[1], ny=u.shape[2], nz=u.shape[3], max_verts=max_verts, max_tris=max_tris, device=device)
+        mc.surface(vorticity_magnitude[0], vorticity_threshold)
+        if mc.verts.shape[0] == 0:
+            print(f"Warning: No vertices found for vorticity at timestep {timestep}.")
+            return
+
+        grid_to_point_op = GridToPoint(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP)
+        scalars = wp.zeros(mc.verts.shape[0], dtype=wp.float32, device=device)
+        scalars = grid_to_point_op(vorticity_magnitude, mc.verts, scalars)
+
+        colors = wp.empty(mc.verts.shape[0], dtype=wp.vec3, device=device)
+        scalars_np = scalars.numpy()
+        vorticity_min = float(np.percentile(scalars_np, 5))
+        vorticity_max = float(np.percentile(scalars_np, 95))
+        if abs(vorticity_max - vorticity_min) < 1e-6:
+            vorticity_max = vorticity_min + 1e-6
+
+        wp.launch(
+            get_color,
+            dim=mc.verts.shape[0],
+            inputs=(vorticity_min, vorticity_max, scalars),
+            outputs=(colors,),
+            device=device,
+        )
+
+        vertices = mc.verts.numpy()
+        indices = mc.indices.numpy()
+        tri_count = len(indices) // 3
+
+    usd_mesh.GetPointsAttr().Set(vertices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexCountsAttr().Set([3] * tri_count, time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexIndicesAttr().Set(indices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetDisplayColorAttr().Set(colors.numpy().tolist(), time=timestep // post_process_interval)
+    UsdGeom.Primvar(usd_mesh.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Vorticity visualization at timestep {timestep}:")
+    print(f"  Number of vertices: {len(vertices)}")
+    print(f"  Number of triangles: {tri_count}")
+    print(f"  Vorticity range: [{vorticity_min:.6f}, {vorticity_max:.6f}]")
+
+
+def save_usd_q_criterion(
+    timestep,
+    post_process_interval,
+    bc_mask,
+    f_current,
+    grid_shape,
+    usd_mesh,
+    q_criterion_operator,
+    precision_policy,
+    q_threshold,
+    usd_stage,
+):
+    device = "cuda:1"
+    f_current_new = wp.clone(f_current, device=device)
+    bc_mask_new = wp.clone(bc_mask, device=device)
+    with wp.ScopedDevice(device):
+        macro_wp = Macroscopic(
+            compute_backend=ComputeBackend.WARP,
+            precision_policy=precision_policy,
+            velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP),
+        )
+        rho = wp.zeros((1, *grid_shape), dtype=wp.float32, device=device)
+        u = wp.zeros((3, *grid_shape), dtype=wp.float32, device=device)
+        rho, u = macro_wp(f_current_new, rho, u)
+        u = u[:, 20:-20, 20:-20, 0:-20]
+
+        norm_mu = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+        q_field = wp.zeros((1, *u.shape[1:]), dtype=wp.float32, device=device)
+        norm_mu, q_field = q_criterion_operator(u, bc_mask_new, norm_mu, q_field)
+
+        max_verts = grid_shape[0] * grid_shape[1] * grid_shape[2] * 5
+        max_tris = grid_shape[0] * grid_shape[1] * grid_shape[2] * 3
+        mc = wp.MarchingCubes(nx=u.shape[1], ny=u.shape[2], nz=u.shape[3], max_verts=max_verts, max_tris=max_tris, device=device)
+        mc.surface(q_field[0], q_threshold)
+        if mc.verts.shape[0] == 0:
+            print(f"Warning: No vertices found for Q-criterion at timestep {timestep}.")
+            return
+
+        grid_to_point_op = GridToPoint(precision_policy=precision_policy, compute_backend=ComputeBackend.WARP)
+        scalars = wp.zeros(mc.verts.shape[0], dtype=wp.float32, device=device)
+        scalars = grid_to_point_op(norm_mu, mc.verts, scalars)
+
+        colors = wp.empty(mc.verts.shape[0], dtype=wp.vec3, device=device)
+        vorticity_min = 0.0
+        vorticity_max = 0.1
+        wp.launch(
+            get_color,
+            dim=mc.verts.shape[0],
+            inputs=(vorticity_min, vorticity_max, scalars),
+            outputs=(colors,),
+            device=device,
+        )
+
+    vertices = mc.verts.numpy()
+    indices = mc.indices.numpy()
+    tri_count = len(indices) // 3
+
+    usd_mesh.GetPointsAttr().Set(vertices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexCountsAttr().Set([3] * tri_count, time=timestep // post_process_interval)
+    usd_mesh.GetFaceVertexIndicesAttr().Set(indices.tolist(), time=timestep // post_process_interval)
+    usd_mesh.GetDisplayColorAttr().Set(colors.numpy().tolist(), time=timestep // post_process_interval)
+    UsdGeom.Primvar(usd_mesh.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Q-criterion visualization at timestep {timestep}:")
+    print(f"  Number of vertices: {len(vertices)}")
+    print(f"  Number of triangles: {tri_count}")
+    print(f"  Vorticity range: [{vorticity_min:.6f}, {vorticity_max:.6f}]")
+
+
+def save_usd_car_parts(
+    timestep,
+    post_process_interval,
+    vertices_wp,
+    num_body_vertices,
+    body_faces_np,
+    wheels_faces_np,
+    usd_car_body,
+    usd_car_wheels,
+    usd_stage,
+    lag_forces=None,
+):
+    # Apply offset to match velocity field slicing:
+    offset = np.array([20.0, 20.0, 0.0])
+
+    body_vertices = vertices_wp.numpy()[:num_body_vertices] - offset
+    wheels_vertices = vertices_wp.numpy()[num_body_vertices:] - offset
+    car_body_tri_count = len(body_faces_np)
+
+    usd_car_body.GetPointsAttr().Set(body_vertices.tolist(), time=timestep // post_process_interval)
+    usd_car_body.GetFaceVertexCountsAttr().Set(Vt.IntArray([3] * car_body_tri_count), time=timestep // post_process_interval)
+    usd_car_body.GetFaceVertexIndicesAttr().Set(Vt.IntArray(body_faces_np.flatten().tolist()), time=timestep // post_process_interval)
+
+    wheels_faces_np_corrected = wheels_faces_np - num_body_vertices
+    car_wheels_tri_count = len(wheels_faces_np_corrected)
+
+    usd_car_wheels.GetPointsAttr().Set(wheels_vertices.tolist(), time=timestep // post_process_interval)
+    usd_car_wheels.GetFaceVertexCountsAttr().Set(Vt.IntArray([3] * car_wheels_tri_count), time=timestep // post_process_interval)
+    usd_car_wheels.GetFaceVertexIndicesAttr().Set(Vt.IntArray(wheels_faces_np_corrected.flatten().tolist()), time=timestep // post_process_interval)
+
+    # Apply coloring based on lagrangian forces
+    if lag_forces is not None:
+        body_force_np = lag_forces.numpy()[:num_body_vertices, 0]
+        wheels_force_np = lag_forces.numpy()[num_body_vertices:, 0]
+
+        min_body_force = np.min(body_force_np)
+        max_body_force = np.max(body_force_np)
+
+        min_wheels_force = np.min(wheels_force_np)
+        max_wheels_force = np.max(wheels_force_np)
+
+        # Generate colors using the get_color kernel
+        body_colors = wp.zeros(num_body_vertices, dtype=wp.vec3)
+        wheels_colors = wp.zeros(vertices_wp.shape[0] - num_body_vertices, dtype=wp.vec3)
+
+        # Colors for body - using body-specific min/max range
+        wp.launch(
+            kernel=get_color,
+            dim=num_body_vertices,
+            inputs=[min_body_force, max_body_force, wp.from_numpy(body_force_np), body_colors],
+        )
+
+        # Colors for wheels - using wheels-specific min/max range
+        wp.launch(
+            kernel=get_color,
+            dim=vertices_wp.shape[0] - num_body_vertices,
+            inputs=[min_wheels_force, max_wheels_force, wp.from_numpy(wheels_force_np), wheels_colors],
+        )
+
+        # Convert to USD color format and set attributes
+        body_colors_np = body_colors.numpy()
+        wheels_colors_np = wheels_colors.numpy()
+
+        # Set colors directly to DisplayColorAttr and set interpolation to "vertex"
+        usd_car_body.GetDisplayColorAttr().Set(body_colors_np.tolist(), time=timestep // post_process_interval)
+        UsdGeom.Primvar(usd_car_body.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+        usd_car_wheels.GetDisplayColorAttr().Set(wheels_colors_np.tolist(), time=timestep // post_process_interval)
+        UsdGeom.Primvar(usd_car_wheels.GetDisplayColorAttr()).SetInterpolation("vertex")
+
+    print(f"Car body and wheels USD updated at timestep {timestep}")
+
+
+def visualize_car_placement(vertices_wp, grid_shape):
+    verts = vertices_wp.numpy()
+    car_min = verts.min(axis=0)
+    car_max = verts.max(axis=0)
+
+    plt.figure(figsize=(10, 5))
+    domain_x = [0, grid_shape[0], grid_shape[0], 0, 0]
+    domain_y = [0, 0, grid_shape[1], grid_shape[1], 0]
+    plt.plot(domain_x, domain_y, "k-", label="Domain", linewidth=1)
+
+    car_x = [car_min[0], car_max[0], car_max[0], car_min[0], car_min[0]]
+    car_y = [car_min[1], car_min[1], car_max[1], car_max[1], car_min[1]]
+    plt.plot(car_x, car_y, "r-", linewidth=2, label="Car")
+
+    plt.title("Car Placement (Top View)")
+    plt.xlabel("X (Flow Direction →)")
+    plt.ylabel("Y (Width)")
+    plt.grid(True, linestyle="--", alpha=0.3)
+    plt.axis("equal")
+    plt.legend()
+
+    plt.savefig("car_placement.png", dpi=150, bbox_inches="tight")
+    plt.close()
+
+
+def load_and_prepare_meshes_car(grid_shape, stl_dir="/home/mehdi/Repos/stl-files/car"):
+    if not os.path.exists(stl_dir):
+        raise FileNotFoundError(f"STL directory {stl_dir} does not exist.")
+
+    body_stl = os.path.join(stl_dir, "S550_GT500_BS_5p.stl")
+    wheel_fr_stl = os.path.join(stl_dir, "S550_GT500_BS_FR_5p.stl")
+    wheel_fl_stl = os.path.join(stl_dir, "S550_GT500_BS_FL_5p.stl")
+    wheel_rr_stl = os.path.join(stl_dir, "S550_GT500_BS_RR_5p.stl")
+    wheel_rl_stl = os.path.join(stl_dir, "S550_GT500_BS_RL_5p.stl")
+    for stl_file in [body_stl, wheel_fr_stl, wheel_fl_stl, wheel_rr_stl, wheel_rl_stl]:
+        if not os.path.isfile(stl_file):
+            raise FileNotFoundError(f"STL file {stl_file} not found.")
+
+    # Load body FIRST to get reference dimensions
+    body_mesh = trimesh.load_mesh(body_stl, process=False)
+    body_bounds = body_mesh.bounds
+    orig_body_width = body_bounds[1][1] - body_bounds[0][1]  # Use body width for scaling
+
+    # Now load other components
+    fr_mesh = trimesh.load_mesh(wheel_fr_stl, process=False)
+    fl_mesh = trimesh.load_mesh(wheel_fl_stl, process=False)
+    rr_mesh = trimesh.load_mesh(wheel_rr_stl, process=False)
+    rl_mesh = trimesh.load_mesh(wheel_rl_stl, process=False)
+
+    # Calculate scale based on BODY width only
+    Nx, Ny, Nz = grid_shape
+    target_body_width = Ny / 3.0
+    scale_factor = target_body_width / orig_body_width  # Use body's original width
+
+    # Apply scale to ALL components
+    body_mesh.apply_scale(scale_factor)
+    fr_mesh.apply_scale(scale_factor)
+    fl_mesh.apply_scale(scale_factor)
+    rr_mesh.apply_scale(scale_factor)
+    rl_mesh.apply_scale(scale_factor)
+
+    # Now combine scaled meshes
+    combined = trimesh.util.concatenate([body_mesh, fr_mesh, fl_mesh, rr_mesh, rl_mesh])
+
+    R_y = trimesh.transformations.rotation_matrix(np.radians(180), [0, 1, 0])
+    R_x = trimesh.transformations.rotation_matrix(np.radians(180), [1, 0, 0])
+    combined.apply_transform(R_y)
+    combined.apply_transform(R_x)
+
+    bnds = combined.bounds
+    min_x, min_y, min_z = bnds[0]
+    max_x, max_y, max_z = bnds[1]
+    car_length = max_x - min_x
+    car_height = max_z - min_z
+
+    target_front_x = Nx / 3.0
+    shift_x = target_front_x - min_x
+
+    back_x_after_shift = max_x + shift_x
+    if back_x_after_shift > Nx:
+        raise ValueError(
+            f"Car too long to fit in domain with front at 1/3!\n"
+            f"Car length: {car_length:.1f}\n"
+            f"Available space: {Nx - target_front_x:.1f}\n"
+            f"Would need additional {back_x_after_shift - Nx:.1f} units"
+        )
+
+    shift_y = (Ny - (max_y - min_y)) / 2.0 - min_y
+
+    ground_clearance = 2.0
+    shift_z = ground_clearance - min_z
+
+    combined.apply_translation([shift_x, shift_y, shift_z])
+
+    bnds = combined.bounds
+    min_x, min_y, min_z = bnds[0]
+    max_x, max_y, max_z = bnds[1]
+
+    tolerance = 0.1
+    front_tolerance = target_front_x * tolerance
+    width_tolerance = target_body_width * tolerance
+
+    if not (target_front_x - front_tolerance < min_x < target_front_x + front_tolerance):
+        raise ValueError(f"Car front not at 1/3 domain length! Front at {min_x:.1f}, should be {target_front_x:.1f}")
+
+    scaled_body_width = body_mesh.bounds[1][1] - body_mesh.bounds[0][1]
+    if not (target_body_width - width_tolerance < scaled_body_width < target_body_width + width_tolerance):
+        raise ValueError(f"Car BODY width mismatch! Current: {scaled_body_width:.1f}, Target: {target_body_width:.1f}")
+
+    print("\nScaled dimensions verification:")
+    print(f"Body width: {scaled_body_width:.1f} (target: {target_body_width:.1f})")
+    print(f"Total car width (body + wheels): {max_y - min_y:.1f}")
+
+    def apply_transform(mesh):
+        mesh.apply_scale(scale_factor)
+        mesh.apply_transform(R_y)
+        mesh.apply_transform(R_x)
+        mesh.apply_translation([shift_x, shift_y, shift_z])
+
+    body_mesh = trimesh.load_mesh(body_stl, process=False)
+    fr_mesh = trimesh.load_mesh(wheel_fr_stl, process=False)
+    fl_mesh = trimesh.load_mesh(wheel_fl_stl, process=False)
+    rr_mesh = trimesh.load_mesh(wheel_rr_stl, process=False)
+    rl_mesh = trimesh.load_mesh(wheel_rl_stl, process=False)
+
+    apply_transform(body_mesh)
+    apply_transform(fr_mesh)
+    apply_transform(fl_mesh)
+    apply_transform(rr_mesh)
+    apply_transform(rl_mesh)
+
+    body_v_wp, body_a_wp, body_faces = prepare_immersed_boundary(body_mesh, max_lbm_length=Ny / 2.0)
+    fr_v_wp, fr_a_wp, fr_faces = prepare_immersed_boundary(fr_mesh, max_lbm_length=Ny / 2.0)
+    fl_v_wp, fl_a_wp, fl_faces = prepare_immersed_boundary(fl_mesh, max_lbm_length=Ny / 2.0)
+    rr_v_wp, rr_a_wp, rr_faces = prepare_immersed_boundary(rr_mesh, max_lbm_length=Ny / 2.0)
+    rl_v_wp, rl_a_wp, rl_faces = prepare_immersed_boundary(rl_mesh, max_lbm_length=Ny / 2.0)
+
+    all_vertices = body_v_wp.numpy()
+    all_areas = body_a_wp.numpy()
+    all_faces = body_faces.copy()
+    current_offset = len(body_v_wp)
+
+    wheels = [
+        (fr_v_wp, fr_a_wp, fr_faces),
+        (fl_v_wp, fl_a_wp, fl_faces),
+        (rr_v_wp, rr_a_wp, rr_faces),
+        (rl_v_wp, rl_a_wp, rl_faces),
+    ]
+    wheel_ranges = []
+    for wv_wp, wa_wp, wf in wheels:
+        wv_np = wv_wp.numpy()
+        wa_np = wa_wp.numpy()
+        wf_offset = wf + current_offset
+        all_vertices = np.vstack([all_vertices, wv_np])
+        all_areas = np.hstack([all_areas, wa_np])
+        all_faces = np.vstack([all_faces, wf_offset])
+        start_idx = current_offset
+        end_idx = start_idx + len(wv_np)
+        wheel_ranges.append((start_idx, end_idx))
+        current_offset += len(wv_np)
+
+    vertices_wp = wp.array(all_vertices, dtype=wp.vec3)
+    areas_wp = wp.array(all_areas, dtype=wp.float32)
+    faces_np = all_faces
+    num_body_vertices = len(body_v_wp)
+    num_wheel_vertices = len(all_vertices) - num_body_vertices
+    body_faces_np = body_faces
+    wheels_faces_np = faces_np[len(body_faces_np) :]
+
+    wheel_centers = []
+    for start_idx, end_idx in wheel_ranges:
+        center = np.mean(all_vertices[start_idx:end_idx], axis=0)
+        wheel_centers.append(center)
+
+    # Calculate frontal area using the bounding box cross-section (YZ plane)
+    mesh = trimesh.Trimesh(vertices=all_vertices, faces=faces_np)
+    bounds = mesh.bounds
+    min_y, min_z = bounds[0][1], bounds[0][2]
+    max_y, max_z = bounds[1][1], bounds[1][2]
+    width = max_y - min_y
+    height = max_z - min_z
+    frontal_area = width * height
+
+    print(f"Calculated frontal area (bounding box): {frontal_area:.2f} square units")
+    print(f"  Width: {width:.2f}, Height: {height:.2f}")
+
+    print("\nMesh preparation summary:")
+    print(f"Target size (car width): {target_body_width:.2f}")
+    print(f"Scale factor applied: {scale_factor:.4f}")
+    print(f"Total vertices: {len(all_vertices)}")
+    print(f"Body vertices: {num_body_vertices}")
+    print(f"Wheel vertices: {num_wheel_vertices}")
+    print(f"Number of wheels: {len(wheel_ranges)}")
+
+    return {
+        "vertices_wp": vertices_wp,
+        "areas_wp": areas_wp,
+        "faces_np": faces_np,
+        "wheel_ranges": wheel_ranges,
+        "wheel_centers": wheel_centers,
+        "num_body_vertices": num_body_vertices,
+        "num_wheel_vertices": num_wheel_vertices,
+        "body_faces_np": body_faces_np,
+        "wheels_faces_np": wheels_faces_np,
+        "frontal_area": frontal_area,
+    }
+
+
+def define_boundary_indices(grid, velocity_set):
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    inlet = box_no_edge["right"]
+    outlet = box_no_edge["left"]
+    walls = [box["front"][i] + box["back"][i] + box["top"][i] + box["bottom"][i] for i in range(velocity_set.d)]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+    return inlet, outlet, walls
+
+
+def bc_profile(precision_policy, grid_shape, u_max):
+    _dtype = precision_policy.store_precision.wp_dtype
+    u_max_d = _dtype(u_max)
+    L_y = _dtype(grid_shape[1] - 1)
+    L_z = _dtype(grid_shape[2] - 1)
+
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i):
+        y = _dtype(index[1])
+        z = _dtype(index[2])
+        y_center = y - (L_y / _dtype(2.0))
+        z_center = z - (L_z / _dtype(2.0))
+        r_sq = ((_dtype(2.0) * y_center) / L_y) ** _dtype(2.0) + ((_dtype(2.0) * z_center) / L_z) ** _dtype(2.0)
+        velocity_x = u_max_d * wp.max(_dtype(0.0), _dtype(1.0) - r_sq)
+        return wp.vec(velocity_x, length=1)
+
+    return bc_profile_warp
+
+
+def calculate_drag_coefficient(lag_forces, reference_velocity, frontal_area, areas_wp):
+    """
+    Calculate the drag coefficient (Cd) from lagrangian forces
+
+    Args:
+        lag_forces: Warp array of forces on vertices
+        reference_velocity: Reference velocity (u_max)
+        frontal_area: Frontal area of the car (from bounding box)
+        areas_wp: Warp array of vertex areas
+
+    Returns:
+        cd: Calculated drag coefficient
+    """
+    forces_np = lag_forces.numpy()
+    drag_forces = forces_np[:, 0]  # X-component is the drag direction
+
+    areas_np = areas_wp.numpy()
+
+    drag_forces = drag_forces * areas_np
+    total_drag = np.sum(drag_forces)
+
+    # Calculate dynamic pressure (rho = 1.0 in lattice units)
+    dynamic_pressure = 0.5 * reference_velocity**2
+
+    cd = total_drag / (dynamic_pressure * frontal_area)
+
+    return cd
+
+
+def setup_boundary_conditions(grid, velocity_set, precision_policy, grid_shape, u_max):
+    inlet, outlet, walls = define_boundary_indices(grid, velocity_set)
+    bc_inlet = RegularizedBC("velocity", indices=inlet, profile=bc_profile(precision_policy, grid_shape, u_max))
+    bc_outlet = ExtrapolationOutflowBC(indices=outlet)
+    bc_walls = FullwayBounceBackBC(indices=walls)
+    return [bc_walls, bc_inlet, bc_outlet]
+
+
+def setup_stepper(grid, boundary_conditions, lbm_omega):
+    return IBMStepper(
+        grid=grid,
+        boundary_conditions=boundary_conditions,
+        collision_type="KBC",
+    )
+
+
+@wp.kernel
+def rotate_wheels(
+    timestep: int,
+    forces: wp.array(dtype=wp.vec3),
+    vertices: wp.array(dtype=wp.vec3),
+    velocities: wp.array(dtype=wp.vec3),
+):
+    idx = wp.tid()
+
+    if idx < _num_body_vertices:
+        velocities[idx] = wp.vec3(0.0, 0.0, 0.0)
+        return
+
+    wheel_id = wp.int32(-1)
+    for b in range(_num_wheels):
+        start = _wheel_starts[b]
+        end = _wheel_ends[b]
+        if (idx >= start) and (idx < end):
+            wheel_id = b
+            break
+
+    if wheel_id == -1:
+        velocities[idx] = wp.vec3(0.0, 0.0, 0.0)
+        return
+
+    center = wp.vec3(
+        _wheel_centers_x[wheel_id],
+        _wheel_centers_y[wheel_id],
+        _wheel_centers_z[wheel_id],
+    )
+
+    rel_pos = vertices[idx] - center
+
+    # Use normalized length to prevent shrinking
+    radius = wp.sqrt(rel_pos[0] * rel_pos[0] + rel_pos[2] * rel_pos[2])
+    if radius > 1e-6:  # Avoid division by zero
+        # Normalize x-z components
+        norm_x = rel_pos[0] / radius
+        norm_z = rel_pos[2] / radius
+
+        # Apply rotation
+        theta = _wheel_speed
+        c = wp.cos(theta)
+        s = wp.sin(theta)
+
+        # Compute new position while preserving the radius
+        x_new = radius * (c * norm_x - s * norm_z)
+        z_new = radius * (s * norm_x + c * norm_z)
+
+        new_rel_pos = wp.vec3(x_new, rel_pos[1], z_new)
+        vertices[idx] = new_rel_pos + center
+        velocities[idx] = wp.vec3(-_wheel_speed * rel_pos[2], 0.0, _wheel_speed * rel_pos[0])
+    else:
+        # For points very close to rotation axis, just keep them as is
+        velocities[idx] = wp.vec3(0.0, 0.0, 0.0)
+
+
+def post_process(
+    i,
+    post_process_interval,
+    f_current,
+    bc_mask,
+    vertices_wp,
+    precision_policy,
+    grid_shape,
+    usd_mesh_vorticity,
+    usd_mesh_q_criterion,
+    usd_stage,
+    vorticity_operator,
+    q_criterion_operator,
+    lag_forces=None,
+    cd_values=None,
+    reference_velocity=None,
+    frontal_area=None,
+    areas_wp=None,
+):
+    if not isinstance(f_current, jnp.ndarray):
+        f_jax = wp.to_jax(f_current)
+    else:
+        f_jax = f_current
+
+    macro_jax = Macroscopic(
+        compute_backend=ComputeBackend.JAX,
+        precision_policy=precision_policy,
+        velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
+    )
+    rho, u = macro_jax(f_jax)
+    u = u[:, 1:-1, 1:-1, 1:-1]
+
+    fields = {
+        "u_magnitude": (u[0] ** 2.0 + u[1] ** 2.0 + u[2] ** 2.0) ** 0.5,
+        "u_x": u[0],
+        "u_y": u[1],
+        "u_z": u[2],
+    }
+    slice_idy = grid_shape[1] // 2
+    save_image(fields["u_magnitude"][:, slice_idy, :], timestep=i)
+    save_fields_vtk(fields, i)
+
+    # Calculate and store drag coefficient if forces are available
+    cd = calculate_drag_coefficient(lag_forces, reference_velocity, frontal_area, areas_wp)
+    cd_values.append((i, cd))
+    if i % post_process_interval == 0:
+        print(f"Step {i}: Drag Coefficient (Cd) = {cd:.6f}")
+        # Save current Cd values to file
+        save_drag_coefficient(cd_values, "drag_coefficient.csv")
+
+    save_usd_vorticity(
+        i,
+        post_process_interval,
+        bc_mask,
+        f_current,
+        grid_shape,
+        usd_mesh_vorticity,
+        vorticity_operator,
+        precision_policy=precision_policy,
+        vorticity_threshold=1e-2,
+        usd_stage=usd_stage,
+    )
+
+    save_usd_q_criterion(
+        i,
+        post_process_interval,
+        bc_mask,
+        f_current,
+        grid_shape,
+        usd_mesh_q_criterion,
+        q_criterion_operator,
+        precision_policy=precision_policy,
+        q_threshold=5e-6,
+        usd_stage=usd_stage,
+    )
+
+    save_usd_car_parts(
+        i,
+        post_process_interval,
+        vertices_wp,
+        num_body_vertices,
+        body_faces_np,
+        wheels_faces_np,
+        usd_car_body,
+        usd_car_wheels,
+        usd_stage,
+        lag_forces=lag_forces,
+    )
+
+
+def save_drag_coefficient(cd_values, filename):
+    """
+    Save drag coefficient values to a CSV file
+
+    Args:
+        cd_values: List of (timestep, cd) tuples
+        filename: Output CSV filename
+    """
+    with open(filename, "w") as f:
+        f.write("timestep,cd\n")
+        for timestep, cd in cd_values:
+            f.write(f"{timestep},{cd}\n")
+
+    # Also create a plot if matplotlib is available
+    try:
+        timesteps = [t for t, _ in cd_values]
+        cds = [cd for _, cd in cd_values]
+
+        plt.figure(figsize=(10, 6))
+        plt.plot(timesteps, cds, "b-")
+        plt.grid(True, linestyle="--", alpha=0.7)
+        plt.xlabel("Timestep")
+        plt.ylabel("Drag Coefficient (Cd)")
+        plt.title("Drag Coefficient vs Time")
+        plt.tight_layout()
+        plt.savefig("drag_coefficient.png", dpi=150)
+        plt.close()
+    except Exception as e:
+        print(f"Could not create Cd plot: {e}")
+
+
+# grid_shape = (1024, 500, 200)
+grid_shape = (256, 100, 100)
+u_max = 0.02
+iter_per_flow_passes = grid_shape[0] / u_max
+num_steps = int(iter_per_flow_passes * 2)
+post_process_interval = 100
+print_interval = 100
+num_wheels = 4
+wheel_rotation_speed = -0.002
+
+Re = 1e6
+clength = grid_shape[0] - 1
+visc = u_max * clength / Re
+omega = 1.0 / (3.0 * visc + 0.5)
+
+compute_backend = ComputeBackend.WARP
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+xlb.init(velocity_set=velocity_set, default_backend=compute_backend, default_precision_policy=precision_policy)
+grid = grid_factory(grid_shape, compute_backend=compute_backend)
+
+print("Car Simulation Configuration:")
+print(f"  Grid size: {grid_shape}")
+print(f"  Omega: {omega}")
+print(f"  Backend: {compute_backend}")
+print(f"  Velocity set: {velocity_set}")
+print(f"  Precision policy: {precision_policy}")
+print(f"  Inlet velocity: {u_max}")
+print(f"  Reynolds number: {Re}")
+print(f"  Max steps: {num_steps}")
+
+# Initialize list to store drag coefficient values
+cd_values = []
+
+usd_output_directory = "usd_output_car"
+os.makedirs(usd_output_directory, exist_ok=True)
+usd_file = os.path.join(usd_output_directory, "car_output.usd")
+usd_stage = Usd.Stage.CreateNew(usd_file)
+usd_mesh_vorticity = UsdGeom.Mesh.Define(usd_stage, "/World/Vorticity")
+usd_mesh_q_criterion = UsdGeom.Mesh.Define(usd_stage, "/World/QCriterion")
+usd_car_body = UsdGeom.Mesh.Define(usd_stage, "/World/CarBody")
+usd_car_wheels = UsdGeom.Mesh.Define(usd_stage, "/World/CarWheels")
+
+mesh_data = load_and_prepare_meshes_car(grid_shape)
+vertices_wp = mesh_data["vertices_wp"]
+areas_wp = mesh_data["areas_wp"]
+faces_np = mesh_data["faces_np"]
+wheel_ranges = mesh_data["wheel_ranges"]
+wheel_centers = mesh_data["wheel_centers"]
+num_body_vertices = mesh_data["num_body_vertices"]
+num_wheel_vertices = mesh_data["num_wheel_vertices"]
+body_faces_np = mesh_data["body_faces_np"]
+wheels_faces_np = mesh_data["wheels_faces_np"]
+frontal_area = mesh_data["frontal_area"]
+
+visualize_car_placement(vertices_wp, grid_shape)
+
+_num_body_vertices = wp.constant(num_body_vertices)
+_num_wheels = wp.constant(num_wheels)
+_wheel_speed = wp.constant(wheel_rotation_speed)
+
+starts_list = [rng[0] for rng in wheel_ranges]
+ends_list = [rng[1] for rng in wheel_ranges]
+cx_list = [c[0] for c in wheel_centers]
+cy_list = [c[1] for c in wheel_centers]
+cz_list = [c[2] for c in wheel_centers]
+
+_wheel_starts = wp.constant(wp.vec(len(starts_list), dtype=int)(starts_list))
+_wheel_ends = wp.constant(wp.vec(len(ends_list), dtype=int)(ends_list))
+_wheel_centers_x = wp.constant(wp.vec(len(cx_list), dtype=float)(cx_list))
+_wheel_centers_y = wp.constant(wp.vec(len(cy_list), dtype=float)(cy_list))
+_wheel_centers_z = wp.constant(wp.vec(len(cz_list), dtype=float)(cz_list))
+
+bc_list = setup_boundary_conditions(grid, velocity_set, precision_policy, grid_shape, u_max)
+stepper = setup_stepper(grid, bc_list, omega)
+f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
+
+device = "cuda:1"
+with wp.ScopedDevice(device):
+    q_criterion_operator = QCriterion(
+        velocity_set=velocity_set,
+        precision_policy=precision_policy,
+        compute_backend=compute_backend,
+    )
+    vorticity_operator = Vorticity(
+        velocity_set=velocity_set,
+        precision_policy=precision_policy,
+        compute_backend=compute_backend,
+    )
+
+velocities_wp = wp.zeros(shape=vertices_wp.shape[0], dtype=wp.vec3)
+
+try:
+    for i in range(num_steps):
+        f_0, f_1, lag_forces = stepper(
+            f_0,
+            f_1,
+            vertices_wp,
+            areas_wp,
+            velocities_wp,
+            bc_mask,
+            missing_mask,
+            omega,
+            i,
+        )
+        # Swap f_0 and f_1
+        f_0, f_1 = f_1, f_0
+
+        # Update solid velocities and positions
+        wp.launch(
+            kernel=rotate_wheels,
+            dim=vertices_wp.shape[0],
+            inputs=[
+                i,
+                lag_forces,
+                vertices_wp,
+                velocities_wp,
+            ],
+        )
+
+        if print_interval > 0 and i % print_interval == 0:
+            print(f"Step {i}/{num_steps} completed")
+
+        if i % post_process_interval == 0 or i == num_steps - 1:
+            post_process(
+                i,
+                post_process_interval,
+                f_0,
+                bc_mask,
+                vertices_wp,
+                precision_policy,
+                grid_shape,
+                usd_mesh_vorticity,
+                usd_mesh_q_criterion,
+                usd_stage,
+                vorticity_operator,
+                q_criterion_operator,
+                lag_forces=lag_forces,
+                cd_values=cd_values,
+                reference_velocity=u_max,
+                frontal_area=frontal_area,
+                areas_wp=areas_wp,
+            )
+
+except KeyboardInterrupt:
+    print("\nSimulation interrupted by user. Saving current USD state...")
+    current_time_code = i // post_process_interval
+    usd_stage.SetStartTimeCode(0)
+    usd_stage.SetEndTimeCode(current_time_code)
+    usd_stage.SetTimeCodesPerSecond(30)
+    usd_stage.Save()
+
+    # Save drag coefficient data before exiting
+    if cd_values:
+        save_drag_coefficient(cd_values, "drag_coefficient.csv")
+        print("Drag coefficient data saved to drag_coefficient.csv")
+
+    print(f"USD file saved with {current_time_code + 1} frames. Exiting.")
+    import sys
+
+    sys.exit(0)
+
+
+usd_stage.SetStartTimeCode(0)
+usd_stage.SetEndTimeCode(num_steps // post_process_interval)
+usd_stage.SetTimeCodesPerSecond(30)
+usd_stage.Save()
+
+# Save final drag coefficient data
+if cd_values:
+    save_drag_coefficient(cd_values, "drag_coefficient.csv")
+    print("Drag coefficient data saved to drag_coefficient.csv")
+
+print("Simulation finished. USD file saved.")
diff --git a/xlb/__init__.py b/xlb/__init__.py
index b58db3bb..f6e85bdb 100644
--- a/xlb/__init__.py
+++ b/xlb/__init__.py
@@ -15,7 +15,8 @@
 import xlb.operator.stream
 import xlb.operator.boundary_condition
 import xlb.operator.macroscopic
-
+import xlb.operator.immersed_boundary
+import xlb.operator.postprocess
 # Grids
 import xlb.grid
 
diff --git a/xlb/default_config.py b/xlb/default_config.py
index 3823353c..632b9143 100644
--- a/xlb/default_config.py
+++ b/xlb/default_config.py
@@ -18,7 +18,7 @@ def init(velocity_set, default_backend, default_precision_policy):
 
     if default_backend == ComputeBackend.WARP:
         import warp as wp
-
+        wp.max_unroll = 32
         wp.init()  # TODO: Must be removed in the future versions of WARP
     elif default_backend == ComputeBackend.JAX:
         check_backend_support()
diff --git a/xlb/helper/__init__.py b/xlb/helper/__init__.py
index d52f2063..aa6dc961 100644
--- a/xlb/helper/__init__.py
+++ b/xlb/helper/__init__.py
@@ -1,3 +1,9 @@
 from xlb.helper.nse_solver import create_nse_fields
 from xlb.helper.initializers import initialize_eq
 from xlb.helper.check_boundary_overlaps import check_bc_overlaps
+from xlb.helper.ibm_helper import (
+    reconstruct_mesh_from_vertices_and_faces,
+    transform_mesh,
+    prepare_immersed_boundary,
+    calculate_voronoi_areas,
+)
diff --git a/xlb/helper/ibm_helper.py b/xlb/helper/ibm_helper.py
new file mode 100644
index 00000000..404dbaf1
--- /dev/null
+++ b/xlb/helper/ibm_helper.py
@@ -0,0 +1,246 @@
+from xlb import DefaultConfig
+from xlb.grid import grid_factory
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import Precision
+from typing import Tuple
+import warp as wp
+import trimesh
+import numpy as np
+
+
+def create_ibm_fields(grid_shape: Tuple[int, int, int], velocity_set=None, precision_policy=None):
+    velocity_set = velocity_set or DefaultConfig.velocity_set
+    compute_backend = ComputeBackend.WARP
+    precision_policy = precision_policy or DefaultConfig.default_precision_policy
+    grid = grid_factory(grid_shape, compute_backend=compute_backend)
+
+    # Create fields
+    f_0 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
+    f_1 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
+    velocity_eulerian = grid.create_field(cardinality=3, dtype=precision_policy.store_precision)
+    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
+    bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
+
+    return grid, f_0, f_1, missing_mask, bc_mask
+
+
+def transform_mesh(mesh, translation=None, rotation=None, rotation_order="xyz", scale=None):
+    """
+    Transform a mesh using translation, rotation, and scaling.
+
+    Parameters
+    ----------
+    mesh : trimesh.Trimesh
+        Input triangle mesh
+    translation : array-like or None, shape (3,)
+        Translation vector [x, y, z]. If None, no translation is applied
+    rotation : array-like or None, shape (3,)
+        Rotation angles in degrees [rx, ry, rz]. If None, no rotation is applied
+    rotation_order : str, default='xyz'
+        Order of rotations. Valid options: 'xyz', 'xzy', 'yxz', 'yzx', 'zxy', 'zyx'
+    scale : float or array-like or None, shape (3,)
+        Scale factor. If float, uniform scaling is applied.
+        If array-like, [sx, sy, sz] for non-uniform scaling.
+        If None, no scaling is applied
+
+    Returns
+    -------
+    trimesh.Trimesh
+        Transformed mesh
+    """
+    # Create a copy of the mesh to avoid modifying the original
+    transformed_mesh = mesh.copy()
+
+    # Apply scaling
+    if scale is not None:
+        if isinstance(scale, (int, float)):
+            scale = [scale, scale, scale]
+        transformed_mesh.apply_scale(scale)
+
+    # Apply rotation
+    if rotation is not None:
+        # Convert degrees to radians
+        rotation = np.array(rotation) * np.pi / 180.0
+
+        # Create rotation matrix based on the specified order
+        matrix = trimesh.transformations.euler_matrix(rotation[0], rotation[1], rotation[2], axes=f"r{rotation_order}")
+        transformed_mesh.apply_transform(matrix)
+
+    # Apply translation
+    if translation is not None:
+        translation_matrix = trimesh.transformations.translation_matrix(translation)
+        transformed_mesh.apply_transform(translation_matrix)
+
+    return transformed_mesh
+
+
+def prepare_immersed_boundary(mesh, max_lbm_length, translation=None, rotation=None, rotation_order="xyz", scale=None):
+    """
+    Prepare an immersed boundary from an STL file with optional transformations.
+
+    Parameters
+    ----------
+    mesh : trimesh.Trimesh
+        Input triangle mesh
+    max_lbm_length : float
+        Desired maximum length in lattice units
+    translation : array-like or None, shape (3,)
+        Translation vector [x, y, z]. If None, no translation is applied
+    rotation : array-like or None, shape (3,)
+        Rotation angles in degrees [rx, ry, rz]. If None, no rotation is applied
+    rotation_order : str, default='xyz'
+        Order of rotations. Valid options: 'xyz', 'xzy', 'yxz', 'yzx', 'zxy', 'zyx'
+    scale : float or array-like or None, shape (3,)
+        Additional scale factor applied after normalization.
+        If float, uniform scaling is applied.
+        If array-like, [sx, sy, sz] for non-uniform scaling.
+        If None, no additional scaling is applied
+
+    Returns
+    -------
+    tuple
+        (vertices_wp, vertex_areas_wp, faces_np)
+        - vertices_wp: Warp array containing vertex coordinates
+        - vertex_areas_wp: Warp array containing Voronoi areas for each vertex
+        - faces_np: NumPy array containing face indices
+    """
+    # First normalize the mesh to the desired LBM length
+    # max_length = mesh.extents.max()
+    # normalize_scale = max_lbm_length / max_length
+    # mesh.apply_scale(normalize_scale)
+
+    # # Apply additional transformations
+    # mesh = transform_mesh(mesh, translation=translation, rotation=rotation, rotation_order=rotation_order, scale=scale)
+
+    # Subdivide to ensure at least one vertex per cell
+    mesh = mesh.subdivide_to_size(max_edge=1.0, max_iter=200)
+
+    # Calculate vertices and voronoi areas
+    vertices_wp, vertex_areas_wp = calculate_voronoi_areas(mesh)
+
+    # Return the faces along with vertices and areas
+    return vertices_wp, vertex_areas_wp, mesh.faces
+
+
+def calculate_voronoi_areas(mesh, check_area=True):
+    """
+    Calculate Voronoi areas for vertices in a triangle mesh using Warp.
+
+    Parameters
+    ----------
+    mesh : trimesh.Trimesh
+        Input triangle mesh
+    check_area : bool, optional
+        Whether to check if the sum of the Voronoi areas matches the mesh area
+
+    Returns
+    -------
+    tuple
+        (vertex_areas_wp, vertex_areas)
+        - vertex_areas_wp: Warp array containing Voronoi areas for each vertex
+        - vertex_areas: NumPy array containing Voronoi areas for each vertex
+    """
+    # Get face areas and vertices of each face
+    face_areas = mesh.area_faces
+    faces = mesh.faces
+    vertices = mesh.vertices
+
+    # Define the number of vertices and faces
+    num_vertices = len(vertices)
+    num_faces = len(faces)
+
+    @wp.kernel
+    def voronoi_area_kernel(
+        faces: wp.array2d(dtype=int), vertices: wp.array(dtype=wp.vec3), face_areas: wp.array1d(dtype=float), vertex_areas: wp.array1d(dtype=float)
+    ):
+        tid = wp.tid()
+
+        # Get vertex indices of the face
+        v0 = faces[tid, 0]
+        v1 = faces[tid, 1]
+        v2 = faces[tid, 2]
+
+        # Get vertex positions
+        p0 = wp.vec3(vertices[v0][0], vertices[v0][1], vertices[v0][2])
+        p1 = wp.vec3(vertices[v1][0], vertices[v1][1], vertices[v1][2])
+        p2 = wp.vec3(vertices[v2][0], vertices[v2][1], vertices[v2][2])
+
+        # Compute edge lengths of the triangle
+        a = wp.length(p1 - p2)
+        b = wp.length(p0 - p2)
+        c = wp.length(p0 - p1)
+
+        # Compute area and cotangent weights
+        face_area = face_areas[tid]
+
+        cot_alpha = (b**2.0 + c**2.0 - a**2.0) / (4.0 * face_area)
+        cot_beta = (a**2.0 + c**2.0 - b**2.0) / (4.0 * face_area)
+        cot_gamma = (a**2.0 + b**2.0 - c**2.0) / (4.0 * face_area)
+
+        # Normalize the cotangent weights
+        total_cot = cot_alpha + cot_beta + cot_gamma
+        if total_cot > 0:
+            cot_alpha /= total_cot
+            cot_beta /= total_cot
+            cot_gamma /= total_cot
+
+        # Distribute the face area to each vertex based on the normalized weights
+        wp.atomic_add(vertex_areas, v0, face_area * cot_beta / 2.0 + face_area * cot_gamma / 2.0)
+        wp.atomic_add(vertex_areas, v1, face_area * cot_alpha / 2.0 + face_area * cot_gamma / 2.0)
+        wp.atomic_add(vertex_areas, v2, face_area * cot_alpha / 2.0 + face_area * cot_beta / 2.0)
+
+    # Convert data to Warp arrays
+    faces_wp = wp.array(faces, dtype=wp.int32)
+    vertices_wp = wp.array(vertices, dtype=wp.vec3)
+    face_areas_wp = wp.array(face_areas, dtype=wp.float32)
+    vertex_areas_wp = wp.zeros(num_vertices, dtype=wp.float32)
+
+    # Launch the kernel
+    wp.launch(kernel=voronoi_area_kernel, dim=num_faces, inputs=[faces_wp, vertices_wp, face_areas_wp, vertex_areas_wp], device="cuda")
+
+    # Validate the result
+    if check_area:
+        vertex_areas_np = vertex_areas_wp.numpy()
+        if abs(vertex_areas_np.sum() - mesh.area) > 1e-2:
+            # Copy the result back to the CPU
+            print("Warning: Sum of Voronoi areas does not match mesh area")
+        else:
+            print("Voronoi areas calculated successfully")
+        print(f"Sum of Voronoi areas: {vertex_areas_np.sum()}")
+        print(f"Mesh area: {mesh.area}")
+
+    return vertices_wp, vertex_areas_wp
+
+
+def reconstruct_mesh_from_vertices_and_faces(vertices_wp, faces_np, save_path=None):
+    """
+    Reconstruct a trimesh from Warp vertices and NumPy-based faces.
+
+    Parameters
+    ----------
+    vertices_wp : wp.array
+        Warp array containing vertex coordinates
+    faces_np : np.ndarray
+        NumPy array containing face indices (each row is a triangle with 3 vertex indices)
+    save_path : str or None, optional
+        If provided, saves the mesh to this path with .stl extension
+
+    Returns
+    -------
+    trimesh.Trimesh
+        Reconstructed mesh
+    """
+    # Convert Warp vertices to numpy
+    vertices = vertices_wp.numpy()
+
+    # Create the mesh using the provided faces
+    mesh = trimesh.Trimesh(vertices=vertices, faces=faces_np)
+
+    # Save if path is provided
+    if save_path is not None:
+        if not save_path.endswith(".stl"):
+            save_path += ".stl"
+        mesh.export(save_path)
+        print(f"Mesh saved to: {save_path}")
+
+    return mesh
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 1950fc1b..8d741798 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -150,7 +150,7 @@ def functional_velocity(
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = f_1[0, index[0], index[1], index[2]]
+            prescribed_value = self.compute_dtype(f_1[0, index[0], index[1], index[2]])
             _u = -prescribed_value * normals
 
             # calculate rho
@@ -186,7 +186,7 @@ def functional_pressure(
 
             # Find the value of rho from the missing directions
             # Since we need only one scalar value, we only need to find one value (stored at the center of f_1)
-            _rho = f_1[0, index[0], index[1], index[2]]
+            _rho = self.compute_dtype(f_1[0, index[0], index[1], index[2]])
 
             # calculate velocity
             fsum = bc_helper.get_bc_fsum(_f, missing_mask)
diff --git a/xlb/operator/immersed_boundary/__init__.py b/xlb/operator/immersed_boundary/__init__.py
new file mode 100644
index 00000000..acbe2d43
--- /dev/null
+++ b/xlb/operator/immersed_boundary/__init__.py
@@ -0,0 +1 @@
+from xlb.helper.ibm_helper import calculate_voronoi_areas
diff --git a/xlb/operator/postprocess/__init__.py b/xlb/operator/postprocess/__init__.py
new file mode 100644
index 00000000..5a590589
--- /dev/null
+++ b/xlb/operator/postprocess/__init__.py
@@ -0,0 +1,3 @@
+from xlb.operator.postprocess.q_criterion import QCriterion
+from xlb.operator.postprocess.grid_to_point import GridToPoint
+from xlb.operator.postprocess.vorticity import Vorticity
\ No newline at end of file
diff --git a/xlb/operator/postprocess/grid_to_point.py b/xlb/operator/postprocess/grid_to_point.py
new file mode 100644
index 00000000..5a7bc818
--- /dev/null
+++ b/xlb/operator/postprocess/grid_to_point.py
@@ -0,0 +1,102 @@
+import warp as wp
+from typing import Any
+from functools import partial
+from jax import jit
+
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+
+
+class GridToPoint(Operator):
+    """
+    Interpolate values from a grid to arbitrary points using trilinear interpolation.
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        # Call the parent constructor
+        super().__init__(
+            velocity_set,
+            precision_policy,
+            compute_backend,
+        )
+
+    def _construct_warp(self):
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(
+            grid: wp.array4d(dtype=Any),
+            points: wp.array(dtype=wp.vec3),
+            point_values: wp.array(dtype=Any),
+        ):
+            # Get the global index
+            i = wp.tid()
+
+            # Get the point
+            point = points[i]
+
+            # Get lower and upper bounds
+            lower_0_0_0 = wp.vec3i(wp.int32(point[0]), wp.int32(point[1]), wp.int32(point[2]))
+            lower_0_0_1 = lower_0_0_0 + wp.vec3i(0, 0, 1)
+            lower_0_1_0 = lower_0_0_0 + wp.vec3i(0, 1, 0)
+            lower_0_1_1 = lower_0_0_0 + wp.vec3i(0, 1, 1)
+            lower_1_0_0 = lower_0_0_0 + wp.vec3i(1, 0, 0)
+            lower_1_0_1 = lower_0_0_0 + wp.vec3i(1, 0, 1)
+            lower_1_1_0 = lower_0_0_0 + wp.vec3i(1, 1, 0)
+            lower_1_1_1 = lower_0_0_0 + wp.vec3i(1, 1, 1)
+
+            # Get grid values
+            grid_0_0_0 = grid[0, lower_0_0_0[0], lower_0_0_0[1], lower_0_0_0[2]]
+            grid_0_0_1 = grid[0, lower_0_0_1[0], lower_0_0_1[1], lower_0_0_1[2]]
+            grid_0_1_0 = grid[0, lower_0_1_0[0], lower_0_1_0[1], lower_0_1_0[2]]
+            grid_0_1_1 = grid[0, lower_0_1_1[0], lower_0_1_1[1], lower_0_1_1[2]]
+            grid_1_0_0 = grid[0, lower_1_0_0[0], lower_1_0_0[1], lower_1_0_0[2]]
+            grid_1_0_1 = grid[0, lower_1_0_1[0], lower_1_0_1[1], lower_1_0_1[2]]
+            grid_1_1_0 = grid[0, lower_1_1_0[0], lower_1_1_0[1], lower_1_1_0[2]]
+            grid_1_1_1 = grid[0, lower_1_1_1[0], lower_1_1_1[1], lower_1_1_1[2]]
+
+            # Compute the interpolation weights
+            dx = point[0] - wp.float32(lower_0_0_0[0])
+            dy = point[1] - wp.float32(lower_0_0_0[1])
+            dz = point[2] - wp.float32(lower_0_0_0[2])
+            w_000 = (1.0 - dx) * (1.0 - dy) * (1.0 - dz)
+            w_001 = (1.0 - dx) * (1.0 - dy) * dz
+            w_010 = (1.0 - dx) * dy * (1.0 - dz)
+            w_011 = (1.0 - dx) * dy * dz
+            w_100 = dx * (1.0 - dy) * (1.0 - dz)
+            w_101 = dx * (1.0 - dy) * dz
+            w_110 = dx * dy * (1.0 - dz)
+            w_111 = dx * dy * dz
+
+            # Compute the interpolated value
+            point_value = (
+                w_000 * grid_0_0_0 + w_001 * grid_0_0_1 + w_010 * grid_0_1_0 + w_011 * grid_0_1_1 +
+                w_100 * grid_1_0_0 + w_101 * grid_1_0_1 + w_110 * grid_1_1_0 + w_111 * grid_1_1_1
+            )
+
+            # Set the output
+            point_values[i] = point_value
+
+        return None, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, grid, points, point_values):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[grid, points, point_values],
+            dim=[points.shape[0]],
+        )
+        return point_values
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    @partial(jit, static_argnums=(0))
+    def jax_implementation(self, grid, points, point_values):
+        # TODO: Implement JAX version
+        raise NotImplementedError("JAX implementation not yet available")
diff --git a/xlb/operator/postprocess/q_criterion.py b/xlb/operator/postprocess/q_criterion.py
new file mode 100644
index 00000000..227b3655
--- /dev/null
+++ b/xlb/operator/postprocess/q_criterion.py
@@ -0,0 +1,143 @@
+# Credit https://github.com/loliverhennigh/XLB Original Author: Oliver Hennigh
+import warp as wp
+from typing import Any
+from functools import partial
+from jax import jit
+
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+
+
+class QCriterion(Operator):
+    """
+    Compute Q-criterion and vorticity magnitude for flow visualization and analysis.
+
+    The Q-criterion is the second invariant of the velocity gradient tensor,
+    defined as Q = 1/2(|Ω|^2 - |S|^2) where Ω is the vorticity tensor and
+    S is the rate-of-strain tensor.
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        # Call the parent constructor
+        super().__init__(
+            velocity_set,
+            precision_policy,
+            compute_backend,
+        )
+
+    def _construct_warp(self):
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(
+            u: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            norm_mu: wp.array4d(dtype=Any),
+            q: wp.array4d(dtype=Any),
+        ):
+            # Get the global index
+            i, j, k = wp.tid()
+
+            # Add ghost cells to index
+            i += 1
+            j += 1
+            k += 1
+
+            # Check if anything on edges
+            b_id_2_1_1 = bc_mask[0, i + 1, j, k]
+            b_id_1_2_1 = bc_mask[0, i, j + 1, k]
+            b_id_1_1_2 = bc_mask[0, i, j, k + 1]
+            b_id_0_1_1 = bc_mask[0, i - 1, j, k]
+            b_id_1_0_1 = bc_mask[0, i, j - 1, k]
+            b_id_1_1_0 = bc_mask[0, i, j, k - 1]
+            if (
+                b_id_2_1_1 != wp.uint8(0)
+                or b_id_1_2_1 != wp.uint8(0)
+                or b_id_1_1_2 != wp.uint8(0)
+                or b_id_0_1_1 != wp.uint8(0)
+                or b_id_1_0_1 != wp.uint8(0)
+                or b_id_1_1_0 != wp.uint8(0)
+            ):
+                return
+
+            # Get derivatives
+            u_x_dx = (u[0, i + 1, j, k] - u[0, i - 1, j, k]) / 2.0
+            u_x_dy = (u[0, i, j + 1, k] - u[0, i, j - 1, k]) / 2.0
+            u_x_dz = (u[0, i, j, k + 1] - u[0, i, j, k - 1]) / 2.0
+            u_y_dx = (u[1, i + 1, j, k] - u[1, i - 1, j, k]) / 2.0
+            u_y_dy = (u[1, i, j + 1, k] - u[1, i, j - 1, k]) / 2.0
+            u_y_dz = (u[1, i, j, k + 1] - u[1, i, j, k - 1]) / 2.0
+            u_z_dx = (u[2, i + 1, j, k] - u[2, i - 1, j, k]) / 2.0
+            u_z_dy = (u[2, i, j + 1, k] - u[2, i, j - 1, k]) / 2.0
+            u_z_dz = (u[2, i, j, k + 1] - u[2, i, j, k - 1]) / 2.0
+
+            # Compute vorticity
+            mu_x = u_z_dy - u_y_dz
+            mu_y = u_x_dz - u_z_dx
+            mu_z = u_y_dx - u_x_dy
+            mu = wp.sqrt(mu_x**2.0 + mu_y**2.0 + mu_z**2.0)
+
+            # Compute strain rate
+            s_0_0 = u_x_dx
+            s_0_1 = 0.5 * (u_x_dy + u_y_dx)
+            s_0_2 = 0.5 * (u_x_dz + u_z_dx)
+            s_1_0 = s_0_1
+            s_1_1 = u_y_dy
+            s_1_2 = 0.5 * (u_y_dz + u_z_dy)
+            s_2_0 = s_0_2
+            s_2_1 = s_1_2
+            s_2_2 = u_z_dz
+            s_dot_s = s_0_0**2.0 + s_0_1**2.0 + s_0_2**2.0 + s_1_0**2.0 + s_1_1**2.0 + s_1_2**2.0 + s_2_0**2.0 + s_2_1**2.0 + s_2_2**2.0
+
+            # Compute omega
+            omega_0_0 = 0.0
+            omega_0_1 = 0.5 * (u_x_dy - u_y_dx)
+            omega_0_2 = 0.5 * (u_x_dz - u_z_dx)
+            omega_1_0 = -omega_0_1
+            omega_1_1 = 0.0
+            omega_1_2 = 0.5 * (u_y_dz - u_z_dy)
+            omega_2_0 = -omega_0_2
+            omega_2_1 = -omega_1_2
+            omega_2_2 = 0.0
+            omega_dot_omega = (
+                omega_0_0**2.0
+                + omega_0_1**2.0
+                + omega_0_2**2.0
+                + omega_1_0**2.0
+                + omega_1_1**2.0
+                + omega_1_2**2.0
+                + omega_2_0**2.0
+                + omega_2_1**2.0
+                + omega_2_2**2.0
+            )
+
+            # Compute q-criterion
+            q_value = 0.5 * (omega_dot_omega - s_dot_s)
+
+            # Set the output
+            norm_mu[0, i, j, k] = mu
+            q[0, i, j, k] = q_value
+
+        return None, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, u, bc_mask, norm_mu, q):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[u, bc_mask, norm_mu, q],
+            dim=[i - 2 for i in u.shape[1:]],
+        )
+        return norm_mu, q
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    @partial(jit, static_argnums=(0))
+    def jax_implementation(self, u, bc_mask, norm_mu, q):
+        # TODO: Implement JAX version
+        raise NotImplementedError("JAX implementation not yet available")
diff --git a/xlb/operator/postprocess/vorticity.py b/xlb/operator/postprocess/vorticity.py
new file mode 100644
index 00000000..55382d02
--- /dev/null
+++ b/xlb/operator/postprocess/vorticity.py
@@ -0,0 +1,101 @@
+import warp as wp
+from typing import Any
+from functools import partial
+from jax import jit
+
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+
+
+class Vorticity(Operator):
+    """
+    Compute vorticity vector and magnitude for flow visualization and analysis.
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        # Call the parent constructor
+        super().__init__(
+            velocity_set,
+            precision_policy,
+            compute_backend,
+        )
+
+    def _construct_warp(self):
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(
+            u: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            vorticity: wp.array4d(dtype=Any),
+            vorticity_magnitude: wp.array4d(dtype=Any),
+        ):
+            # Get the global index
+            i, j, k = wp.tid()
+
+            # Add ghost cells to index
+            i += 1
+            j += 1
+            k += 1
+
+            # Check if anything on edges
+            b_id_2_1_1 = bc_mask[0, i + 1, j, k]
+            b_id_1_2_1 = bc_mask[0, i, j + 1, k]
+            b_id_1_1_2 = bc_mask[0, i, j, k + 1]
+            b_id_0_1_1 = bc_mask[0, i - 1, j, k]
+            b_id_1_0_1 = bc_mask[0, i, j - 1, k]
+            b_id_1_1_0 = bc_mask[0, i, j, k - 1]
+            if (
+                b_id_2_1_1 != wp.uint8(0)
+                or b_id_1_2_1 != wp.uint8(0)
+                or b_id_1_1_2 != wp.uint8(0)
+                or b_id_0_1_1 != wp.uint8(0)
+                or b_id_1_0_1 != wp.uint8(0)
+                or b_id_1_1_0 != wp.uint8(0)
+            ):
+                return
+
+            # Get derivatives using central differences
+            u_x_dy = (u[0, i, j + 1, k] - u[0, i, j - 1, k]) / 2.0
+            u_x_dz = (u[0, i, j, k + 1] - u[0, i, j, k - 1]) / 2.0
+            u_y_dx = (u[1, i + 1, j, k] - u[1, i - 1, j, k]) / 2.0
+            u_y_dz = (u[1, i, j, k + 1] - u[1, i, j, k - 1]) / 2.0
+            u_z_dx = (u[2, i + 1, j, k] - u[2, i - 1, j, k]) / 2.0
+            u_z_dy = (u[2, i, j + 1, k] - u[2, i, j - 1, k]) / 2.0
+
+            # Compute vorticity components (curl of velocity)
+            vort_x = u_z_dy - u_y_dz
+            vort_y = u_x_dz - u_z_dx
+            vort_z = u_y_dx - u_x_dy
+
+            # Store vorticity vector components
+            vorticity[0, i, j, k] = vort_x
+            vorticity[1, i, j, k] = vort_y
+            vorticity[2, i, j, k] = vort_z
+
+            # Compute and store vorticity magnitude
+            vorticity_magnitude[0, i, j, k] = wp.sqrt(vort_x * vort_x + vort_y * vort_y + vort_z * vort_z)
+
+        return None, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, u, bc_mask, vorticity, vorticity_magnitude):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[u, bc_mask, vorticity, vorticity_magnitude],
+            dim=[i - 2 for i in u.shape[1:]],
+        )
+        return vorticity, vorticity_magnitude
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    @partial(jit, static_argnums=(0))
+    def jax_implementation(self, u, bc_mask, vorticity, vorticity_magnitude):
+        # TODO: Implement JAX version
+        raise NotImplementedError("JAX implementation not yet available")
diff --git a/xlb/operator/stepper/__init__.py b/xlb/operator/stepper/__init__.py
index e5d159c6..4d431313 100644
--- a/xlb/operator/stepper/__init__.py
+++ b/xlb/operator/stepper/__init__.py
@@ -1,2 +1,3 @@
 from xlb.operator.stepper.stepper import Stepper
 from xlb.operator.stepper.nse_stepper import IncompressibleNavierStokesStepper
+from xlb.operator.stepper.ibm_stepper import IBMStepper
\ No newline at end of file
diff --git a/xlb/operator/stepper/ibm_stepper.py b/xlb/operator/stepper/ibm_stepper.py
new file mode 100644
index 00000000..84ff55cf
--- /dev/null
+++ b/xlb/operator/stepper/ibm_stepper.py
@@ -0,0 +1,408 @@
+from functools import partial
+from jax import jit
+import warp as wp
+from typing import Any
+from contextlib import nullcontext
+
+from xlb.compute_backend import ComputeBackend
+from xlb.operator import Operator
+from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
+from xlb.operator.stepper.nse_stepper import IncompressibleNavierStokesStepper
+from warp.utils import ScopedTimer
+
+
+class IBMStepper(IncompressibleNavierStokesStepper):
+    def __init__(self, grid, boundary_conditions=[], collision_type="BGK", use_scoped_timer=False):
+        super().__init__(grid, boundary_conditions, collision_type)
+        # Initialize timer context based on the flag
+        self.timer_context = (
+            ScopedTimer("IBM_Stepper", use_nvtx=True, synchronize=True, cuda_filter=wp.TIMING_ALL) if use_scoped_timer else nullcontext()
+        )
+
+        self.grid_dim = grid.shape
+        dim_x, dim_y, dim_z = self.grid_dim
+
+        # Initialize Eulerian points array
+        self.f_eulerian_points = wp.zeros(shape=(dim_x * dim_y * dim_z), dtype=wp.vec3)
+        self.f_eulerian_forces = wp.zeros(shape=(dim_x * dim_y * dim_z), dtype=wp.vec3)
+        self.f_eulerian_velocities = wp.zeros(shape=(dim_x * dim_y * dim_z), dtype=wp.vec3)
+        self.f_eulerian_weights = wp.zeros(shape=(dim_x * dim_y * dim_z), dtype=self.compute_dtype)
+
+        @wp.func
+        def hash_to_grid_idx(hash_idx: int, dim_x: int, dim_y: int) -> wp.vec3i:
+            """Convert hash grid index to 3D grid coordinates"""
+            k = hash_idx // (dim_x * dim_y)
+            j = (hash_idx % (dim_x * dim_y)) // dim_x
+            i = hash_idx % dim_x
+            return wp.vec3i(i, j, k)
+
+        @wp.func
+        def grid_to_hash_idx(i: int, j: int, k: int, dim_x: int, dim_y: int) -> int:
+            """Convert 3D grid coordinates to hash grid index"""
+            return k * (dim_x * dim_y) + j * dim_x + i
+
+        @wp.kernel
+        def init_eulerian_points(points: wp.array(dtype=wp.vec3), dim_x: int, dim_y: int, dim_z: int):
+            idx = wp.tid()
+            grid_pos = hash_to_grid_idx(idx, dim_x, dim_y)
+            points[idx] = wp.vec3(float(grid_pos[0]) + 0.5, float(grid_pos[1]) + 0.5, float(grid_pos[2]) + 0.5)
+
+        # Launch kernel to initialize points
+        wp.launch(kernel=init_eulerian_points, dim=dim_x * dim_y * dim_z, inputs=[self.f_eulerian_points, dim_x, dim_y, dim_z])
+
+        self.hash_grid = wp.HashGrid(dim_x=dim_x, dim_y=dim_y, dim_z=dim_z)
+        self.hash_grid.build(self.f_eulerian_points, 2.0)  # 2.0 is the radius
+
+        self.s_lagr_forces_initialized = False
+
+        self._construct_ibm_warp()
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    @partial(jit, static_argnums=(0))
+    def jax_implementation(self, f_0, f_1, bc_mask, missing_mask, timestep):
+        raise NotImplementedError("IBM stepper is not implemented in JAX backend. Please use WARP backend.")
+
+    def _construct_ibm_warp(self):
+        # Set local constants
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+        _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
+        _opp_indices = self.velocity_set.opp_indices
+        _weights = self.velocity_set.w
+        _c = self.velocity_set.c
+        _dim_x = self.grid_dim[0]
+        _dim_y = self.grid_dim[1]
+
+        # Read the list of bc_to_id created upon instantiation
+        bc_to_id = boundary_condition_registry.bc_to_id
+
+        # Gather IDs of ExtrapolationOutflowBC boundary conditions
+        extrapolation_outflow_bc_ids = []
+        for bc_name, bc_id in bc_to_id.items():
+            if bc_name.startswith("ExtrapolationOutflowBC"):
+                extrapolation_outflow_bc_ids.append(bc_id)
+        # Group active boundary conditions
+        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
+
+        _opp_indices = self.velocity_set.opp_indices
+
+        @wp.func
+        def hash_to_grid_idx(hash_idx: int, dim_x: int, dim_y: int) -> wp.vec3i:
+            """Convert hash grid index to 3D grid coordinates"""
+            k = hash_idx // (dim_x * dim_y)
+            j = (hash_idx % (dim_x * dim_y)) // dim_x
+            i = hash_idx % dim_x
+            return wp.vec3i(i, j, k)
+
+        @wp.func
+        def grid_to_hash_idx(i: int, j: int, k: int, dim_x: int, dim_y: int) -> int:
+            """Convert 3D grid coordinates to hash grid index"""
+            return k * (dim_x * dim_y) + j * dim_x + i
+
+        # Smoothing function as proposed by Peskin
+        @wp.func
+        def peskin_weight(r: float):
+            abs_r = wp.abs(r)
+            if abs_r <= 1.0:
+                return self.compute_dtype(0.125) * (
+                    self.compute_dtype(3.0)
+                    - 2.0 * abs_r
+                    + wp.sqrt(self.compute_dtype(1.0) + self.compute_dtype(4.0) * abs_r - self.compute_dtype(4.0) * abs_r * abs_r)
+                )
+            elif abs_r <= 2.0:
+                return self.compute_dtype(0.125) * (
+                    self.compute_dtype(5.0)
+                    - 2.0 * abs_r
+                    - wp.sqrt(self.compute_dtype(-7.0) + self.compute_dtype(12.0) * abs_r - self.compute_dtype(4.0) * abs_r * abs_r)
+                )
+            else:
+                return self.compute_dtype(0.0)
+
+        @wp.func
+        def weight(x: wp.vec3, Xk: wp.vec3):
+            r = x - Xk
+            return peskin_weight(r[0]) * peskin_weight(r[1]) * peskin_weight(r[2])
+
+        # Kernel to initialize the force on Lagrangian points (Step 1)
+        @wp.kernel
+        def initialize_lagr_force(
+            solid_lagr_velocities: wp.array(dtype=wp.vec3),
+            fluid_lagr_velocities: wp.array(dtype=wp.vec3),
+            lag_forces: wp.array(dtype=wp.vec3),
+        ):
+            tid = wp.tid()
+            vk = solid_lagr_velocities[tid]
+            u_Xk = fluid_lagr_velocities[tid]
+
+            # Initialize force
+            lag_forces[tid] = vk - u_Xk
+
+        # Kernel to interpolate force from Lagrangian to Eulerian grid (Step 2)
+        @wp.kernel
+        def interpolate_force_to_eulerian_atomic(
+            lag_positions: wp.array(dtype=wp.vec3),
+            lag_forces: wp.array(dtype=wp.vec3),
+            lag_areas: wp.array(dtype=Any),
+            eul_positions: wp.array(dtype=wp.vec3),
+            eul_forces: wp.array(dtype=wp.vec3),
+            eul_weights: wp.array(dtype=Any),  # Accumulator for weights
+            grid: wp.uint64,
+        ):
+            tid = wp.tid()
+            Xk = lag_positions[tid]
+            Fk = lag_forces[tid]
+            Ak = lag_areas[tid]
+
+            # Query neighboring Eulerian points
+            query = wp.hash_grid_query(grid, Xk, 2.0)
+            index = int(0)
+
+            while wp.hash_grid_query_next(query, index):
+                x_pos = eul_positions[index]
+                w = weight(x_pos, Xk)
+                # First accumulate the weight
+                wp.atomic_add(eul_weights, index, w)
+                # Then accumulate the weighted force
+                delta_f = Fk * w * Ak
+                wp.atomic_add(eul_forces, index, delta_f)
+
+        # Kernel to interpolate corrected velocities back to Lagrangian points (Step 4)
+        @wp.kernel
+        def interpolate_velocity_to_lagrangian(
+            lag_positions: wp.array(dtype=wp.vec3),
+            eul_positions: wp.array(dtype=wp.vec3),
+            eul_velocities: wp.array(dtype=wp.vec3),
+            lag_fluid_velocities: wp.array(dtype=wp.vec3),
+            grid: wp.uint64,
+        ):
+            tid = wp.tid()
+            Xk = lag_positions[tid]
+
+            # Initialize numerator and denominator for interpolation
+            numerator = wp.vec3(self.compute_dtype(0.0), self.compute_dtype(0.0), self.compute_dtype(0.0))
+            denominator = self.compute_dtype(0.0)
+
+            # Query neighboring Eulerian points
+            query = wp.hash_grid_query(grid, Xk, 2.0)
+            index = int(0)
+
+            while wp.hash_grid_query_next(query, index):
+                x_pos = eul_positions[index]
+                u = eul_velocities[index]
+                w = weight(x_pos, Xk)
+                numerator += u * w
+                denominator += w
+
+            if denominator > self.compute_dtype(0.0):
+                u_interp = numerator / denominator
+            else:
+                u_interp = wp.vec3(0.0, 0.0, 0.0)
+
+            lag_fluid_velocities[tid] = u_interp
+
+        # Kernel to update the force at Lagrangian points (Step 5)
+        @wp.kernel
+        def update_lagr_force(
+            solid_lagr_velocities: wp.array(dtype=wp.vec3),
+            fluid_lagr_velocities: wp.array(dtype=wp.vec3),
+            lag_forces: wp.array(dtype=wp.vec3),
+        ):
+            tid = wp.tid()
+            vk = solid_lagr_velocities[tid]
+            uk = fluid_lagr_velocities[tid]
+            delta_F = vk - uk
+            lag_forces[tid] += delta_F
+
+        @wp.kernel
+        def compute_eulerian_velocity_from_f_1(f_1: wp.array4d(dtype=Any), eul_velocities: wp.array(dtype=wp.vec3)):
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+            # Read from thread local memory
+            _f_1_thread = _f_vec()
+
+            for l in range(self.velocity_set.q):
+                _f_1_thread[l] = self.compute_dtype(f_1[l, index[0], index[1], index[2]])
+
+            _rho, _u = self.macroscopic.warp_functional(_f_1_thread)
+
+            eul_velocities[grid_to_hash_idx(i, j, k, _dim_x, _dim_y)] = _u
+
+        @wp.kernel
+        def correct_population_ibm(f_1: wp.array4d(dtype=Any), eul_forces: wp.array(dtype=wp.vec3)):
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            # Initialize thread-local storage for populations
+            _f1_thread = _f_vec()
+
+            # Retrieve f_1 values for the current grid point
+            for l in range(self.velocity_set.q):
+                _f1_thread[l] = self.compute_dtype(f_1[l, index[0], index[1], index[2]])
+
+            # Compute macroscopic quantities (rho, u) from f_1
+            _rho, _u = self.macroscopic.warp_functional(_f1_thread)
+
+            # Retrieve the force at the current grid point
+            force = eul_forces[grid_to_hash_idx(i, j, k, _dim_x, _dim_y)]
+
+            # Compute equilibrium with force applied
+            feq_force = self.equilibrium.warp_functional(_rho, _u + force)
+            feq = self.equilibrium.warp_functional(_rho, _u)
+
+            # Update f_1 with the new post-collision population
+            for l in range(self.velocity_set.q):
+                f_1[l, index[0], index[1], index[2]] += self.store_dtype(feq_force[l] - feq[l])
+
+        # Add a new kernel for conservation-aware force normalization
+        @wp.kernel
+        def conservation_aware_normalize_forces(
+            eul_forces: wp.array(dtype=wp.vec3),
+            eul_weights: wp.array(dtype=Any),
+            eul_velocities: wp.array(dtype=wp.vec3),
+        ):
+            tid = wp.tid()
+            weight_sum = eul_weights[tid]
+
+            if weight_sum > self.compute_dtype(0.0):
+                # The accumulated forces contain the weighted sum of desired velocity changes
+                # We normalize by weight_sum to get the average desired velocity change
+                # This ensures that we're applying just the right amount of force to achieve
+                # the collective boundary condition, not over-enforcing it
+                eul_forces[tid] = eul_forces[tid] / weight_sum
+
+        # Add a new kernel that combines force interpolation and conservation in one step
+        @wp.kernel
+        def improved_interpolate_force_to_eulerian(
+            lag_positions: wp.array(dtype=wp.vec3),
+            lag_forces: wp.array(dtype=wp.vec3),
+            lag_areas: wp.array(dtype=Any),
+            eul_positions: wp.array(dtype=wp.vec3),
+            eul_forces: wp.array(dtype=wp.vec3),  # Will store desired velocity, not force directly
+            eul_weights: wp.array(dtype=Any),  # For normalization
+            grid: wp.uint64,
+        ):
+            tid = wp.tid()
+            Xk = lag_positions[tid]
+            Fk = lag_forces[tid]  # Fk here represents the desired velocity change at the Lagrangian point
+            Ak = lag_areas[tid]
+
+            # Query neighboring Eulerian points
+            query = wp.hash_grid_query(grid, Xk, 2.0)
+            index = int(0)
+
+            while wp.hash_grid_query_next(query, index):
+                x_pos = eul_positions[index]
+                w = weight(x_pos, Xk)
+
+                # The weight represents how much this Lagrangian point influences this Eulerian point
+                wp.atomic_add(eul_weights, index, w)
+
+                # We accumulate the weighted desired velocity from each Lagrangian point
+                # Each Lagrangian point contributes according to its weight and area
+                target_velocity = Fk * w * Ak
+                wp.atomic_add(eul_forces, index, target_velocity)
+
+        # Add this to the constructor
+        self.improved_interpolate_force_to_eulerian = improved_interpolate_force_to_eulerian
+
+        @wp.kernel
+        def physics_based_normalize_and_correct(
+            eul_forces: wp.array(dtype=wp.vec3),  # Contains accumulated weighted velocities
+            eul_weights: wp.array(dtype=Any),  # Contains sum of weights
+            eul_velocities: wp.array(dtype=wp.vec3),  # Current velocities
+        ):
+            tid = wp.tid()
+            weight_sum = eul_weights[tid]
+
+            if weight_sum > self.compute_dtype(0.0):
+                # Calculate the physically correct target velocity at this Eulerian point
+                # by taking the weighted average of all desired velocities from influencing Lagrangian points
+                target_velocity = eul_forces[tid] / weight_sum
+
+                # The force we need to apply is the difference between the target and current velocity
+                # This is the minimal force needed to satisfy the boundary conditions collectively
+                correction_force = target_velocity - eul_velocities[tid]
+
+                # Store the correction force back in eul_forces for use in later steps
+                eul_forces[tid] = correction_force
+
+        self.physics_based_normalize_and_correct = physics_based_normalize_and_correct
+
+        self.initialize_lagr_force = initialize_lagr_force
+        self.compute_eulerian_velocity_from_f_1 = compute_eulerian_velocity_from_f_1
+        self.interpolate_force_to_eulerian_atomic = interpolate_force_to_eulerian_atomic
+        self.interpolate_velocity_to_lagrangian = interpolate_velocity_to_lagrangian
+        self.update_lagr_force = update_lagr_force
+        self.correct_population_ibm = correct_population_ibm
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(
+        self,
+        f_0,
+        f_1,
+        s_lagr_vertices_wp,
+        lagr_solid_vertex_areas_wp,
+        lagr_solid_velocities_wp,
+        bc_mask,
+        missing_mask,
+        omega,
+        timestep,
+    ):
+        self.s_lagr_forces = wp.zeros(shape=(s_lagr_vertices_wp.shape[0]), dtype=wp.vec3)
+        self.f_lagr_velocities = wp.zeros(shape=(s_lagr_vertices_wp.shape[0]), dtype=wp.vec3)
+
+        with self.timer_context:
+            # Step 1: Perform LBM step
+            wp.launch(kernel=self.warp_kernel, dim=f_0.shape[1:], inputs=[f_0, f_1, bc_mask, missing_mask, omega, timestep])
+
+            num_iterations = 1
+            for _ in range(num_iterations):
+                self.f_eulerian_forces.zero_()
+                self.f_eulerian_weights.zero_()
+                wp.launch(kernel=self.compute_eulerian_velocity_from_f_1, dim=f_1.shape[1:], inputs=[f_1, self.f_eulerian_velocities])
+
+                # Step 2: Use our improved interpolation that accumulates forces with the proper weighting
+                wp.launch(
+                    kernel=self.improved_interpolate_force_to_eulerian,
+                    dim=s_lagr_vertices_wp.shape[0],
+                    inputs=[
+                        s_lagr_vertices_wp,
+                        self.s_lagr_forces,
+                        lagr_solid_vertex_areas_wp,
+                        self.f_eulerian_points,
+                        self.f_eulerian_forces,
+                        self.f_eulerian_weights,
+                        wp.uint64(self.hash_grid.id),
+                    ],
+                )
+
+                # Step 3: Use our physics-based normalization and correction
+                wp.launch(
+                    kernel=self.physics_based_normalize_and_correct,
+                    dim=self.f_eulerian_forces.shape[0],
+                    inputs=[self.f_eulerian_forces, self.f_eulerian_weights, self.f_eulerian_velocities],
+                )
+
+                # Step 4: Interpolate corrected velocities back to Lagrangian points
+                wp.launch(
+                    kernel=self.interpolate_velocity_to_lagrangian,
+                    dim=s_lagr_vertices_wp.shape[0],
+                    inputs=[
+                        s_lagr_vertices_wp,
+                        self.f_eulerian_points,
+                        self.f_eulerian_velocities,
+                        self.f_lagr_velocities,
+                        wp.uint64(self.hash_grid.id),
+                    ],
+                )
+
+                # Step 5: Update Lagrangian forces
+                wp.launch(
+                    kernel=self.update_lagr_force,
+                    dim=s_lagr_vertices_wp.shape[0],
+                    inputs=[lagr_solid_velocities_wp, self.f_lagr_velocities, self.s_lagr_forces],
+                )
+
+            # Step 6: Correct populations
+            wp.launch(kernel=self.correct_population_ibm, dim=f_1.shape[1:], inputs=[f_1, self.f_eulerian_forces])
+
+        return f_0, f_1, self.s_lagr_forces