teerthsharma · teerthsharma · Apr 30, 2026 · May 2, 2026
diff --git a/.jules/bolt.md b/.jules/bolt.md
@@ -1,3 +1,11 @@
 ## 2026-01-29 - Single Pass Variance Calculation in Manifold Heap
 **Learning:** The `ChebyshevGuard::calculate` function in `ManifoldHeap` was performing two passes over the memory blocks to calculate mean and variance separately. This is a common pattern when following the mathematical definition directly. However, in a performance-critical "metabolism" loop (GC), this doubles the memory access overhead.
 **Action:** Always check for opportunities to compute statistics (mean, variance) in a single pass using Welford's algorithm or accumulated sums, especially when iterating over large data structures.
+
+## 2026-05-01 - Avoid High-Level Tensor Ops in Scalar Reductions
+**Learning:** High-level `Tensor` operations like `sub()` and `mul()` trigger intermediate heap allocations for shape and stride metadata. When computing scalar reductions (like MSE, distances, or loss functions), using these operations introduces severe memory overhead inside hot loops. Attempting to use `.min()` length truncation as a safeguard is an anti-pattern as it masks shape mismatch errors.
+**Action:** For scalar reductions, assert shape equality (`assert_eq!(a.shape, b.shape)`) and perform a single-pass iteration directly over the underlying borrowed data arrays (`a.data.borrow()`) to eliminate intermediate allocations and safely compute the result.
+
+## 2026-05-18 - Single-Pass Scalar Reductions in Linear Algebra
+**Learning:** High-level tensor operations like `a.sub(b)` and `.mul()` generate costly intermediate `Tensor` heap allocations containing shape and strides metadata. When computing scalar reductions (like loss functions or distance metrics) over tensors, this overhead is unnecessary and significantly impacts performance in hot paths.
+**Action:** Always avoid intermediate `Tensor` allocations for scalar reductions (e.g., `mse`, `mae`, `binary_cross_entropy`, `hinge_loss`, `euclidean_distance`, `manhattan_distance`, `chebyshev_distance`, `rbf_kernel`). Instead, assert shape equality (`assert_eq!(a.shape, b.shape)`) and perform a single-pass iteration directly over the underlying borrowed data arrays (`a.data.borrow()`).
diff --git a/crates/aegis-cli/src/main.rs b/crates/aegis-cli/src/main.rs
@@ -58,15 +58,17 @@ fn main() {
     // Spawn a thread with 8MB stack to prevent overflow.
     let builder = std::thread::Builder::new().stack_size(8 * 1024 * 1024);
 
-    let handler = builder.spawn(|| {
-        let cli = Cli::parse();
-
-        match cli.command {
-            Some(Commands::Repl) | None => run_repl(),
-            Some(Commands::Run { file, mode }) => run_file(&file, &mode),
-            Some(Commands::Check { file }) => check_file(&file),
-        }
-    }).unwrap();
+    let handler = builder
+        .spawn(|| {
+            let cli = Cli::parse();
+
+            match cli.command {
+                Some(Commands::Repl) | None => run_repl(),
+                Some(Commands::Run { file, mode }) => run_file(&file, &mode),
+                Some(Commands::Check { file }) => check_file(&file),
+            }
+        })
+        .unwrap();
 
     handler.join().unwrap();
 }
@@ -166,7 +168,10 @@ fn run_file(path: &PathBuf, mode: &str) {
     if let Some(ext) = path.extension() {
         let s = ext.to_string_lossy();
         if s != "aegis" && s != "ag" {
-            println!("Warning: File extension '.{}' is not standard (.aegis or .ag)", s);
+            println!(
+                "Warning: File extension '.{}' is not standard (.aegis or .ag)",
+                s
+            );
         }
     }
 
@@ -181,14 +186,14 @@ fn run_file(path: &PathBuf, mode: &str) {
     };
 
     if mode == "titan" {
-        use aegis_lang::vm::{TitanVM, Compiler};
+        use aegis_lang::vm::{Compiler, TitanVM};
         // Compile to Bytecode
         let compiler = Compiler::new();
         let code = compiler.compile(&ast);
-        
+
         let mut vm = TitanVM::new();
         vm.load_code(code);
-        
+
         match vm.run() {
             Ok(result) => {
                 println!("{:?}", result);

diff --git a/crates/aegis-core/src/memory.rs b/crates/aegis-core/src/memory.rs
@@ -13,9 +13,9 @@
 #[cfg(feature = "std")]
 use std::thread;
 
-use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use core::cell::UnsafeCell;
 use core::marker::PhantomData;
+use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 
 /// The number of concurrent "Time Dimensions" (Shards) in the clock.
 /// 32 Shards ensures minimal contention even on high-core-count Titan machines.
@@ -65,31 +65,36 @@ impl<T, const SIZE: usize> TitanClock<T, SIZE> {
 
     pub fn new() -> Self {
         // Assert SIZE is divisible by SHARDS for simplicity
-        assert!(SIZE % SHARDS == 0, "Manifold SIZE must be divisible by 32 (SHARDS)");
+        assert!(
+            SIZE % SHARDS == 0,
+            "Manifold SIZE must be divisible by 32 (SHARDS)"
+        );
 
         Self {
             manifold: core::array::from_fn(|_| Slot::new()),
-            hands: core::array::from_fn(|_| TimeHand { index: AtomicUsize::new(0) }),
+            hands: core::array::from_fn(|_| TimeHand {
+                index: AtomicUsize::new(0),
+            }),
             _marker: PhantomData,
         }
     }
-    
+
     /// Reserve a slot using the clock algorithm.
     /// Returns the index of a metabolically available slot.
     fn reserve_slot(&self) -> usize {
         // In a real implementation, we would hash the Thread ID to pick a starting shard.
         // For portable no_std, we can use a relaxed global counter or just start at 0.
         // To reduce contention, let's just create a pseudo-random start based on the stack pointer or similar?
         // Or just iterate efficiently.
-        let start_shard = 0; 
-        
+        let start_shard = 0;
+
         for i in 0..SHARDS {
-             let shard_id = (start_shard + i) % SHARDS;
-             if let Some(idx) = self.try_reserve_in_shard(shard_id) {
-                 return idx;
-             }
+            let shard_id = (start_shard + i) % SHARDS;
+            if let Some(idx) = self.try_reserve_in_shard(shard_id) {
+                return idx;
+            }
         }
-        
+
         // If all shards are saturated (entropy storm), we force the "Big Bang" (overwrite) in shard 0.
         self.force_reserve_in_shard(0)
     }
@@ -102,7 +107,7 @@ impl<T, const SIZE: usize> TitanClock<T, SIZE> {
 
         // Limit search to 2 revolutions (Second Chance Algorithm requirement)
         let limit = Self::STACK_SIZE * 2;
-        
+
         for _ in 0..limit {
             // Atomic increment of the hand
             let local_idx = hand.index.fetch_add(1, Ordering::Relaxed) % Self::STACK_SIZE;
@@ -112,19 +117,19 @@ impl<T, const SIZE: usize> TitanClock<T, SIZE> {
             // Bio-Clock Logic:
             // If Energy=1 (Hot) -> Set Energy=0 (Cold) and Continue.
             // If Energy=0 (Cold) -> Claim it.
-            
+
             // We use compare_exchange to be pedantic, but specialized Load/Store is fine for heuristic.
             // If we see Hot, make it Cold.
             if slot.energy.load(Ordering::Acquire) {
-                 slot.energy.store(false, Ordering::Release);
-                 // We don't take it. We give it a second chance.
+                slot.energy.store(false, Ordering::Release);
+                // We don't take it. We give it a second chance.
             } else {
-                 // It's cold. We take it.
-                 // Ideally we should CAS a "claiming" bit to ensure unique ownership in race.
-                 // But for this "Bio" memory, Last-Writer-Wins on the same slot is acceptable noise
-                 // provided we don't drop live data.
-                 // Since it was cold, it deemed dead.
-                 return Some(global_idx);
+                // It's cold. We take it.
+                // Ideally we should CAS a "claiming" bit to ensure unique ownership in race.
+                // But for this "Bio" memory, Last-Writer-Wins on the same slot is acceptable noise
+                // provided we don't drop live data.
+                // Since it was cold, it is deemed dead.
+                return Some(global_idx);
             }
         }
         None
@@ -136,33 +141,35 @@ impl<T, const SIZE: usize> TitanClock<T, SIZE> {
         let local_idx = hand.index.fetch_add(1, Ordering::Relaxed) % Self::STACK_SIZE;
         shard_id * Self::STACK_SIZE + local_idx
     }
-    
+
     /// Public Allocator API
     /// O(1) amortized. Lock-Free.
     pub fn alloc(&self, item: T) -> usize {
         let idx = self.reserve_slot();
         let slot = &self.manifold[idx];
-        
+
         unsafe {
-             // Drop old data if present (metabolism)
-             // *slot.data.get() = None; // redundant if we overwrite immediately
-             *slot.data.get() = Some(item);
+            // Drop old data if present (metabolism)
+            // *slot.data.get() = None; // redundant if we overwrite immediately
+            *slot.data.get() = Some(item);
         }
-        
+
         // Spark of Life
         slot.energy.store(true, Ordering::Release);
-        
+
         idx
     }
-    
+
     /// Access data. Energizes the slot (refreshes the bit).
     pub fn access(&self, index: usize) -> Option<&T> {
-        if index >= SIZE { return None; }
-
+        if index >= SIZE {
+            return None;
+        }
+
         let slot = &self.manifold[index];
         // Bio-Feedback: Reading the memory strengthens its synapse
         slot.energy.store(true, Ordering::Relaxed);
-        
+
         unsafe { (*slot.data.get()).as_ref() }
     }
 }
@@ -176,7 +183,7 @@ mod tests {
     use super::*;
     use std::sync::Arc;
     use std::thread;
-    
+
     #[test]
     fn test_titan_genesis() {
         // 32 Shards * 2 = 64 slots
@@ -185,36 +192,36 @@ mod tests {
         assert!(idx < 64);
         assert_eq!(*clock.access(idx).unwrap(), 42);
     }
-    
+
     #[test]
     fn test_shard_saturation() {
         // 32 shards * 1 slot each = 32 slots total.
-        let clock: TitanClock<i32, 32> = TitanClock::new(); 
-        
+        let clock: TitanClock<i32, 32> = TitanClock::new();
+
         // Fill everything
         for i in 0..32 {
             clock.alloc(i);
         }
-        
+
         // Access everything to make it HOT
         for i in 0..32 {
             clock.access(i);
         }
-        
+
         // Now Alloc 33.
         // It must scan, turn something cold, and eventualy overwrite.
         let idx_new = clock.alloc(100);
-        
+
         assert_eq!(*clock.access(idx_new).unwrap(), 100);
     }
-    
+
     /*
     #[test]
     fn test_multithreaded_stress() {
         let clock = Arc::new(TitanClock::<usize, 1024>::new()); // 32 slots per shard
-        
+
         let mut handles: Vec<std::thread::JoinHandle<()>> = Vec::new(); // Use Vec::new()
-        
+
         // Spawn 10 Titan Threads
         for t in 0..10 {
             let c = clock.clone();
@@ -226,11 +233,11 @@ mod tests {
                 }
             }));
         }
-        
+
         for h in handles {
             h.join().unwrap();
         }
-        
+
         // Verify manifold integrity
         // Just checking we can read index 0 without panic
         assert!(clock.access(0).is_some() || clock.access(0).is_none());

diff --git a/crates/aether-cli/src/main.rs b/crates/aether-cli/src/main.rs
@@ -58,15 +58,17 @@ fn main() {
     // Spawn a thread with 8MB stack to prevent overflow.
     let builder = std::thread::Builder::new().stack_size(8 * 1024 * 1024);
 
-    let handler = builder.spawn(|| {
-        let cli = Cli::parse();
-
-        match cli.command {
-            Some(Commands::Repl) | None => run_repl(),
-            Some(Commands::Run { file, mode }) => run_file(&file, &mode),
-            Some(Commands::Check { file }) => check_file(&file),
-        }
-    }).unwrap();
+    let handler = builder
+        .spawn(|| {
+            let cli = Cli::parse();
+
+            match cli.command {
+                Some(Commands::Repl) | None => run_repl(),
+                Some(Commands::Run { file, mode }) => run_file(&file, &mode),
+                Some(Commands::Check { file }) => check_file(&file),
+            }
+        })
+        .unwrap();
 
     handler.join().unwrap();
 }
@@ -166,7 +168,10 @@ fn run_file(path: &PathBuf, mode: &str) {
     if let Some(ext) = path.extension() {
         let s = ext.to_string_lossy();
         if s != "aether" && s != "ae" {
-            println!("Warning: File extension '.{}' is not standard (.aether or .ae)", s);
+            println!(
+                "Warning: File extension '.{}' is not standard (.aether or .ae)",
+                s
+            );
         }
     }
 
@@ -181,14 +186,14 @@ fn run_file(path: &PathBuf, mode: &str) {
     };
 
     if mode == "titan" {
-        use aether_lang::vm::{TitanVM, Compiler};
+        use aether_lang::vm::{Compiler, TitanVM};
         // Compile to Bytecode
         let compiler = Compiler::new();
         let code = compiler.compile(&ast);
-        
+
         let mut vm = TitanVM::new();
         vm.load_code(code);
-        
+
         match vm.run() {
             Ok(result) => {
                 println!("{:?}", result);

diff --git a/crates/aether-core/src/aether.rs b/crates/aether-core/src/aether.rs
@@ -304,17 +304,21 @@ impl<const D: usize> HierarchicalBlockTree<D> {
         let mut active_l1 = [false; MAX_BLOCKS];
         for (i, active) in active_l1.iter_mut().enumerate().take(self.counts[1]) {
             let parent = i / 4;
-            if parent < self.counts[2] && active_l2[parent] 
-               && !self.levels[1][i].can_prune(query, threshold) {
+            if parent < self.counts[2]
+                && active_l2[parent]
+                && !self.levels[1][i].can_prune(query, threshold)
+            {
                 *active = true;
             }
         }
 
         // Level 0 (finest) - final result
         for (i, res) in result.iter_mut().enumerate().take(self.counts[0]) {
             let parent = i / 4;
-            if parent < self.counts[1] && active_l1[parent]
-               && !self.levels[0][i].can_prune(query, threshold) {
+            if parent < self.counts[1]
+                && active_l1[parent]
+                && !self.levels[0][i].can_prune(query, threshold)
+            {
                 *res = true;
             }
         }

diff --git a/crates/aether-core/src/lib.rs b/crates/aether-core/src/lib.rs
@@ -25,11 +25,11 @@ extern crate alloc;
 pub mod aether;
 pub mod governor;
 pub mod manifold;
+pub mod memory;
 pub mod ml;
-pub mod state;
 pub mod os;
+pub mod state;
 pub mod topology;
-pub mod memory;
 
 // Re-export key types for convenience
 pub use aether::{BlockMetadata, DriftDetector, HierarchicalBlockTree};