LegNeato
diff --git a/crates/cuda_std/src/warp/shuffle.rs b/crates/cuda_std/src/warp/shuffle.rs
Lines changed: 196 additions & 84 deletions
diff --git a/tests/compiletests/ui/dis/shuffle_crashing.rs b/tests/compiletests/ui/dis/shuffle_crashing.rs
Lines changed: 4 additions & 4 deletions
diff --git a/tests/compiletests/ui/dis/shuffle_working.rs b/tests/compiletests/ui/dis/shuffle_working.rs
Lines changed: 3 additions & 3 deletions
diff --git a/tests/compiletests/ui/warp/matrix/basic_matrix.rs b/tests/compiletests/ui/warp/matrix/basic_matrix.rs
Lines changed: 18 additions & 18 deletions
diff --git a/tests/compiletests/ui/warp/matrix/invalid_matrix_dims.rs b/tests/compiletests/ui/warp/matrix/invalid_matrix_dims.rs
Lines changed: 5 additions & 5 deletions
diff --git a/tests/compiletests/ui/warp/shuffle/FIXME.md b/tests/compiletests/ui/warp/shuffle/FIXME.md
Lines changed: 0 additions & 25 deletions
diff --git a/tests/compiletests/ui/warp/shuffle/debug_shuffle.rs b/tests/compiletests/ui/warp/shuffle/debug_shuffle.rs
Lines changed: 0 additions & 47 deletions
diff --git a/tests/compiletests/ui/warp/shuffle/generic_no_bound_test.rs b/tests/compiletests/ui/warp/shuffle/generic_no_bound_test.rs
Lines changed: 0 additions & 37 deletions
diff --git a/tests/compiletests/ui/warp/shuffle/generic_struct_test.rs b/tests/compiletests/ui/warp/shuffle/generic_struct_test.rs
Lines changed: 0 additions & 38 deletions
diff --git a/tests/compiletests/ui/warp/shuffle/invalid_lane_id.rs b/tests/compiletests/ui/warp/shuffle/invalid_lane_id.rs
Lines changed: 0 additions & 28 deletions
@@ -2,19 +2,19 @@
 // compile-flags: -Cllvm-args=--disassemble-entry=test_crashing --error-format=human
 
 use cuda_std::kernel;
-use cuda_std::warp::{WarpMask, Shuffle, ShuffleWidth};
 use cuda_std::warp::shuffle::patterns;
+use cuda_std::warp::{Shuffle, ShuffleWidth, WarpMask};
 
 #[kernel]
 pub unsafe fn test_crashing() {
     let mask = WarpMask::all();
     let width = ShuffleWidth::full_warp();
     // This is now a ZST - the parameters are ignored
     let shuffle = Shuffle::<i32>::new(mask, width);
-    
+
     let value = 42i32;
     let pattern = patterns::Down::new(1);
-    
+
     // Method call - should not crash now as shuffle is a ZST
     let _result = shuffle.down(value, pattern);
-}
+}
@@ -2,13 +2,13 @@
 // compile-flags: -Cllvm-args=--disassemble-entry=test_working --error-format=human
 
 use cuda_std::kernel;
-use cuda_std::warp::{WarpMask, ShuffleValue};
+use cuda_std::warp::{ShuffleValue, WarpMask};
 
 #[kernel]
 pub unsafe fn test_working() {
     let mask = WarpMask::all();
     let value = 42i32;
-    
+
     // Direct trait call - this works
     let _result = <i32 as ShuffleValue>::shuffle_down(mask, value, 1, 32);
-}
+}
@@ -1,31 +1,31 @@
 // Test CUDA warp matrix functions (tensor core) compile correctly
 // build-pass
 
-use cuda_std::kernel;
-use cuda_std::warp::matrix::{TensorCore, dims, layout, MatrixElement};
 use cuda_std::half::f16;
+use cuda_std::kernel;
+use cuda_std::warp::matrix::{dims, layout, MatrixElement, TensorCore};
 
 #[kernel]
 pub unsafe fn test_warp_matrix_type_safe() {
     // Create a tensor core operation with compile-time validated dimensions
     // Type-driven API: specify element type directly
     let tc_f16 = TensorCore::<f16, dims::Shape<16, 16, 16>>::new();
-    
+
     // Create matrix fragments with type-safe API
     let mut a_fragment = tc_f16.matrix_a::<layout::Row>();
     let mut b_fragment = tc_f16.matrix_b::<layout::Col>();
-    let mut c_fragment = tc_f16.accumulator();  // Returns f32 accumulator by default
-    
+    let mut c_fragment = tc_f16.accumulator(); // Returns f32 accumulator by default
+
     // Initialize accumulator
     c_fragment.fill(0.0);
-    
+
     // Load operations would use compile-time stride validation
     // a_fragment.load::<64>(&a_matrix);  // STRIDE validated at compile time
     // b_fragment.load::<64>(&b_matrix);
-    
+
     // Perform matrix multiply-accumulate
     // c_fragment.mma(&a_fragment, &b_fragment);
-    
+
     // Store result
     // c_fragment.store::<layout::Row, 64>(&mut c_matrix);
 }
@@ -35,11 +35,11 @@ pub unsafe fn test_different_tensor_shapes() {
     // 16x16x16 - Most common configuration
     let tc_16x16 = TensorCore::<f16, dims::Shape<16, 16, 16>>::new();
     let _a = tc_16x16.matrix_a::<layout::Row>();
-    
+
     // 32x8x16 - Tall and skinny
     let tc_32x8 = TensorCore::<f16, dims::Shape<32, 8, 16>>::new();
     let _a = tc_32x8.matrix_a::<layout::Row>();
-    
+
     // 8x32x16 - Short and wide
     let tc_8x32 = TensorCore::<f16, dims::Shape<8, 32, 16>>::new();
     let _a = tc_8x32.matrix_a::<layout::Row>();
@@ -52,38 +52,38 @@ pub unsafe fn test_different_element_types() {
         let tc = TensorCore::<f16, dims::Shape<16, 16, 16>>::new();
         let _a = tc.matrix_a::<layout::Row>();
         let _b = tc.matrix_b::<layout::Col>();
-        let _c = tc.accumulator();  // f32 by default for f16
+        let _c = tc.accumulator(); // f32 by default for f16
     }
-    
+
     // f16 input, f16 accumulator
     {
         let tc = TensorCore::<f16, dims::Shape<16, 16, 16>>::new();
         let _a = tc.matrix_a::<layout::Row>();
         let _b = tc.matrix_b::<layout::Col>();
-        let _c = tc.accumulator_f16();  // explicitly use f16 accumulator
+        let _c = tc.accumulator_f16(); // explicitly use f16 accumulator
     }
-    
+
     // i8 input, i32 accumulator
     {
         let tc = TensorCore::<i8, dims::Shape<16, 16, 16>>::new();
         let _a = tc.matrix_a::<layout::Row>();
         let _b = tc.matrix_b::<layout::Col>();
-        let _c = tc.accumulator();  // i32 for i8
+        let _c = tc.accumulator(); // i32 for i8
     }
 }
 
 #[kernel]
 pub unsafe fn test_layout_combinations() {
     let tc = TensorCore::<f16, dims::Shape<16, 16, 16>>::new();
-    
+
     // All valid layout combinations
     let _a_row = tc.matrix_a::<layout::Row>();
     let _a_col = tc.matrix_a::<layout::Col>();
     let _b_row = tc.matrix_b::<layout::Row>();
     let _b_col = tc.matrix_b::<layout::Col>();
-    
+
     // Accumulator can have different layouts for storage
     let acc = tc.accumulator();
     // acc.store::<layout::Row, 64>(&mut output);
     // acc.store::<layout::Col, 64>(&mut output);
-}
+}
@@ -1,21 +1,21 @@
 // Test that invalid matrix dimensions cannot be created
 // compile-fail
 
-use cuda_std::kernel;
-use cuda_std::warp::matrix::{TensorCore, dims};
 use cuda_std::half::f16;
+use cuda_std::kernel;
+use cuda_std::warp::matrix::{dims, TensorCore};
 
 #[kernel]
 pub unsafe fn test_invalid_matrix_dimensions() {
     // This should fail - 17x17x17 is not a valid tensor core shape
     let invalid = TensorCore::<f16, dims::Shape<17, 17, 17>>::new();
     //~^ ERROR the trait bound `Shape<17, 17, 17>: TensorCoreShape` is not satisfied
-    
+
     // This should fail - 64x64x64 is too large
     let too_large = TensorCore::<f16, dims::Shape<64, 64, 64>>::new();
     //~^ ERROR the trait bound `Shape<64, 64, 64>: TensorCoreShape` is not satisfied
-    
+
     // This should fail - 1x1x1 is too small
     let too_small = TensorCore::<f16, dims::Shape<1, 1, 1>>::new();
     //~^ ERROR the trait bound `Shape<1, 1, 1>: TensorCoreShape` is not satisfied
-}
+}