Fix swapped scalar and simd implementations of strip

raphlinus · raphlinus · commit b0bd9d327b3f · 2024-12-10T21:34:10.000-08:00
The logic for choosing between the scalar and SIMD versions of the strip kernel was inverted: when `use_simd` was set, `strip` called `strip_scalar`, and otherwise it called `strip_simd`.

This makes only a small performance difference, because the strip kernel isn't a large part of the total time.

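Also optimize clamping behavior to take advantage of saturation in float-to-integer conversion operations: the scalar path in `fine.rs` drops the explicit `clamp(0., 1.)`, and the NEON path drops the lower-bound `vmaxq_f32` while keeping the upper-bound `vminq_f32`.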
diff --git a/cpu-sparse/src/fine.rs b/cpu-sparse/src/fine.rs
@@ -57,7 +57,7 @@ impl<'a> Fine<'a> {
             for i in 0..WIDE_TILE_WIDTH {
                 let mut rgba_f32 = [0.0; 4];
                 rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]);
-                let rgba_u8 = rgba_f32.map(|x| (x.clamp(0., 1.) * 255.0).round() as u8);
+                let rgba_u8 = rgba_f32.map(|x| (x * 255.0).round() as u8);
                 self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8);
             }
         }
diff --git a/cpu-sparse/src/simd.rs b/cpu-sparse/src/simd.rs
@@ -44,11 +44,11 @@ impl<'a> Fine<'a> {
 
     pub(crate) fn strip(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) {
         if self.use_simd {
-            self.strip_scalar(x, width, alphas, color);
-        } else {
             unsafe {
                 self.strip_simd(x, width, alphas, color);
             }
+        } else {
+            self.strip_scalar(x, width, alphas, color);
         }
     }
 }
diff --git a/cpu-sparse/src/simd/neon.rs b/cpu-sparse/src/simd/neon.rs
@@ -23,7 +23,7 @@ impl<'a> Fine<'a> {
 
     pub fn pack_simd(&mut self, x: usize, y: usize) {
         unsafe fn cvt(v: float32x4_t) -> uint8x16_t {
-            let clamped = vminq_f32(vmaxq_f32(v, vdupq_n_f32(0.0)), vdupq_n_f32(1.0));
+            let clamped = vminq_f32(v, vdupq_n_f32(1.0));
             let scaled = vmulq_f32(clamped, vdupq_n_f32(255.0));
             vreinterpretq_u8_u32(vcvtnq_u32_f32(scaled))
         }

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ impl<'a> Fine<'a> {`
`57`	`57`	`for i in 0..WIDE_TILE_WIDTH {`
`58`	`58`	`let mut rgba_f32 = [0.0; 4];`
`59`	`59`	`rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]);`
`60`		`- let rgba_u8 = rgba_f32.map(\|x\| (x.clamp(0., 1.) * 255.0).round() as u8);`
	`60`	`+ let rgba_u8 = rgba_f32.map(\|x\| (x * 255.0).round() as u8);`
`61`	`61`	`self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8);`
`62`	`62`	`}`
`63`	`63`	`}`
Original file line number	Diff line number	Diff line change
`@@ -44,11 +44,11 @@ impl<'a> Fine<'a> {`
`44`	`44`
`45`	`45`	`pub(crate) fn strip(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) {`
`46`	`46`	`if self.use_simd {`
`47`		`- self.strip_scalar(x, width, alphas, color);`
`48`		`- } else {`
`49`	`47`	`unsafe {`
`50`	`48`	`self.strip_simd(x, width, alphas, color);`
`51`	`49`	`}`
	`50`	`+ } else {`
	`51`	`+ self.strip_scalar(x, width, alphas, color);`
`52`	`52`	`}`
`53`	`53`	`}`
`54`	`54`	`}`
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ impl<'a> Fine<'a> {`
`23`	`23`
`24`	`24`	`pub fn pack_simd(&mut self, x: usize, y: usize) {`
`25`	`25`	`unsafe fn cvt(v: float32x4_t) -> uint8x16_t {`
`26`		`- let clamped = vminq_f32(vmaxq_f32(v, vdupq_n_f32(0.0)), vdupq_n_f32(1.0));`
	`26`	`+ let clamped = vminq_f32(v, vdupq_n_f32(1.0));`
`27`	`27`	`let scaled = vmulq_f32(clamped, vdupq_n_f32(255.0));`
`28`	`28`	`vreinterpretq_u8_u32(vcvtnq_u32_f32(scaled))`
`29`	`29`	`}`