diff --git a/canvas/Cargo.toml b/canvas/Cargo.toml
index a0b2f58..cb86618 100644
--- a/canvas/Cargo.toml
+++ b/canvas/Cargo.toml
@@ -14,6 +14,11 @@ categories = ["multimedia::images"]
 [dependencies]
 image-texel = { path = "../texel", version = "0.5.0" }
 bytemuck = "1.1"
+libm = { version = "0.2", default-features = false, features = ["arch"] }
+
+[features]
+# Use runtime feature detection on x86 and x86_64 targets. Enabling this links `std`.
+runtime-features = []
 
 [dev-dependencies]
 brunch = "0.6.1"
diff --git a/canvas/src/arch.rs b/canvas/src/arch.rs
index a311f2b..bdcffcd 100644
--- a/canvas/src/arch.rs
+++ b/canvas/src/arch.rs
@@ -1,4 +1,6 @@
 #![allow(unsafe_code)]
+// May be unused if no architecture features are detected at compile time or runtime.
+#[allow(unused_imports)]
 use core::mem::transmute;
 
 // For when we want to make sure we have a texel at compile time based on bytemuck.
@@ -12,8 +14,12 @@ macro_rules! expect_texel {
     };
 }
 
+// May be unused if no architecture features are detected at compile time or runtime.
+#[allow(dead_code)]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod x86_avx2;
+// May be unused if no architecture features are detected at compile time or runtime.
+#[allow(dead_code)]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod x86_ssse3;
 
@@ -30,29 +36,85 @@ pub(crate) struct ShuffleOps {
 
 impl ShuffleOps {
     /// FIXME(perf): implement and choose arch-specific shuffles.
+    // `mut` is only needed on x86/x86_64, where the arch-specific selection reassigns `self`.
+    #[allow(unused_mut)]
     pub fn with_arch(mut self) -> Self {
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            self = self.with_x86();
+        }
+
+        self
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    // `mut` is unneeded without compile-time `ssse3`/`avx2` when `runtime-features` is off.
+    #[allow(unused_mut)]
+    fn with_x86(mut self) -> Self {
+        #[cfg(target_feature = "ssse3")]
+        // SAFETY: `ssse3` detected at compile time
+        unsafe {
+            self = self.with_x86_ssse3();
+        }
+
+        #[cfg(not(target_feature = "ssse3"))]
+        #[cfg(feature = "runtime-features")]
         if std::is_x86_feature_detected!("ssse3") {
-            self.shuffle_u8x4 = unsafe {
-                transmute::<unsafe fn(&mut [[u8; 4]], [u8; 4]), _>(x86_ssse3::shuffle_u8x4)
-            };
-            self.shuffle_u16x4 = unsafe {
-                transmute::<unsafe fn(&mut [[u16; 4]], [u8; 4]), _>(x86_ssse3::shuffle_u16x4)
-            };
+            // SAFETY: `ssse3` detected at runtime
+            unsafe {
+                self = self.with_x86_ssse3();
+            }
+        }
+
+        #[cfg(target_feature = "avx2")]
+        // SAFETY: `avx2` detected at compile time
+        unsafe {
+            self = self.with_x86_avx2();
+        }
+
+        #[cfg(not(target_feature = "avx2"))]
+        #[cfg(feature = "runtime-features")]
+        if std::is_x86_feature_detected!("avx2") {
+            // SAFETY: `avx2` detected at runtime
+            unsafe {
+                self = self.with_x86_avx2();
+            }
         }
 
+        self
+    }
+
+    /// # Safety
+    ///
+    /// Must only be used when the `ssse3` feature is available.
+    // Unused if `ssse3` is not enabled at compile time and `runtime-features` is disabled.
+    #[allow(dead_code)]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    unsafe fn with_x86_ssse3(mut self) -> Self {
+        self.shuffle_u8x4 =
+            unsafe { transmute::<unsafe fn(&mut [[u8; 4]], [u8; 4]), _>(x86_ssse3::shuffle_u8x4) };
+        self.shuffle_u16x4 = unsafe {
+            transmute::<unsafe fn(&mut [[u16; 4]], [u8; 4]), _>(x86_ssse3::shuffle_u16x4)
+        };
+
+        self
+    }
+
+    /// # Safety
+    ///
+    /// Must only be used when the `avx2` feature is available.
+    // Unused if `avx2` is not enabled at compile time and `runtime-features` is disabled.
+    #[allow(dead_code)]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    unsafe fn with_x86_avx2(mut self) -> Self {
         // Note: On Ivy Bridge these have the same *throughput* of 256bit-per-cycle as their SSSE3
         // equivalents until Icelake. With Icelake they are twice as fast at 512bit-per-cycle.
         // Therefore, we don't select them until we find a way to predict/select this.
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-        if std::is_x86_feature_detected!("avx2") {
-            self.shuffle_u8x4 = unsafe {
-                transmute::<unsafe fn(&mut [[u8; 4]], [u8; 4]), _>(x86_avx2::shuffle_u8x4)
-            };
-            self.shuffle_u16x4 = unsafe {
-                transmute::<unsafe fn(&mut [[u16; 4]], [u8; 4]), _>(x86_avx2::shuffle_u16x4)
-            };
-        }
+
+        self.shuffle_u8x4 =
+            unsafe { transmute::<unsafe fn(&mut [[u8; 4]], [u8; 4]), _>(x86_avx2::shuffle_u8x4) };
+        self.shuffle_u16x4 =
+            unsafe { transmute::<unsafe fn(&mut [[u16; 4]], [u8; 4]), _>(x86_avx2::shuffle_u16x4) };
 
         self
     }
diff --git a/canvas/src/bits.rs b/canvas/src/bits.rs
index 3038cdc..6e55551 100644
--- a/canvas/src/bits.rs
+++ b/canvas/src/bits.rs
@@ -1,5 +1,6 @@
 use crate::layout::{SampleBits, SampleParts};
-use image_texel::{AsTexel, Texel};
+use image_texel::AsTexel;
+use image_texel::Texel;
 
 /// Specifies which bits a channel comes from, within a `TexelKind` aggregate.
 #[derive(Clone, Copy, Debug)]
diff --git a/canvas/src/color/oklab.rs b/canvas/src/color/oklab.rs
index bdd9bd2..e2291fe 100644
--- a/canvas/src/color/oklab.rs
+++ b/canvas/src/color/oklab.rs
@@ -1,4 +1,5 @@
 use crate::color_matrix::ColMatrix;
+use libm::powf;
 
 const M1: ColMatrix = ColMatrix([
     [0.8189330101, 0.0329845436, 0.0482003018],
@@ -94,7 +95,7 @@ pub(crate) fn f_lms_inv(lms: [f32; 3]) -> [f32; 3] {
 }
 
 fn pow([a, b, c]: [f32; 3], exp: f32) -> [f32; 3] {
-    [a.powf(exp), b.powf(exp), c.powf(exp)]
+    [powf(a, exp), powf(b, exp), powf(c, exp)]
 }
 
 fn copysign([a, b, c]: [f32; 3], [sa, sb, sc]: [f32; 3]) -> [f32; 3] {
diff --git a/canvas/src/color/srlab2.rs b/canvas/src/color/srlab2.rs
index f53e9ad..849a0df 100644
--- a/canvas/src/color/srlab2.rs
+++ b/canvas/src/color/srlab2.rs
@@ -1,5 +1,6 @@
 use crate::color::Whitepoint;
 use crate::color_matrix::ColMatrix;
+use libm::powf;
 
 #[rustfmt::skip]
 const M_CAT02: ColMatrix = ColMatrix([
@@ -119,7 +120,7 @@ fn non_linearity(lms: [f32; 3]) -> [f32; 3] {
             // Limited to 0.08 precisely
             v * 24389.0 / 2700.0
         } else {
-            1.16 * v.powf(1.0 / 3.0) - 0.16
+            1.16 * powf(v, 1.0 / 3.0) - 0.16
         }
     }
 
@@ -131,7 +132,7 @@ fn non_linearity_inv(lms: [f32; 3]) -> [f32; 3] {
         if v.abs() < 0.08 {
             v * 2700.0 / 24389.0
         } else {
-            ((v + 0.16) / 1.16).powf(3.0)
+            powf((v + 0.16) / 1.16, 3.0)
         }
     }
 
diff --git a/canvas/src/color/transfer.rs b/canvas/src/color/transfer.rs
index ec7b06e..e1b9ecb 100644
--- a/canvas/src/color/transfer.rs
+++ b/canvas/src/color/transfer.rs
@@ -1,7 +1,7 @@
 /// To emulate the syntax used in GLSL more closely.
 #[inline]
 fn pow(base: f32, exp: f32) -> f32 {
-    base.powf(exp)
+    libm::powf(base, exp)
 }
 
 pub fn transfer_oe_bt709(val: f32) -> f32 {
@@ -143,6 +143,7 @@ pub fn transfer_display_scene_smpte2084(val: f32) -> f32 {
 pub fn transfer_oe_smpte2084(val: f32) -> f32 {
     transfer_eo_inv_smpte2084(transfer_scene_display_smpte2084(val))
 }
+#[expect(dead_code)]
 pub fn transfer_oe_inv_smpte2084(val: f32) -> f32 {
     transfer_display_scene_smpte2084(transfer_eo_smpte2084(val))
 }
diff --git a/canvas/src/frame.rs b/canvas/src/frame.rs
index 963b143..09ae416 100644
--- a/canvas/src/frame.rs
+++ b/canvas/src/frame.rs
@@ -1,4 +1,8 @@
 //! A byte-buffer based image descriptor.
+
+use alloc::borrow::ToOwned;
+use alloc::vec::Vec;
+
 use image_texel::image::{ImageMut, ImageRef};
 use image_texel::Image;
 
diff --git a/canvas/src/layout.rs b/canvas/src/layout.rs
index bae5308..3f471b4 100644
--- a/canvas/src/layout.rs
+++ b/canvas/src/layout.rs
@@ -1,5 +1,6 @@
 //! Defines layout and buffer of our images.
-use crate::color::{Color, ColorChannel, ColorChannelModel};
+
+use alloc::boxed::Box;
 
 use image_texel::image::{Coord, ImageRef};
 use image_texel::layout::{
@@ -7,6 +8,7 @@ use image_texel::layout::{
     Strides, TexelLayout,
 };
 
+use crate::color::{Color, ColorChannel, ColorChannelModel};
 use crate::shader::ChunkSpec;
 
 /// The byte layout of a buffer.
diff --git a/canvas/src/lib.rs b/canvas/src/lib.rs
index a0c0908..3930042 100644
--- a/canvas/src/lib.rs
+++ b/canvas/src/lib.rs
@@ -55,6 +55,14 @@
 // Deny, not forbid, unsafe code. In `arch` module we have inherently unsafe code, for the moment.
 // Maybe at a future point we gain some possibility to write such code safely.
 #![deny(unsafe_code)]
+// Stay on `std` for doctests; this avoids a warning about a missing allocator.
+#![cfg_attr(not(doctest), no_std)]
+
+#[cfg(feature = "runtime-features")]
+extern crate std;
+
+#[macro_use]
+extern crate alloc;
 
 mod arch;
 mod bits;
diff --git a/canvas/src/shader.rs b/canvas/src/shader.rs
index a1102c4..97ef2f9 100644
--- a/canvas/src/shader.rs
+++ b/canvas/src/shader.rs
@@ -2,6 +2,7 @@
 //!
 //! Takes quite a lot of inspiration from how GPUs work. We have a primitive sampler unit, a
 //! fragment unit, and pipeline multiple texels in parallel.
+use alloc::vec::Vec;
 use core::ops::Range;
 use image_texel::image::{ImageMut, ImageRef};
 use image_texel::{AsTexel, Texel, TexelBuffer};
@@ -1482,7 +1483,9 @@ impl CommonPixel {
                 // FIXME: do the transform u32::from_ne_bytes(x.as_ne_bytes()) when appropriate.
                 join_fn: |num, bits, idx| {
                     let max_val = bits.mask();
-                    let raw = (num[(idx & 0x3) as usize] * max_val as f32).round() as u32;
+                    // Matches `x.round() as u32` for non-negative f32 unless `x + 0.5` rounds in f32.
+                    let round = |x| (x + 0.5) as u32;
+                    let raw = round(num[(idx & 0x3) as usize] * max_val as f32);
                     raw.min(max_val)
                 },
                 bits,
diff --git a/drm/src/lib.rs b/drm/src/lib.rs
index 8ab5702..6123ca6 100644
--- a/drm/src/lib.rs
+++ b/drm/src/lib.rs
@@ -13,6 +13,9 @@
 //! pixel matrix. Then some of those formats map cleanly to planes of color information that can be
 //! viewed as a matrix with strides, which finally enables useful operations such as
 //! initialization.
+// Stay on `std` for doctests; this avoids a warning about a missing allocator.
+#![cfg_attr(not(doctest), no_std)]
+
 use canvas::{layout, texels};
 use core::convert::TryFrom;
 use core::fmt;