feat: complete MSL Phase 3 Wilson Dslash kernel 8-way stencil logic

Petrus Pennanen · Petrus Pennanen · commit 5dd94bc6b030 · 2026-02-23T23:37:44.000+01:00
diff --git a/Grid/Grid/qcd/action/fermion/WilsonKernels.metal b/Grid/Grid/qcd/action/fermion/WilsonKernels.metal
@@ -1,41 +1,194 @@
 #include <metal_stdlib>
 using namespace metal;
 
-// In Grid, SiteSpinor and SiteHalfSpinor are heavily templated.
-// For Metal, we define the memory layout for SU(Nc) Nc=3, Nd=4.
-// Spinor has 4 spin components, each with 3 color components.
-// Each component is a complex number (2 floats or 2 doubles).
-// For simplicity in Phase 2, we assume single precision floats (or we can use macros for precision).
-
-// Complex number structure
-struct Complex {
-    float real;
-    float imag;
+struct StencilEntry { // one entry per (direction, site), read as stencil[dir * Nsite + sU]; four 32-bit fields
+    uint offset; // neighbour index; GenericDhopSite uses it to index in_spinor
+    uint is_local; // NOTE(review): presumably non-zero when the neighbour is held in in_spinor rather than a halo buffer -- confirm
+    uint permute; // non-zero: the spProj* helpers swap the two SIMD lanes of the projected half spinor
+    uint around_the_world; // fourth 32-bit field; NOTE(review): presumably Grid's wrap-around flag, or padding to match the host layout -- confirm
 };
 
-// HalfSpinor: 2 spin components * 3 colors = 6 Complex numbers
-struct SiteHalfSpinor {
-    Complex data[6];
-};
+// Target is macOS on Apple M-series, where Grid uses NEON SIMD (Nsimd = 2 for single-precision complex).
+// One vComplexF is a float4 laid out as (lane0_real, lane0_imag, lane1_real, lane1_imag).
+struct vComplexF { float4 v; }; // NOTE(review): not referenced in the visible code, which uses float4 directly
 
-// Spinor: 4 spin components * 3 colors = 12 Complex numbers
-struct SiteSpinor {
-    Complex data[12];
-};
+struct SiteHalfSpinor { float4 data[6]; }; // 2 spin x 3 colour, index = spin*3 + colour; each float4 packs two complex values
+struct SiteSpinor { float4 data[12]; }; // 4 spin x 3 colour, index = spin*3 + colour; each float4 packs two complex values
+struct SU3Matrix { float4 data[9]; }; // 3x3 colour matrix, index = row*3 + column as read by multLink
 
-// SU(3) Matrix: 3x3 Complex numbers
-struct SU3Matrix {
-    Complex data[9];
-};
+// SIMD algebra helpers. timesI multiplies both packed complex values by +i: (re, im) -> (-im, re).
+inline float4 timesI(float4 a) { float4 r = a.yxwz; r.xz = -r.xz; return r; }
+inline float4 timesMinusI(float4 a) { float4 r = a.yxwz; r.yw = -r.yw; return r; } // multiply both lanes by -i: (re, im) -> (im, -re)
+inline float4 multComplex(float4 a, float4 b) {
+    // Two independent complex products: a.xy * b.xy lands in .xy, a.zw * b.zw lands in .zw.
+    float2 lane0 = float2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
+    float2 lane1 = float2(a.z*b.z - a.w*b.w, a.z*b.w + a.w*b.z);
+    return float4(lane0, lane1);
+}
+inline float4 permute_lanes(float4 a) { return float4(a.z, a.w, a.x, a.y); } // exchange the two packed complex values
 
-// Gauge field link has 4 directions * SU(3) Matrix per site
-// Wait, DoubledGaugeField stores U[site][dir] in a specific layout.
-// StencilEntry contains offsets and permutation flags.
-struct StencilEntry {
-    uint32_t offset;
-    uint32_t is_local;
-    uint32_t permute;
-};
+// Colour-matrix times half spinor. U is read row-major (U.data[3*row + col]) and applied
+// to both spin components: res[s][row] = sum over col of U[row][col] * chi[s][col].
+// Every float4 packs two complex values, so both SIMD lanes are multiplied at once.
+inline SiteHalfSpinor multLink(SU3Matrix U, SiteHalfSpinor chi) {
+    SiteHalfSpinor res;
+    for(int idx=0; idx<6; ++idx) {
+        const int spinBase = (idx / 3) * 3; // first colour slot of this spin component in chi
+        const int rowBase  = (idx % 3) * 3; // first entry of the matrix row being applied
+        float4 acc = float4(0.0f);
+        for(int col=0; col<3; ++col) { acc += multComplex(U.data[rowBase + col], chi.data[spinBase + col]); }
+        res.data[idx] = acc;
+    }
+    return res;
+}
+
+// Xp projection per colour: h[0] = f[0] + i*f[3], h[1] = f[1] + i*f[2]. NOTE(review): in Grid's gamma basis this reads as (1 + gamma_x) -- confirm sign convention.
+inline SiteHalfSpinor spProjXp(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin;
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     + timesI(fspin.data[9 + col]);
+        hspin.data[3 + col] = fspin.data[3 + col] + timesI(fspin.data[6 + col]);
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+inline SiteHalfSpinor spProjXm(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin; // per colour: h[0] = f[0] - i*f[3], h[1] = f[1] - i*f[2]
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     - timesI(fspin.data[9 + col]);
+        hspin.data[3 + col] = fspin.data[3 + col] - timesI(fspin.data[6 + col]);
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+inline SiteHalfSpinor spProjYp(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin; // per colour: h[0] = f[0] - f[3], h[1] = f[1] + f[2]
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     - fspin.data[9 + col];
+        hspin.data[3 + col] = fspin.data[3 + col] + fspin.data[6 + col];
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+inline SiteHalfSpinor spProjYm(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin; // per colour: h[0] = f[0] + f[3], h[1] = f[1] - f[2]
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     + fspin.data[9 + col];
+        hspin.data[3 + col] = fspin.data[3 + col] - fspin.data[6 + col];
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+inline SiteHalfSpinor spProjZp(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin; // per colour: h[0] = f[0] + i*f[2], h[1] = f[1] - i*f[3]
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     + timesI(fspin.data[6 + col]);
+        hspin.data[3 + col] = fspin.data[3 + col] - timesI(fspin.data[9 + col]);
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+inline SiteHalfSpinor spProjZm(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin; // per colour: h[0] = f[0] - i*f[2], h[1] = f[1] + i*f[3]
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     - timesI(fspin.data[6 + col]);
+        hspin.data[3 + col] = fspin.data[3 + col] + timesI(fspin.data[9 + col]);
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+inline SiteHalfSpinor spProjTp(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin; // per colour: h[0] = f[0] + f[2], h[1] = f[1] + f[3]
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     + fspin.data[6 + col];
+        hspin.data[3 + col] = fspin.data[3 + col] + fspin.data[9 + col];
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+inline SiteHalfSpinor spProjTm(SiteSpinor fspin, uint perm) {
+    SiteHalfSpinor hspin; // per colour: h[0] = f[0] - f[2], h[1] = f[1] - f[3]
+    for(int col=0; col<3; ++col) {
+        hspin.data[col]     = fspin.data[col]     - fspin.data[6 + col];
+        hspin.data[3 + col] = fspin.data[3 + col] - fspin.data[9 + col];
+    }
+    if(perm != 0) { for(int i=0; i<6; ++i) { hspin.data[i] = permute_lanes(hspin.data[i]); } } // swap the two SIMD lanes
+    return hspin;
+}
+
+// Reconstructors. spReconXp overwrites out per colour with (h0, h1, -i*h1, -i*h0), where h0/h1 are the two spin rows of hspin.
+inline void spReconXp(thread SiteSpinor& out, SiteHalfSpinor hspin) {
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     = h0;              out.data[3 + col] = h1;
+        out.data[6 + col] = timesMinusI(h1); out.data[9 + col] = timesMinusI(h0);
+    }
+}
+inline void accumReconXp(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, -i*h1, -i*h0) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0;         out.data[3 + col] += h1;
+        out.data[6 + col] -= timesI(h1); out.data[9 + col] -= timesI(h0);
+    }
+}
+inline void accumReconYp(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, h1, -h0) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0; out.data[3 + col] += h1;
+        out.data[6 + col] += h1; out.data[9 + col] -= h0;
+    }
+}
+inline void accumReconZp(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, -i*h0, +i*h1) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0;         out.data[3 + col] += h1;
+        out.data[6 + col] -= timesI(h0); out.data[9 + col] += timesI(h1);
+    }
+}
+inline void accumReconTp(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, h0, h1) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0; out.data[3 + col] += h1;
+        out.data[6 + col] += h0; out.data[9 + col] += h1;
+    }
+}
+inline void accumReconXm(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, +i*h1, +i*h0) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0;         out.data[3 + col] += h1;
+        out.data[6 + col] += timesI(h1); out.data[9 + col] += timesI(h0);
+    }
+}
+inline void accumReconYm(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, -h1, h0) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0; out.data[3 + col] += h1;
+        out.data[6 + col] -= h1; out.data[9 + col] += h0;
+    }
+}
+inline void accumReconZm(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, +i*h0, -i*h1) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0;         out.data[3 + col] += h1;
+        out.data[6 + col] += timesI(h0); out.data[9 + col] -= timesI(h1);
+    }
+}
+inline void accumReconTm(thread SiteSpinor& out, SiteHalfSpinor hspin) { // out += (h0, h1, -h0, -h1) per colour
+    for(int col=0; col<3; ++col) {
+        const float4 h0 = hspin.data[col];
+        const float4 h1 = hspin.data[3 + col];
+        out.data[col]     += h0; out.data[3 + col] += h1;
+        out.data[6 + col] -= h0; out.data[9 + col] -= h1;
+    }
+}
 
 // Kernel to execute the Wilson Dslash
 kernel void GenericDhopSite(
@@ -45,12 +198,107 @@ kernel void GenericDhopSite(
     device const StencilEntry* stencil [[buffer(3)]],
     constant uint32_t& Ls [[buffer(4)]],
     constant uint32_t& Nsite [[buffer(5)]],
+    device const SiteHalfSpinor* buf [[buffer(6)]],
     uint id [[thread_position_in_grid]]
 ) {
     if (id >= Nsite * Ls) return;
 
-    // TODO: Implement the 8-way stencil hops mapping to spProj, multLink, Recon
-    // For now, this serves as the foundational shader compile target.
+    // Each thread computes the hopping term for one spinor site sF = id.
+    // For every direction (Xp..Tm) it loads the stencil entry, obtains the
+    // neighbour's half spinor, multiplies it by gauge link sU*8 + dir and
+    // accumulates the reconstructed four-spinor into result.
+    // Local neighbours (is_local != 0) are projected from in_spinor[SE.offset];
+    // all others are read from buf[SE.offset], which is expected to hold
+    // already-projected half spinors, following Grid's GenericDhopSite.
+    // NOTE(review): stencil and in_spinor are indexed without the fifth-dimension
+    // coordinate, so all Ls threads of a site compute the same result -- confirm Ls > 1.
+    uint sF = id; // Spinor site index
+    uint sU = id / Ls; // Gauge field site index (if Ls=1 this is the same)
+    
+    SiteSpinor result;
+    for(int i=0; i<12; i++) result.data[i] = float4(0.0f);
     
-    // out_spinor[id] = in_spinor[id];   // Basic passthrough for testing pipeline
+    // 8-Way Stencil Execution (Xp, Yp, Zp, Tp, Xm, Ym, Zm, Tm)
+    // Dir = 0 (Xp)
+    {
+        StencilEntry SE = stencil[0 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjXp(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; } // non-local neighbour: take the half spinor from buf
+        SU3Matrix U = gauge_field[sU * 8 + 0]; // +0 for Xp (Grid layout: U[Xp, Yp, Zp, Tp, Xm, Ym, Zm, Tm])
+        SiteHalfSpinor chi = multLink(U, hs);
+        spReconXp(result, chi);
+    }
+    // Dir = 1 (Yp)
+    {
+        StencilEntry SE = stencil[1 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjYp(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; }
+        SU3Matrix U = gauge_field[sU * 8 + 1];
+        SiteHalfSpinor chi = multLink(U, hs);
+        accumReconYp(result, chi);
+    }
+    // Dir = 2 (Zp)
+    {
+        StencilEntry SE = stencil[2 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjZp(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; }
+        SU3Matrix U = gauge_field[sU * 8 + 2];
+        SiteHalfSpinor chi = multLink(U, hs);
+        accumReconZp(result, chi);
+    }
+    // Dir = 3 (Tp)
+    {
+        StencilEntry SE = stencil[3 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjTp(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; }
+        SU3Matrix U = gauge_field[sU * 8 + 3];
+        SiteHalfSpinor chi = multLink(U, hs);
+        accumReconTp(result, chi);
+    }
+    // Dir = 4 (Xm)
+    {
+        StencilEntry SE = stencil[4 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjXm(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; }
+        SU3Matrix U = gauge_field[sU * 8 + 4];
+        SiteHalfSpinor chi = multLink(U, hs);
+        accumReconXm(result, chi);
+    }
+    // Dir = 5 (Ym)
+    {
+        StencilEntry SE = stencil[5 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjYm(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; }
+        SU3Matrix U = gauge_field[sU * 8 + 5];
+        SiteHalfSpinor chi = multLink(U, hs);
+        accumReconYm(result, chi);
+    }
+    // Dir = 6 (Zm)
+    {
+        StencilEntry SE = stencil[6 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjZm(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; }
+        SU3Matrix U = gauge_field[sU * 8 + 6];
+        SiteHalfSpinor chi = multLink(U, hs);
+        accumReconZm(result, chi);
+    }
+    // Dir = 7 (Tm)
+    {
+        StencilEntry SE = stencil[7 * Nsite + sU];
+        SiteHalfSpinor hs;
+        if (SE.is_local != 0) { hs = spProjTm(in_spinor[SE.offset], SE.permute); }
+        else                  { hs = buf[SE.offset]; }
+        SU3Matrix U = gauge_field[sU * 8 + 7];
+        SiteHalfSpinor chi = multLink(U, hs);
+        accumReconTm(result, chi);
+    }
+
+    out_spinor[sF] = result;
 }