added readme w/ discussion and makefiles

v0i0 · Jan 21, 2016 · a8d1e56 · a8d1e56
1 parent 1ac3054
commit a8d1e56
Show file tree

Hide file tree

Showing 7 changed files with 171 additions and 7 deletions.
diff --git a/Makefile.fill_lanes b/Makefile.fill_lanes
@@ -0,0 +1,30 @@
+all: fill_lanes.none.x-mic fill_lanes.none.x-host fill_lanes.simd.x-mic fill_lanes.simd.x-host fill_lanes.fill-temp.x-mic fill_lanes.fill-temp.x-host fill_lanes.fill-direct.x-mic fill_lanes.fill-direct.x-host fill_lanes.fill-intr.x-mic
+CC=icc -O3 -std=c99 -w2 -qopt-report=5 -wd10397 -wd10382
+fill_lanes.none.x-mic: fill_lanes.c
+	$(CC) -o$@ -mmic fill_lanes.c
+fill_lanes.none.x-host: fill_lanes.c
+	$(CC) -o$@ -xHost fill_lanes.c
+fill_lanes.simd.x-mic: fill_lanes.c
+	$(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT
+fill_lanes.simd.x-host: fill_lanes.c
+	$(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT
+fill_lanes.fill-temp.x-mic: fill_lanes.c
+	$(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \
+            -DFILL -DFILL_TEMP
+fill_lanes.fill-temp.x-host: fill_lanes.c
+	$(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \
+            -DFILL -DFILL_TEMP
+#fill_lanes.fill-store.x-mic: fill_lanes.c
+#	$(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \
+#            -DFILL -DFILL_STORE
+#fill_lanes.fill-store.x-host: fill_lanes.c
+#	$(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \
+#            -DFILL -DFILL_STORE
+fill_lanes.fill-direct.x-mic: fill_lanes.c
+	$(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \
+            -DFILL -DFILL_DIRECT -DVECTOR_VARIANT
+fill_lanes.fill-direct.x-host: fill_lanes.c
+	$(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \
+            -DFILL -DFILL_DIRECT -DVECTOR_VARIANT
+fill_lanes.fill-intr.x-mic: fill_lanes.c
+	$(CC) -o$@ -mmic fill_lanes.c -DFILL_INTR_PHI
diff --git a/Makefile.inner_loop_reduce b/Makefile.inner_loop_reduce
@@ -0,0 +1,14 @@
+all: inner_loop_reduce.none.x-mic inner_loop_reduce.none.x-host inner_loop_reduce.simd.x-mic inner_loop_reduce.simd.x-host inner_loop_reduce.vector_variant.x-mic inner_loop_reduce.vector_variant.x-host
+CC=icc -O3 -std=c99 -w2 -qopt-report=5 -wd10397 -wd10382
+inner_loop_reduce.none.x-mic: inner_loop_reduce.c
+	$(CC) -o$@ -mmic inner_loop_reduce.c
+inner_loop_reduce.none.x-host: inner_loop_reduce.c
+	$(CC) -o$@ -xHost inner_loop_reduce.c
+inner_loop_reduce.simd.x-mic: inner_loop_reduce.c
+	$(CC) -o$@ -mmic inner_loop_reduce.c -DSIMD -DSIMD_CORRECT
+inner_loop_reduce.simd.x-host: inner_loop_reduce.c
+	$(CC) -o$@ -xHost inner_loop_reduce.c -DSIMD -DSIMD_CORRECT
+inner_loop_reduce.vector_variant.x-mic: inner_loop_reduce.c
+	$(CC) -o$@ -mmic inner_loop_reduce.c -DSIMD -DSIMD_CORRECT -DVECTOR_VARIANT
+inner_loop_reduce.vector_variant.x-host: inner_loop_reduce.c
+	$(CC) -o$@ -xHost inner_loop_reduce.c -DSIMD -DSIMD_CORRECT -DVECTOR_VARIANT
diff --git a/README b/README
@@ -2,3 +2,13 @@ Advanced Vectorization Examples
 ===============================
 
 This repository contains a number of examples that use advanced vectorization.
+These are examples of programs that need some degree of trickery to work.
+Each example consists of a README file, a Makefile and the source code.
+
+* inner_loop_reduce
+    showcases ways to resolve conflicts among lanes, where we have to reduce into a variable memory location.
+* fill_lanes 
+    transforms an inner loop in a vectorized algorithm in such a way that the body gets executed with as many lanes as possible.
+
+Generally speaking, these examples will only work with an up-to-date Intel compiler.
+Their effect is best shown on a Xeon Phi.
diff --git a/README.fill_lanes b/README.fill_lanes
@@ -0,0 +1,53 @@
+Vectorization Example: Fill Lanes
+=================================
+
+Consider a program, where we have to vectorize a loop that contains another, inner loop.
+That inner loop iterates over different pieces of data, and contains a "continue" statement halfway through.
+This "continue" statement guards the execution of a very expensive function.
+
+Vectorizing this loop conventionally leads to code that executes that expensive function frequently, and most of the times with incomplete masks, i.e. we waste some "space" in our SIMD unit.
+
+Instead, it is possible to only execute that expensive function once we have filled our vector registers to the max:
+We do not execute the inner for loop in lock-step anymore, but allow different lanes to proceed independently from each other.
+
+To implement this, we have to figure out if any lane has executed continue, and we have to figure out if all loops are ready to execute a function.
+This means that we need reductions on masks among the lanes.
+The current compiler does not seem to support that.
+
+We present a number of work-arounds (of various effectiveness).
+One relies on non-vectorized functions and global variables (fill-temp), another on vector_variant functions (fill-direct), another on masked stores and global variables (fill-store, defunct), and the last one is an explicitly vectorized code using intrinsics (fill-intr, only Phi).
+
+Measurements
+============
+
+Note: none is the non-vectorized version, simd the trivially vectorized version, fill-* are attempts at filling the lanes.
+Only the intrinsic filling achieves a speedup.
+All versions are correct (i.e. the first correctness parameter is < 1e-5, the second correctness parameter approx -5.99130).
+
+          none.x-mic: Time   : 3.723747200000000e+08
+          none.x-mic: Correct: 3.066983936150791e-06
+          none.x-mic: Correct: -5.991305351257324e+00
+          simd.x-mic: Time   : 1.661247360000000e+08
+          simd.x-mic: Correct: 3.066983936150791e-06
+          simd.x-mic: Correct: -5.991305351257324e+00
+   fill-direct.x-mic: Time   : 4.607440000000000e+08
+   fill-direct.x-mic: Correct: 3.066983936150791e-06
+   fill-direct.x-mic: Correct: -5.991305351257324e+00
+     fill-temp.x-mic: Time   : 6.023345280000000e+08
+     fill-temp.x-mic: Correct: 3.066983936150791e-06
+     fill-temp.x-mic: Correct: -5.991305351257324e+00
+     fill-intr.x-mic: Time   : 3.659471200000000e+07
+     fill-intr.x-mic: Correct: 4.884550889983075e-06
+     fill-intr.x-mic: Correct: -5.991304397583008e+00
+         none.x-host: Time   : 7.618968800000000e+07
+         none.x-host: Correct: 3.066983936150791e-06
+         none.x-host: Correct: -5.991305351257324e+00
+         simd.x-host: Time   : 8.071620000000000e+07
+         simd.x-host: Correct: 3.066983936150791e-06
+         simd.x-host: Correct: -5.991305351257324e+00
+  fill-direct.x-host: Time   : 1.553101760000000e+08
+  fill-direct.x-host: Correct: 3.066983936150791e-06
+  fill-direct.x-host: Correct: -5.991305351257324e+00
+    fill-temp.x-host: Time   : 2.124863200000000e+08
+    fill-temp.x-host: Correct: 3.066983936150791e-06
+    fill-temp.x-host: Correct: -5.991305351257324e+00
diff --git a/README.inner_loop_reduce b/README.inner_loop_reduce
@@ -0,0 +1,36 @@
+Vectorization Example: Inner Loop Reduce
+========================================
+
+Consider an example where we vectorize an outer loop, and a inner loop is iterated through.
+We need to accumulate a value within to a memory location dependent on variables in the inner loop.
+The simd pragma can specify a reduction clause for arrays.
+However, this means that the array will be augmented with another dimension of size <vector length>, and the code below the pragma will just add into that.
+The reduction happens only in the end.
+This may actually be desireable behaviour if we often accumulate into relatively few distinct memory locations.
+It however is unsuited if we accumulate only a few times per memory location, if we are constrained in terms of memory usage, or if we need to allocate on the heap.
+
+The mitigation strategies herein circle around the idea to "hide" code from the compiler in function calls.
+These function calls have to be serialized by the compiler, and we get the desired behaviour.
+Note that it is crucial to annotate the function (memory_reduce_add) with __declspec(noinline).
+If inlined, it will not work: The code will compute invalid results.
+As another additional step, we can make use of the vector_variant annotation, and perform the reduction using compiler intrinsics.
+
+Measurement
+===========
+
+Note: none has no pragma-based vectorization, simd is explicitly vectorized, and vector_variant uses vector_variant-declarations additionally.
+It is clear that performance benefits on both the Phi and the Host.
+
+           none.x-mic: Time   : 3.473042000000000e+06
+           none.x-mic: Correct: 4.835128784179688e-04
+           simd.x-mic: Time   : 1.219468000000000e+06
+           simd.x-mic: Correct: 1.049041748046875e-04
+ vector_variant.x-mic: Time   : 3.594990000000000e+05
+ vector_variant.x-mic: Correct: 3.814697265625000e-05
+          none.x-host: Time   : 1.234876000000000e+06
+          none.x-host: Correct: 4.835128784179688e-04
+          simd.x-host: Time   : 4.378800000000000e+05
+          simd.x-host: Correct: 3.433227539062500e-05
+vector_variant.x-host: Time   : 2.689560000000000e+05
+vector_variant.x-host: Correct: -6.866455078125000e-05
+
diff --git a/fill_lanes.c b/fill_lanes.c
@@ -3,7 +3,7 @@
 #include<stdlib.h>
 #include<stdio.h>
 #include<inttypes.h>
-__declspec(vector(nomask))
+__declspec(vector)
 void compute_f(float dij, float dik, float *fi, float *fj, float *fk) {
     *fi = exp(dik) * cos(dij) * sin(dik) * dij;
     *fj = sin(dij) * exp(dij) * cos(dik) * dik;
@@ -68,10 +68,13 @@ __m128i reduce_land_sse(__m128i b) {
 #endif
 #endif
 
+#ifdef FILL
+    int temp;
+#endif
 __declspec(noinline)
 void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * offs, float * x, float * f, float rsq) {
 #ifdef FILL
-    int temp;
+    int * tempp = &temp;
 #endif
 #ifdef SIMD
     #pragma simd
@@ -99,6 +102,10 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
         while (temp) {
   #elif defined(FILL_DIRECT)
         while(reduce_lor(active_mask)) {
+  //#elif defined(FILL_STORE)
+  //      *tempp = 0;
+  //      if (active_mask) *tempp = 1;
+  //      while (*tempp) {
   #else
         while (/*some*/active_mask) {
   #endif
@@ -116,6 +123,10 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
             if (temp) {
   #elif defined(FILL_DIRECT)
             if (reduce_land(eff_mask || ! active_mask)) {
+  //#elif defined(FILL_STORE)
+  //          *tempp = 1;
+  //          if (eff_mask || ! active_mask) *tempp = 0;
+  //          if (*tempp) {
   #else
             if (/*all*/eff_mask || ! active_mask) {
   #endif
@@ -138,6 +149,9 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
   #ifdef FILL_TEMP
             memory_assign(&temp, 0);
             memory_reduce_lor(&temp, active_mask);
+  //#elif defined(FILL_STORE)
+  //          *tempp = 0;
+  //          if (active_mask) *tempp = 1;
   #endif
         }
 #else
@@ -166,6 +180,7 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
 #endif
     }
 }
+#ifdef FILL_INTR_PHI
 __declspec(noinline)
 void fill_lanes_intr(int N, int * iarr, int * jarr, int * marr, int * base, int * offs, float * x, float * f, float rsq) {
     float tmpf[16] __attribute__((aligned(64)));
@@ -228,6 +243,7 @@ void fill_lanes_intr(int N, int * iarr, int * jarr, int * marr, int * base, int
         }
     }
 }
+#endif
 int main(int argc, char **argv) {
     int N = 10000;
     int M = 16;
@@ -252,13 +268,17 @@ int main(int argc, char **argv) {
     float rsq = 0.25f * M / N;
     rsq *= rsq;
     uint64_t start = __rdtsc();
+#ifdef FILL_INTR_PHI
     fill_lanes_intr(N * M, iarr, jarr, marr, base, offs, x, f, rsq);
+#else
+    fill_lanes(N * M, iarr, jarr, marr, base, offs, x, f, rsq);
+#endif
     uint64_t end = __rdtsc();
-    printf("Time   : %.15e\n", (float) (end - start));
+    printf("%20s: Time   : %.15e\n", argv[0] + 13, (float) (end - start));
     double sum = 0;
     for (int i = 0; i < N; i++) {
         sum += f[i];
     }
-    printf("Correct: %.15e\n", (float)sum);
-    printf("Correct: %.15e\n", (float)f[0]);
+    printf("%20s: Correct: %.15e\n", argv[0] + 13, (float)sum);
+    printf("%20s: Correct: %.15e\n", argv[0] + 13, (float)f[0]);
 }
diff --git a/inner_loop_reduce.c b/inner_loop_reduce.c
@@ -30,6 +30,7 @@ void memory_reduce_add_knc(float *a, __m512 b) {
 }
 #endif
 #endif
+__declspec(noinline)
 void inner_loop_reduce(int N, int * restrict marr, int * restrict base, int * restrict offs, float * restrict x, float * restrict f) {
 //    #pragma omp parallel for
     for (int i = 0; i < N; i++) {
@@ -86,10 +87,10 @@ int main(int argc, char **argv) {
     uint64_t start = __rdtsc();
     inner_loop_reduce(N, marr, base, offs, x, f);
     uint64_t end = __rdtsc();
-    printf("Time   : %.15e\n", (float) (end - start));
+    printf("%21s: Time   : %.15e\n", argv[0] + 20, (float) (end - start));
     double sum = 0;
     for (int i = 0; i < N; i++) {
         sum += f[i];
     }
-    printf("Correct: %.15e\n", (float)sum);
+    printf("%21s: Correct: %.15e\n", argv[0] + 20, (float)sum);
 }