diff --git a/Makefile.fill_lanes b/Makefile.fill_lanes new file mode 100644 index 0000000..398c817 --- /dev/null +++ b/Makefile.fill_lanes @@ -0,0 +1,30 @@ +all: fill_lanes.none.x-mic fill_lanes.none.x-host fill_lanes.simd.x-mic fill_lanes.simd.x-host fill_lanes.fill-temp.x-mic fill_lanes.fill-temp.x-host fill_lanes.fill-direct.x-mic fill_lanes.fill-direct.x-host fill_lanes.fill-intr.x-mic +CC=icc -O3 -std=c99 -w2 -qopt-report=5 -wd10397 -wd10382 +fill_lanes.none.x-mic: fill_lanes.c + $(CC) -o$@ -mmic fill_lanes.c +fill_lanes.none.x-host: fill_lanes.c + $(CC) -o$@ -xHost fill_lanes.c +fill_lanes.simd.x-mic: fill_lanes.c + $(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT +fill_lanes.simd.x-host: fill_lanes.c + $(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT +fill_lanes.fill-temp.x-mic: fill_lanes.c + $(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \ + -DFILL -DFILL_TEMP +fill_lanes.fill-temp.x-host: fill_lanes.c + $(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \ + -DFILL -DFILL_TEMP +#fill_lanes.fill-store.x-mic: fill_lanes.c +# $(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \ +# -DFILL -DFILL_STORE +#fill_lanes.fill-store.x-host: fill_lanes.c +# $(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \ +# -DFILL -DFILL_STORE +fill_lanes.fill-direct.x-mic: fill_lanes.c + $(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \ + -DFILL -DFILL_DIRECT -DVECTOR_VARIANT +fill_lanes.fill-direct.x-host: fill_lanes.c + $(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \ + -DFILL -DFILL_DIRECT -DVECTOR_VARIANT +fill_lanes.fill-intr.x-mic: fill_lanes.c + $(CC) -o$@ -mmic fill_lanes.c -DFILL_INTR_PHI diff --git a/Makefile.inner_loop_reduce b/Makefile.inner_loop_reduce new file mode 100644 index 0000000..a4b4ebe --- /dev/null +++ b/Makefile.inner_loop_reduce @@ -0,0 +1,14 @@ +all: inner_loop_reduce.none.x-mic inner_loop_reduce.none.x-host inner_loop_reduce.simd.x-mic inner_loop_reduce.simd.x-host inner_loop_reduce.vector_variant.x-mic 
inner_loop_reduce.vector_variant.x-host +CC=icc -O3 -std=c99 -w2 -qopt-report=5 -wd10397 -wd10382 +inner_loop_reduce.none.x-mic: inner_loop_reduce.c + $(CC) -o$@ -mmic inner_loop_reduce.c +inner_loop_reduce.none.x-host: inner_loop_reduce.c + $(CC) -o$@ -xHost inner_loop_reduce.c +inner_loop_reduce.simd.x-mic: inner_loop_reduce.c + $(CC) -o$@ -mmic inner_loop_reduce.c -DSIMD -DSIMD_CORRECT +inner_loop_reduce.simd.x-host: inner_loop_reduce.c + $(CC) -o$@ -xHost inner_loop_reduce.c -DSIMD -DSIMD_CORRECT +inner_loop_reduce.vector_variant.x-mic: inner_loop_reduce.c + $(CC) -o$@ -mmic inner_loop_reduce.c -DSIMD -DSIMD_CORRECT -DVECTOR_VARIANT +inner_loop_reduce.vector_variant.x-host: inner_loop_reduce.c + $(CC) -o$@ -xHost inner_loop_reduce.c -DSIMD -DSIMD_CORRECT -DVECTOR_VARIANT diff --git a/README b/README index ea6986c..8f12115 100644 --- a/README +++ b/README @@ -2,3 +2,13 @@ Advanced Vectorization Examples =============================== This repository contains a number of examples that use advanced vectorization. +These are examples of programs that need some degree of trickery to work. +Each example consists of a README file, a Makefile and the source code. + +* inner_loop_reduce + showcases ways to resolve conflicts among lanes, where we have to reduce into a variable memory location. +* fill_lanes + transforms an inner loop in a vectorized algorithm in such a way that the body gets executed with as many lanes as possible. + +Generally speaking, these examples will only work with an up-to-date Intel compiler. +Their effect is best shown on a Xeon Phi. diff --git a/README.fill_lanes b/README.fill_lanes new file mode 100644 index 0000000..1f7f460 --- /dev/null +++ b/README.fill_lanes @@ -0,0 +1,53 @@ +Vectorization Example: Fill Lanes +================================= + +Consider a program, where we have to vectorize a loop that contains another, inner loop. 
+That inner loop iterates over different pieces of data, and contains a "continue" statement halfway through. +This "continue" statement guards the execution of a very expensive function. + +Vectorizing this loop conventionally leads to code that executes that expensive function frequently, and most of the time with incomplete masks, i.e. we waste some "space" in our SIMD unit. + +Instead, it is possible to only execute that expensive function once we have filled our vector registers to the max: +We do not execute the inner for loop in lock-step anymore, but allow different lanes to proceed independently from each other. + +To implement this, we have to figure out if any lane has executed continue, and we have to figure out if all lanes are ready to execute a function. +This means that we need reductions on masks among the lanes. +The current compiler does not seem to support that. + +We present a number of work-arounds (of various effectiveness). +One relies on non-vectorized functions and global variables (fill-temp), another on vector_variant functions (fill-direct), another on masked stores and global variables (fill-store, defunct), and the last one is explicitly vectorized code using intrinsics (fill-intr, only Phi). + +Measurements +============ + +Note: none is the non-vectorized version, simd the trivially vectorized version, fill-* are attempts at filling the lanes. +Only the intrinsic filling achieves a speedup. +All versions are correct (i.e. the first correctness parameter is < 1e-5, the second correctness parameter approx -5.99130). 
+ + none.x-mic: Time : 3.723747200000000e+08 + none.x-mic: Correct: 3.066983936150791e-06 + none.x-mic: Correct: -5.991305351257324e+00 + simd.x-mic: Time : 1.661247360000000e+08 + simd.x-mic: Correct: 3.066983936150791e-06 + simd.x-mic: Correct: -5.991305351257324e+00 + fill-direct.x-mic: Time : 4.607440000000000e+08 + fill-direct.x-mic: Correct: 3.066983936150791e-06 + fill-direct.x-mic: Correct: -5.991305351257324e+00 + fill-temp.x-mic: Time : 6.023345280000000e+08 + fill-temp.x-mic: Correct: 3.066983936150791e-06 + fill-temp.x-mic: Correct: -5.991305351257324e+00 + fill-intr.x-mic: Time : 3.659471200000000e+07 + fill-intr.x-mic: Correct: 4.884550889983075e-06 + fill-intr.x-mic: Correct: -5.991304397583008e+00 + none.x-host: Time : 7.618968800000000e+07 + none.x-host: Correct: 3.066983936150791e-06 + none.x-host: Correct: -5.991305351257324e+00 + simd.x-host: Time : 8.071620000000000e+07 + simd.x-host: Correct: 3.066983936150791e-06 + simd.x-host: Correct: -5.991305351257324e+00 + fill-direct.x-host: Time : 1.553101760000000e+08 + fill-direct.x-host: Correct: 3.066983936150791e-06 + fill-direct.x-host: Correct: -5.991305351257324e+00 + fill-temp.x-host: Time : 2.124863200000000e+08 + fill-temp.x-host: Correct: 3.066983936150791e-06 + fill-temp.x-host: Correct: -5.991305351257324e+00 diff --git a/README.inner_loop_reduce b/README.inner_loop_reduce new file mode 100644 index 0000000..44e046f --- /dev/null +++ b/README.inner_loop_reduce @@ -0,0 +1,36 @@ +Vectorization Example: Inner Loop Reduce +======================================== + +Consider an example where we vectorize an outer loop, and an inner loop is iterated through. +We need to accumulate a value into a memory location dependent on variables in the inner loop. +The simd pragma can specify a reduction clause for arrays. +However, this means that the array will be augmented with another dimension of size equal to the vector length, and the code below the pragma will just add into that. +The reduction happens only in the end. 
+This may actually be desirable behaviour if we often accumulate into relatively few distinct memory locations. +It however is unsuited if we accumulate only a few times per memory location, if we are constrained in terms of memory usage, or if we need to allocate on the heap. + +The mitigation strategies herein circle around the idea to "hide" code from the compiler in function calls. +These function calls have to be serialized by the compiler, and we get the desired behaviour. +Note that it is crucial to annotate the function (memory_reduce_add) with __declspec(noinline). +If inlined, it will not work: The code will compute invalid results. +As an additional step, we can make use of the vector_variant annotation, and perform the reduction using compiler intrinsics. + +Measurement +=========== + +Note: none has no pragma-based vectorization, simd is explicitly vectorized, and vector_variant uses vector_variant-declarations additionally. +It is clear that performance improves on both the Phi and the Host. 
+ + none.x-mic: Time : 3.473042000000000e+06 + none.x-mic: Correct: 4.835128784179688e-04 + simd.x-mic: Time : 1.219468000000000e+06 + simd.x-mic: Correct: 1.049041748046875e-04 + vector_variant.x-mic: Time : 3.594990000000000e+05 + vector_variant.x-mic: Correct: 3.814697265625000e-05 + none.x-host: Time : 1.234876000000000e+06 + none.x-host: Correct: 4.835128784179688e-04 + simd.x-host: Time : 4.378800000000000e+05 + simd.x-host: Correct: 3.433227539062500e-05 +vector_variant.x-host: Time : 2.689560000000000e+05 +vector_variant.x-host: Correct: -6.866455078125000e-05 + diff --git a/fill_lanes.c b/fill_lanes.c index d456e12..37868e9 100644 --- a/fill_lanes.c +++ b/fill_lanes.c @@ -3,7 +3,7 @@ #include #include #include -__declspec(vector(nomask)) +__declspec(vector) void compute_f(float dij, float dik, float *fi, float *fj, float *fk) { *fi = exp(dik) * cos(dij) * sin(dik) * dij; *fj = sin(dij) * exp(dij) * cos(dik) * dik; @@ -68,10 +68,13 @@ __m128i reduce_land_sse(__m128i b) { #endif #endif +#ifdef FILL + int temp; +#endif __declspec(noinline) void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * offs, float * x, float * f, float rsq) { #ifdef FILL - int temp; + int * tempp = &temp; #endif #ifdef SIMD #pragma simd @@ -99,6 +102,10 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off while (temp) { #elif defined(FILL_DIRECT) while(reduce_lor(active_mask)) { + //#elif defined(FILL_STORE) + // *tempp = 0; + // if (active_mask) *tempp = 1; + // while (*tempp) { #else while (/*some*/active_mask) { #endif @@ -116,6 +123,10 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off if (temp) { #elif defined(FILL_DIRECT) if (reduce_land(eff_mask || ! active_mask)) { + //#elif defined(FILL_STORE) + // *tempp = 1; + // if (eff_mask || ! active_mask) *tempp = 0; + // if (*tempp) { #else if (/*all*/eff_mask || ! 
active_mask) { #endif @@ -138,6 +149,9 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off #ifdef FILL_TEMP memory_assign(&temp, 0); memory_reduce_lor(&temp, active_mask); + //#elif defined(FILL_STORE) + // *tempp = 0; + // if (active_mask) *tempp = 1; #endif } #else @@ -166,6 +180,7 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off #endif } } +#ifdef FILL_INTR_PHI __declspec(noinline) void fill_lanes_intr(int N, int * iarr, int * jarr, int * marr, int * base, int * offs, float * x, float * f, float rsq) { float tmpf[16] __attribute__((aligned(64))); @@ -228,6 +243,7 @@ void fill_lanes_intr(int N, int * iarr, int * jarr, int * marr, int * base, int } } } +#endif int main(int argc, char **argv) { int N = 10000; int M = 16; @@ -252,13 +268,17 @@ int main(int argc, char **argv) { float rsq = 0.25f * M / N; rsq *= rsq; uint64_t start = __rdtsc(); +#ifdef FILL_INTR_PHI fill_lanes_intr(N * M, iarr, jarr, marr, base, offs, x, f, rsq); +#else + fill_lanes(N * M, iarr, jarr, marr, base, offs, x, f, rsq); +#endif uint64_t end = __rdtsc(); - printf("Time : %.15e\n", (float) (end - start)); + printf("%20s: Time : %.15e\n", argv[0] + 13, (float) (end - start)); double sum = 0; for (int i = 0; i < N; i++) { sum += f[i]; } - printf("Correct: %.15e\n", (float)sum); - printf("Correct: %.15e\n", (float)f[0]); + printf("%20s: Correct: %.15e\n", argv[0] + 13, (float)sum); + printf("%20s: Correct: %.15e\n", argv[0] + 13, (float)f[0]); } diff --git a/inner_loop_reduce.c b/inner_loop_reduce.c index b71e5c2..6b69cfd 100644 --- a/inner_loop_reduce.c +++ b/inner_loop_reduce.c @@ -30,6 +30,7 @@ void memory_reduce_add_knc(float *a, __m512 b) { } #endif #endif +__declspec(noinline) void inner_loop_reduce(int N, int * restrict marr, int * restrict base, int * restrict offs, float * restrict x, float * restrict f) { // #pragma omp parallel for for (int i = 0; i < N; i++) { @@ -86,10 +87,10 @@ int main(int argc, char **argv) { 
uint64_t start = __rdtsc(); inner_loop_reduce(N, marr, base, offs, x, f); uint64_t end = __rdtsc(); - printf("Time : %.15e\n", (float) (end - start)); + printf("%21s: Time : %.15e\n", argv[0] + 20, (float) (end - start)); double sum = 0; for (int i = 0; i < N; i++) { sum += f[i]; } - printf("Correct: %.15e\n", (float)sum); + printf("%21s: Correct: %.15e\n", argv[0] + 20, (float)sum); }