Skip to content

Commit

Permalink
added readme w/ discussion and makefiles
Browse files Browse the repository at this point in the history
  • Loading branch information
v0i0 committed Jan 21, 2016
1 parent 1ac3054 commit a8d1e56
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 7 deletions.
30 changes: 30 additions & 0 deletions Makefile.fill_lanes
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
all: fill_lanes.none.x-mic fill_lanes.none.x-host fill_lanes.simd.x-mic fill_lanes.simd.x-host fill_lanes.fill-temp.x-mic fill_lanes.fill-temp.x-host fill_lanes.fill-direct.x-mic fill_lanes.fill-direct.x-host fill_lanes.fill-intr.x-mic
CC=icc -O3 -std=c99 -w2 -qopt-report=5 -wd10397 -wd10382
fill_lanes.none.x-mic: fill_lanes.c
$(CC) -o$@ -mmic fill_lanes.c
fill_lanes.none.x-host: fill_lanes.c
$(CC) -o$@ -xHost fill_lanes.c
fill_lanes.simd.x-mic: fill_lanes.c
$(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT
fill_lanes.simd.x-host: fill_lanes.c
$(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT
fill_lanes.fill-temp.x-mic: fill_lanes.c
$(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \
-DFILL -DFILL_TEMP
fill_lanes.fill-temp.x-host: fill_lanes.c
$(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \
-DFILL -DFILL_TEMP
#fill_lanes.fill-store.x-mic: fill_lanes.c
# $(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \
# -DFILL -DFILL_STORE
#fill_lanes.fill-store.x-host: fill_lanes.c
# $(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \
# -DFILL -DFILL_STORE
fill_lanes.fill-direct.x-mic: fill_lanes.c
$(CC) -o$@ -mmic fill_lanes.c -DSIMD -DSIMD_CORRECT \
-DFILL -DFILL_DIRECT -DVECTOR_VARIANT
fill_lanes.fill-direct.x-host: fill_lanes.c
$(CC) -o$@ -xHost fill_lanes.c -DSIMD -DSIMD_CORRECT \
-DFILL -DFILL_DIRECT -DVECTOR_VARIANT
fill_lanes.fill-intr.x-mic: fill_lanes.c
$(CC) -o$@ -mmic fill_lanes.c -DFILL_INTR_PHI
14 changes: 14 additions & 0 deletions Makefile.inner_loop_reduce
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
all: inner_loop_reduce.none.x-mic inner_loop_reduce.none.x-host inner_loop_reduce.simd.x-mic inner_loop_reduce.simd.x-host inner_loop_reduce.vector_variant.x-mic inner_loop_reduce.vector_variant.x-host
CC=icc -O3 -std=c99 -w2 -qopt-report=5 -wd10397 -wd10382
inner_loop_reduce.none.x-mic: inner_loop_reduce.c
$(CC) -o$@ -mmic inner_loop_reduce.c
inner_loop_reduce.none.x-host: inner_loop_reduce.c
$(CC) -o$@ -xHost inner_loop_reduce.c
inner_loop_reduce.simd.x-mic: inner_loop_reduce.c
$(CC) -o$@ -mmic inner_loop_reduce.c -DSIMD -DSIMD_CORRECT
inner_loop_reduce.simd.x-host: inner_loop_reduce.c
$(CC) -o$@ -xHost inner_loop_reduce.c -DSIMD -DSIMD_CORRECT
inner_loop_reduce.vector_variant.x-mic: inner_loop_reduce.c
$(CC) -o$@ -mmic inner_loop_reduce.c -DSIMD -DSIMD_CORRECT -DVECTOR_VARIANT
inner_loop_reduce.vector_variant.x-host: inner_loop_reduce.c
$(CC) -o$@ -xHost inner_loop_reduce.c -DSIMD -DSIMD_CORRECT -DVECTOR_VARIANT
10 changes: 10 additions & 0 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,13 @@ Advanced Vectorization Examples
===============================

This repository contains a number of examples that use advanced vectorization.
These are examples of programs that need some degree of trickery to work.
Each example consists of a README file, a Makefile and the source code.

* inner_loop_reduce
showcases ways to resolve conflicts among lanes, where we have to reduce into a variable memory location.
* fill_lanes
transforms an inner loop in a vectorized algorithm in such a way that the body gets executed with as many lanes as possible.

Generally speaking, these examples will only work with an up-to-date Intel compiler.
Their effect is best shown on a Xeon Phi.
53 changes: 53 additions & 0 deletions README.fill_lanes
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
Vectorization Example: Fill Lanes
=================================

Consider a program, where we have to vectorize a loop that contains another, inner loop.
That inner loop iterates over different pieces of data, and contains a "continue" statement halfway through.
This "continue" statement guards the execution of a very expensive function.

Vectorizing this loop conventionally leads to code that executes that expensive function frequently, and most of the times with incomplete masks, i.e. we waste some "space" in our SIMD unit.

Instead, it is possible to only execute that expensive function once we have filled our vector registers to the max:
We do not execute the inner for loop in lock-step anymore, but allow different lanes to proceed independently from each other.

To implement this, we have to figure out if any lane has executed continue, and we have to figure out if all loops are ready to execute a function.
This means that we need reductions on masks among the lanes.
The current compiler does not seem to support that.

We present a number of work-arounds (of various effectiveness).
One relies on non-vectorized functions and global variables (fill-temp), another on vector_variant functions (fill-direct), another on masked stores and global variables (fill-store, defunct), and the last one is an explicitly vectorized code using intrinsics (fill-intr, only Phi).

Measurements
============

Note: none is the non-vectorized version, simd the trivially vectorized version, fill-* are attempts at filling the lanes.
Only the intrinsic filling achieves a speedup.
All versions are correct (i.e. the first correctness parameter is < 1e-5, the second correctness parameter approx -5.99130).

none.x-mic: Time : 3.723747200000000e+08
none.x-mic: Correct: 3.066983936150791e-06
none.x-mic: Correct: -5.991305351257324e+00
simd.x-mic: Time : 1.661247360000000e+08
simd.x-mic: Correct: 3.066983936150791e-06
simd.x-mic: Correct: -5.991305351257324e+00
fill-direct.x-mic: Time : 4.607440000000000e+08
fill-direct.x-mic: Correct: 3.066983936150791e-06
fill-direct.x-mic: Correct: -5.991305351257324e+00
fill-temp.x-mic: Time : 6.023345280000000e+08
fill-temp.x-mic: Correct: 3.066983936150791e-06
fill-temp.x-mic: Correct: -5.991305351257324e+00
fill-intr.x-mic: Time : 3.659471200000000e+07
fill-intr.x-mic: Correct: 4.884550889983075e-06
fill-intr.x-mic: Correct: -5.991304397583008e+00
none.x-host: Time : 7.618968800000000e+07
none.x-host: Correct: 3.066983936150791e-06
none.x-host: Correct: -5.991305351257324e+00
simd.x-host: Time : 8.071620000000000e+07
simd.x-host: Correct: 3.066983936150791e-06
simd.x-host: Correct: -5.991305351257324e+00
fill-direct.x-host: Time : 1.553101760000000e+08
fill-direct.x-host: Correct: 3.066983936150791e-06
fill-direct.x-host: Correct: -5.991305351257324e+00
fill-temp.x-host: Time : 2.124863200000000e+08
fill-temp.x-host: Correct: 3.066983936150791e-06
fill-temp.x-host: Correct: -5.991305351257324e+00
36 changes: 36 additions & 0 deletions README.inner_loop_reduce
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
Vectorization Example: Inner Loop Reduce
========================================

Consider an example where we vectorize an outer loop, and a inner loop is iterated through.
We need to accumulate a value within to a memory location dependent on variables in the inner loop.
The simd pragma can specify a reduction clause for arrays.
However, this means that the array will be augmented with another dimension of size <vector length>, and the code below the pragma will just add into that.
The reduction happens only in the end.
This may actually be desireable behaviour if we often accumulate into relatively few distinct memory locations.
It however is unsuited if we accumulate only a few times per memory location, if we are constrained in terms of memory usage, or if we need to allocate on the heap.

The mitigation strategies herein circle around the idea to "hide" code from the compiler in function calls.
These function calls have to be serialized by the compiler, and we get the desired behaviour.
Note that it is crucial to annotate the function (memory_reduce_add) with __declspec(noinline).
If inlined, it will not work: The code will compute invalid results.
As another additional step, we can make use of the vector_variant annotation, and perform the reduction using compiler intrinsics.

Measurement
===========

Note: none has no pragma-based vectorization, simd is explicitly vectorized, and vector_variant uses vector_variant-declarations additionally.
It is clear that performance benefits on both the Phi and the Host.

none.x-mic: Time : 3.473042000000000e+06
none.x-mic: Correct: 4.835128784179688e-04
simd.x-mic: Time : 1.219468000000000e+06
simd.x-mic: Correct: 1.049041748046875e-04
vector_variant.x-mic: Time : 3.594990000000000e+05
vector_variant.x-mic: Correct: 3.814697265625000e-05
none.x-host: Time : 1.234876000000000e+06
none.x-host: Correct: 4.835128784179688e-04
simd.x-host: Time : 4.378800000000000e+05
simd.x-host: Correct: 3.433227539062500e-05
vector_variant.x-host: Time : 2.689560000000000e+05
vector_variant.x-host: Correct: -6.866455078125000e-05

30 changes: 25 additions & 5 deletions fill_lanes.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include<stdlib.h>
#include<stdio.h>
#include<inttypes.h>
__declspec(vector(nomask))
__declspec(vector)
void compute_f(float dij, float dik, float *fi, float *fj, float *fk) {
*fi = exp(dik) * cos(dij) * sin(dik) * dij;
*fj = sin(dij) * exp(dij) * cos(dik) * dik;
Expand Down Expand Up @@ -68,10 +68,13 @@ __m128i reduce_land_sse(__m128i b) {
#endif
#endif

#ifdef FILL
int temp;
#endif
__declspec(noinline)
void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * offs, float * x, float * f, float rsq) {
#ifdef FILL
int temp;
int * tempp = &temp;
#endif
#ifdef SIMD
#pragma simd
Expand Down Expand Up @@ -99,6 +102,10 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
while (temp) {
#elif defined(FILL_DIRECT)
while(reduce_lor(active_mask)) {
//#elif defined(FILL_STORE)
// *tempp = 0;
// if (active_mask) *tempp = 1;
// while (*tempp) {
#else
while (/*some*/active_mask) {
#endif
Expand All @@ -116,6 +123,10 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
if (temp) {
#elif defined(FILL_DIRECT)
if (reduce_land(eff_mask || ! active_mask)) {
//#elif defined(FILL_STORE)
// *tempp = 1;
// if (eff_mask || ! active_mask) *tempp = 0;
// if (*tempp) {
#else
if (/*all*/eff_mask || ! active_mask) {
#endif
Expand All @@ -138,6 +149,9 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
#ifdef FILL_TEMP
memory_assign(&temp, 0);
memory_reduce_lor(&temp, active_mask);
//#elif defined(FILL_STORE)
// *tempp = 0;
// if (active_mask) *tempp = 1;
#endif
}
#else
Expand Down Expand Up @@ -166,6 +180,7 @@ void fill_lanes(int N, int * iarr, int * jarr, int * marr, int * base, int * off
#endif
}
}
#ifdef FILL_INTR_PHI
__declspec(noinline)
void fill_lanes_intr(int N, int * iarr, int * jarr, int * marr, int * base, int * offs, float * x, float * f, float rsq) {
float tmpf[16] __attribute__((aligned(64)));
Expand Down Expand Up @@ -228,6 +243,7 @@ void fill_lanes_intr(int N, int * iarr, int * jarr, int * marr, int * base, int
}
}
}
#endif
int main(int argc, char **argv) {
int N = 10000;
int M = 16;
Expand All @@ -252,13 +268,17 @@ int main(int argc, char **argv) {
float rsq = 0.25f * M / N;
rsq *= rsq;
uint64_t start = __rdtsc();
#ifdef FILL_INTR_PHI
fill_lanes_intr(N * M, iarr, jarr, marr, base, offs, x, f, rsq);
#else
fill_lanes(N * M, iarr, jarr, marr, base, offs, x, f, rsq);
#endif
uint64_t end = __rdtsc();
printf("Time : %.15e\n", (float) (end - start));
printf("%20s: Time : %.15e\n", argv[0] + 13, (float) (end - start));
double sum = 0;
for (int i = 0; i < N; i++) {
sum += f[i];
}
printf("Correct: %.15e\n", (float)sum);
printf("Correct: %.15e\n", (float)f[0]);
printf("%20s: Correct: %.15e\n", argv[0] + 13, (float)sum);
printf("%20s: Correct: %.15e\n", argv[0] + 13, (float)f[0]);
}
5 changes: 3 additions & 2 deletions inner_loop_reduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ void memory_reduce_add_knc(float *a, __m512 b) {
}
#endif
#endif
__declspec(noinline)
void inner_loop_reduce(int N, int * restrict marr, int * restrict base, int * restrict offs, float * restrict x, float * restrict f) {
// #pragma omp parallel for
for (int i = 0; i < N; i++) {
Expand Down Expand Up @@ -86,10 +87,10 @@ int main(int argc, char **argv) {
uint64_t start = __rdtsc();
inner_loop_reduce(N, marr, base, offs, x, f);
uint64_t end = __rdtsc();
printf("Time : %.15e\n", (float) (end - start));
printf("%21s: Time : %.15e\n", argv[0] + 20, (float) (end - start));
double sum = 0;
for (int i = 0; i < N; i++) {
sum += f[i];
}
printf("Correct: %.15e\n", (float)sum);
printf("%21s: Correct: %.15e\n", argv[0] + 20, (float)sum);
}

0 comments on commit a8d1e56

Please sign in to comment.