Skip to content

Commit c303094

Browse files
author
James
committed
add new internal function __vectorized_square_accumulate fixing issue #126
1 parent bc83811 commit c303094

File tree

4 files changed

+22
-2
lines changed

4 files changed

+22
-2
lines changed

library/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ OPT_FLAGS := -O1
2525
LDFLAGS := -lm -lrt -pthread -shared -Wl,-soname,$(SONAME)
2626

2727
# different compile flags for math libs
28-
MATH_OPT_FLAGS := -O3 -ffast-math -ftree-vectorize -Wno-restrict
28+
MATH_OPT_FLAGS := -O3 -ffast-math -ftree-vectorize
2929

3030
# commands
3131
RM := rm -rf

library/src/math/algebra.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ int __householder_reflection(int step, rc_matrix_t* Q, rc_matrix_t* R)
7474

7575
// pre-calculate matrix multiplication coefficient tau
7676
// doing this on one line causes a compiler optimization error :-/
77-
dot = __vectorized_mult_accumulate(x,x,n);
77+
dot = __vectorized_square_accumulate(x,n);
7878
tau = -2.0/dot;
7979

8080
// fill in diagonal and upper triangle of H

library/src/math/algebra_common.c

+10
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,13 @@ double __vectorized_mult_accumulate(double * __restrict__ a, double * __restrict
1515
return sum;
1616
}
1717

18+
19+
/*
 * Returns the sum of a[i]*a[i] for i in [0, n), i.e. the dot product of the
 * first n elements of a with itself.
 *
 * a: array read at indices 0..n-1; no NULL or bounds checking is done here.
 * n: number of elements to accumulate; when n <= 0 the loop body never runs
 *    and 0.0 is returned.
 */
double __vectorized_square_accumulate(double * __restrict__ a, int n)
20+
{
21+
int i;
22+
double sum = 0.0;
23+
for(i=0;i<n;i++){
24+
/* single-pointer form of the multiply-accumulate: both operands come from a */
sum+=a[i]*a[i];
25+
}
26+
return sum;
27+
}

library/src/math/algebra_common.h

+10
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,14 @@
2525
*/
2626
double __vectorized_mult_accumulate(double * __restrict__ a, double * __restrict__ b, int n);
2727

28+
/*
29+
* Performs a vector dot product on the contents of a with itself
30+
*
31+
* This is a dangerous function that could segfault if not used properly. Hence
32+
* it is only for internal use in the RC library. The 'restrict' attribute tells
33+
* the C compiler that the pointer is not aliased, which helps the vectorization
34+
* process for optimization with the NEON FPU or similar
35+
*/
36+
double __vectorized_square_accumulate(double * __restrict__ a, int n);
37+
2838
#endif // RC_ALGEBRA_COMMON_H

0 commit comments

Comments
 (0)