Skip to content

Commit 5603463

Browse files
malfet authored and facebook-github-bot committed
Workaround arm64 gcc error in std::copysign (pytorch#51900)
Summary: Move the definition of the copysign template and its specializations for the bfloat16/half types before the first use of copysign in that file. Add a comment explaining why this is necessary. Fixes pytorch#51889. Pull Request resolved: pytorch#51900. Reviewed By: walterddr. Differential Revision: D26321741. Pulled By: malfet. fbshipit-source-id: 888858b11d9708fa140fe9c0570cc5a24599205b
1 parent 015cabf commit 5603463

File tree

1 file changed

+22
-18
lines changed

1 file changed

+22
-18
lines changed

aten/src/ATen/native/cpu/BinaryOpsKernel.cpp

+22-18
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,27 @@ namespace {
1919

2020
using namespace vec256;
2121

22+
// Note: Explicit implementation of copysign for Half and BFloat16
23+
// is needed to work around a g++-7/8 crash on aarch64, but also makes
24+
// copysign faster for the half-precision types
25+
template<typename T>
26+
T copysign(T a, T b) {
27+
return std::copysign(a, b);
28+
}
29+
30+
// Implement copysign for half precision floats using bit ops
31+
// Sign is the most significant bit for both half and bfloat16 types
32+
template<>
33+
c10::Half copysign(c10::Half a, c10::Half b) {
34+
return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
35+
}
36+
37+
template<>
38+
c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
39+
return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
40+
}
41+
42+
2243
// Note: Undefined behavior when performing addition is intentionally
2344
// ignored.
2445
void add_kernel(TensorIteratorBase& iter, Scalar alpha_scalar) {
@@ -180,7 +201,7 @@ void div_floor_kernel(TensorIterator& iter) {
180201
floordiv += scalar_t(1.0);
181202
}
182203
} else {
183-
floordiv = std::copysign(scalar_t(0), a / b);
204+
floordiv = copysign(scalar_t(0), a / b);
184205
}
185206
return floordiv;
186207
});
@@ -889,23 +910,6 @@ void heaviside_kernel(TensorIterator& iter) {
889910
});
890911
}
891912

892-
template<typename T>
893-
T copysign(T a, T b) {
894-
return std::copysign(a, b);
895-
}
896-
897-
// Implement copysign for half precision floats using bit ops
898-
// Sign is the most significant bit for both half and bfloat16 types
899-
template<>
900-
c10::Half copysign(c10::Half a, c10::Half b) {
901-
return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
902-
}
903-
904-
template<>
905-
c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
906-
return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
907-
}
908-
909913
void copysign_kernel(TensorIterator& iter) {
910914
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
911915
cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {

0 commit comments

Comments
 (0)