Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 40 additions & 75 deletions DataFormats/Math/interface/libminifloat.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,68 +4,49 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <type_traits>

// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
class MiniFloatConverter {
public:
MiniFloatConverter();
/// Convert a 16-bit float bit pattern to a 32-bit float using the lookup tables.
/// The upper 6 bits of `h` (h >> 10) select the exponent and mantissa-offset
/// entries; the lower 10 bits (h & 0x3ff) index into the mantissa table.
inline static float float16to32(uint16_t h) {
  // Build the float32 bit pattern as an integer, then reinterpret it through
  // bit_cast (memcpy based) rather than reading an inactive union member.
  uint32_t i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
  return bit_cast<float>(i32);
}
/// Default float32 -> float16 conversion; forwards to the rounding variant.
inline static uint16_t float32to16(float x) {
  const uint16_t half = float32to16round(x);
  return half;
}
/// Fast implementation, but it crops the number so it biases low
/// (the discarded low mantissa bits are simply shifted out, never rounded).
inline static uint16_t float32to16crop(float x) {
  // Reinterpret the float as its raw bit pattern via bit_cast instead of
  // reading an inactive union member.
  uint32_t i32 = bit_cast<uint32_t>(x);
  // Bits 23..31 (sign + exponent, 9 bits) index both lookup tables; the
  // 23-bit mantissa is shifted right by the table-provided amount.
  return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
}
/// Slower implementation, but it rounds to avoid biases
inline static uint16_t float32to16round(float x) {
  // Raw bit pattern of x, obtained through bit_cast rather than a union.
  uint32_t i32 = bit_cast<uint32_t>(x);
  // Bits 23..31 (sign + exponent) select the table entries.
  uint8_t shift = shifttable[(i32 >> 23) & 0x1ff];
  if (shift == 13) {
    // A shift of 13 keeps the top 10 of the 23 mantissa bits. Keep one extra
    // bit first (shift by 12) so the lowest bit of base2 says whether to round up.
    uint16_t base2 = (i32 & 0x007fffff) >> 12;
    uint16_t base = base2 >> 1;
    // Do not round up when the 10-bit mantissa is already at its maximum (1023).
    if (((base2 & 1) != 0) && (base < 1023))
      base++;
    return basetable[(i32 >> 23) & 0x1ff] + base;
  } else {
    // Any other shift value: same cropping behaviour as float32to16crop.
    return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
  }
}
/// Truncate the mantissa of `f` to its `bits` most significant bits by zeroing
/// the lowest (23 - bits) bits of the float32 bit pattern. No rounding is done.
/// @tparam bits number of mantissa bits to keep; must not exceed 23.
template <int bits>
inline static float reduceMantissaToNbits(const float &f) {
  static_assert(bits <= 23, "max mantissa size is 23 bits");
  // All ones except the lowest (23 - bits) bits.
  constexpr uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
  // Go through bit_cast instead of reading an inactive union member.
  uint32_t i32 = bit_cast<uint32_t>(f);
  i32 &= mask;
  return bit_cast<float>(i32);
}
/// Runtime variant: truncate the mantissa of `f` to its `bits` most significant
/// bits by zeroing the lowest (23 - bits) bits. No rounding is done.
/// @param f    value to truncate
/// @param bits number of mantissa bits to keep. Unlike the template overload
///             this is not checked: a value above 23 makes the shift count
///             negative, which is undefined behavior.
inline static float reduceMantissaToNbits(const float &f, int bits) {
  // All ones except the lowest (23 - bits) bits.
  uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
  // Go through bit_cast instead of reading an inactive union member.
  uint32_t i32 = bit_cast<uint32_t>(f);
  i32 &= mask;
  return bit_cast<float>(i32);
}

class ReduceMantissaToNbitsRounding {
Expand All @@ -77,20 +58,16 @@ class MiniFloatConverter {
/// Reduce the mantissa of `f`, rounding up when the bit selected by `test` is set.
/// NOTE(review): `test`, `shift`, `maxn` and `mask` are members declared outside
/// the visible part of this class; presumably they are derived from the number
/// of mantissa bits to keep -- confirm against the constructor.
float operator()(float f) const {
  constexpr uint32_t low23 = (0x007FFFFF);  // mask to keep lowest 23 bits = mantissa
  constexpr uint32_t hi9 = (0xFF800000);    // mask to keep highest 9 bits = the rest
  // Raw bit pattern of f, obtained through bit_cast rather than a union.
  uint32_t i32 = bit_cast<uint32_t>(f);
  if (i32 & test) {  // need to round
    uint32_t mantissa = (i32 & low23) >> shift;
    // Increment only while below maxn so the reduced mantissa cannot overflow.
    if (mantissa < maxn)
      mantissa++;
    // Re-assemble: untouched sign/exponent bits plus the rounded mantissa.
    i32 = (i32 & hi9) | (mantissa << shift);
  } else {
    // No rounding needed: just clear the bits outside `mask`.
    i32 &= mask;
  }
  return bit_cast<float>(i32);
}

private:
Expand All @@ -114,54 +91,34 @@ class MiniFloatConverter {
}

/// Largest finite float16 value, returned as a float32.
inline static float max() {
  constexpr uint32_t i32 = 0x477fe000;  // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
  // Reinterpret the constant bit pattern via bit_cast instead of a union.
  return bit_cast<float>(i32);
}

// Maximum float32 value that gets rounded to max()
inline static float max32RoundedToMax16() {
  // 2^16 in float32 (bit pattern 0x8f << 23) is the first to result inf in
  // float16, so the largest float32 below 2^16 (that bit pattern minus one)
  // is the last float32 to result max() in float16
  constexpr uint32_t i32 = (0x8f << 23) - 1;
  // Reinterpret the constant bit pattern via bit_cast instead of a union.
  return bit_cast<float>(i32);
}

/// Smallest positive normalized float16 value, returned as a float32.
inline static float min() {
  constexpr uint32_t i32 = 0x38800000;  // = mantissatable[offsettable[1]+0]+exponenttable[1]
  // Reinterpret the constant bit pattern via bit_cast instead of a union.
  return bit_cast<float>(i32);
}

// Minimum float32 value that gets rounded to min()
inline static float min32RoundedToMin16() {
  // Any float32 below 2^-14 results in a denormalized float16, so
  // 2^-14 (bit pattern 0x71 << 23) is the first float32 to result min() in float16
  constexpr uint32_t i32 = (0x71 << 23);
  // Reinterpret the constant bit pattern via bit_cast instead of a union.
  return bit_cast<float>(i32);
}

/// Smallest positive denormalized float16 value, returned as a float32.
inline static float denorm_min() {
  constexpr uint32_t i32 = 0x33800000;  // mantissatable[offsettable[0]+1]+exponenttable[0]
  // Reinterpret the constant bit pattern via bit_cast instead of a union.
  return bit_cast<float>(i32);
}

inline static bool isdenorm(uint16_t h) {
Expand All @@ -170,6 +127,14 @@ class MiniFloatConverter {
}

private:
//in C++20 we can use std::bit_cast which is constexpr
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it worthwhile to fold this in #ifndef __cpp_lib_bit_cast and #include <bit> at the top if it's defined?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took a look at https://en.cppreference.com/w/cpp/feature_test and it seems to me like the feature test system is really meant for libraries which are meant to support many different C++ versions simultaneously. That isn't what we really do. Once we validate C++20 option for our compilers (which we haven't even started since no compiler supports all the items we might want) we will not need to compile that code with an older compiler.

So personally, I don't see a compelling reason to add 10+ lines of code (as a guess) which are only useful during the short window between the C++17 to C++20 transition. If you really want that ability, I will change the code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@slava77 are you 'OK' with my reasoning about not using the test feature or would you prefer them to be added to the code?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a broader question: what are we going to do with all the other uses of union?
e.g. DataFormats/Math/interface/FastMath.h, SIMDVec.h, approx_math.h

the bit decoding is also used quite extensively in DataFormats/GEMDigi, e.g. interface/GEMVFAT.h; is that part OK, or does it need to switch to memcpy as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@slava77 I'm of the opinion that we should change all the uses of union for bit decoding eventually. It would be nice to have the bit_cast for that but I'd be OK with doing the memcpy.

The only place we discovered memcpy to be a problem is when it is used with the NVIDIA compiler, as that one still calls the memcpy function rather than replacing it with inlined machine code. So for GPU shared code, we have to wait for bit_cast support (in gcc/clang and nvcc) before we can remove the undefined behavior.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking that just calling memcpy in the other cases directly was how I was going to handle it. I first tried it that way here, but there was so much repetition that I switched to defining a bit_cast (which drastically simplified the code, surprisingly).

If we were to have our own bit_cast it would more likely be put in FWCore/Utilities since we've often put soon to come to std items there in the past.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As for the other places in DataFormats/Math, I'm not sure the compilers are smart enough to vectorize the replacements they use for memcpy. So I'd skip those for now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we were to have our own bit_cast it would more likely be put in FWCore/Utilities since we've often put soon to come to std items there in the past.

I'm certainly fine with FWCore/Utilities

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want us to make the bit_cast separately for this pull request or can we wait until we see a need?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want us to make the bit_cast separately for this pull request or can we wait until we see a need?

from #35054 (comment)
I concluded that there is a need (there just isn't another PR yet that addresses other use cases of a union).

/// Reinterpret the object representation of `src` as a value of type `To`.
/// Stand-in for C++20 std::bit_cast, implemented with std::memcpy so that no
/// inactive union member is ever read.
/// @tparam To   destination type; same size as From, trivially copyable and
///              trivially default constructible (a local `To` is created first)
/// @tparam From source type; must be trivially copyable
/// @param src   object whose bytes are copied
/// @return a `To` holding exactly the bytes of `src`
template <class To, class From>
inline static To bit_cast(const From &src) noexcept {
  static_assert(sizeof(To) == sizeof(From), "incompatible types");
  // memcpy is only well defined for trivially copyable types; reject anything else at compile time.
  static_assert(std::is_trivially_copyable<From>::value, "source type must be trivially copyable");
  static_assert(std::is_trivially_copyable<To>::value, "destination type must be trivially copyable");
  static_assert(std::is_trivially_default_constructible<To>::value,
                "destination type must be trivially default constructible");
  To dst;
  std::memcpy(&dst, &src, sizeof(To));
  return dst;
}
CMS_THREAD_SAFE static uint32_t mantissatable[2048];
CMS_THREAD_SAFE static uint32_t exponenttable[64];
CMS_THREAD_SAFE static uint16_t offsettable[64];
Expand Down