Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 32 additions & 75 deletions DataFormats/Math/interface/libminifloat.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef libminifloat_h
#define libminifloat_h
#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "FWCore/Utilities/interface/bit_cast.h"
#include <cstdint>
#include <cassert>
#include <algorithm>
Expand All @@ -10,62 +11,42 @@ class MiniFloatConverter {
public:
MiniFloatConverter();
inline static float float16to32(uint16_t h) {
union {
float flt;
uint32_t i32;
} conv;
conv.i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
return conv.flt;
uint32_t i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
return edm::bit_cast<float>(i32);
}
/// Default float32 -> float16 conversion: forwards to the rounding variant.
inline static uint16_t float32to16(float x) {
  const uint16_t half = float32to16round(x);
  return half;
}
/// Fast implementation, but it crops the number so it biases low
inline static uint16_t float32to16crop(float x) {
union {
float flt;
uint32_t i32;
} conv;
conv.flt = x;
return basetable[(conv.i32 >> 23) & 0x1ff] + ((conv.i32 & 0x007fffff) >> shifttable[(conv.i32 >> 23) & 0x1ff]);
uint32_t i32 = edm::bit_cast<uint32_t>(x);
return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
}
/// Slower implementation, but it rounds to avoid biases
inline static uint16_t float32to16round(float x) {
union {
float flt;
uint32_t i32;
} conv;
conv.flt = x;
uint8_t shift = shifttable[(conv.i32 >> 23) & 0x1ff];
uint32_t i32 = edm::bit_cast<uint32_t>(x);
uint8_t shift = shifttable[(i32 >> 23) & 0x1ff];
if (shift == 13) {
uint16_t base2 = (conv.i32 & 0x007fffff) >> 12;
uint16_t base2 = (i32 & 0x007fffff) >> 12;
uint16_t base = base2 >> 1;
if (((base2 & 1) != 0) && (base < 1023))
base++;
return basetable[(conv.i32 >> 23) & 0x1ff] + base;
return basetable[(i32 >> 23) & 0x1ff] + base;
} else {
return basetable[(conv.i32 >> 23) & 0x1ff] + ((conv.i32 & 0x007fffff) >> shifttable[(conv.i32 >> 23) & 0x1ff]);
return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
}
}
template <int bits>
inline static float reduceMantissaToNbits(const float &f) {
static_assert(bits <= 23, "max mantissa size is 23 bits");
constexpr uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
union {
float flt;
uint32_t i32;
} conv;
conv.flt = f;
conv.i32 &= mask;
return conv.flt;
uint32_t i32 = edm::bit_cast<uint32_t>(f);
i32 &= mask;
return edm::bit_cast<float>(i32);
}
inline static float reduceMantissaToNbits(const float &f, int bits) {
uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
union {
float flt;
uint32_t i32;
} conv;
conv.flt = f;
conv.i32 &= mask;
return conv.flt;
uint32_t i32 = edm::bit_cast<uint32_t>(f);
i32 &= mask;
return edm::bit_cast<float>(i32);
}

class ReduceMantissaToNbitsRounding {
Expand All @@ -77,20 +58,16 @@ class MiniFloatConverter {
float operator()(float f) const {
constexpr uint32_t low23 = (0x007FFFFF); // mask to keep lowest 23 bits = mantissa
constexpr uint32_t hi9 = (0xFF800000); // mask to keep highest 9 bits = the rest
union {
float flt;
uint32_t i32;
} conv;
conv.flt = f;
if (conv.i32 & test) { // need to round
uint32_t mantissa = (conv.i32 & low23) >> shift;
uint32_t i32 = edm::bit_cast<uint32_t>(f);
if (i32 & test) { // need to round
uint32_t mantissa = (i32 & low23) >> shift;
if (mantissa < maxn)
mantissa++;
conv.i32 = (conv.i32 & hi9) | (mantissa << shift);
i32 = (i32 & hi9) | (mantissa << shift);
} else {
conv.i32 &= mask;
i32 &= mask;
}
return conv.flt;
return edm::bit_cast<float>(i32);
}

private:
Expand All @@ -114,54 +91,34 @@ class MiniFloatConverter {
}

inline static float max() {
union {
float flt;
uint32_t i32;
} conv;
conv.i32 = 0x477fe000; // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
return conv.flt;
constexpr uint32_t i32 = 0x477fe000; // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
return edm::bit_cast<float>(i32);
}

// Maximum float32 value that gets rounded to max()
inline static float max32RoundedToMax16() {
union {
float flt;
uint32_t i32;
} conv;
// 2^16 in float32 is the first to result inf in float16, so
// 2^16-1 is the last float32 to result max() in float16
conv.i32 = (0x8f << 23) - 1;
return conv.flt;
constexpr uint32_t i32 = (0x8f << 23) - 1;
return edm::bit_cast<float>(i32);
}

inline static float min() {
union {
float flt;
uint32_t i32;
} conv;
conv.i32 = 0x38800000; // = mantissatable[offsettable[1]+0]+exponenttable[1]
return conv.flt;
constexpr uint32_t i32 = 0x38800000; // = mantissatable[offsettable[1]+0]+exponenttable[1]
return edm::bit_cast<float>(i32);
}

// Minimum float32 value that gets rounded to min()
inline static float min32RoundedToMin16() {
union {
float flt;
uint32_t i32;
} conv;
// 2^-14-1 in float32 is the first to result denormalized in float16, so
// 2^-14 is the first float32 to result min() in float16
conv.i32 = (0x71 << 23);
return conv.flt;
constexpr uint32_t i32 = (0x71 << 23);
return edm::bit_cast<float>(i32);
}

inline static float denorm_min() {
union {
float flt;
uint32_t i32;
} conv;
conv.i32 = 0x33800000; // mantissatable[offsettable[0]+1]+exponenttable[0]
return conv.flt;
constexpr uint32_t i32 = 0x33800000; // mantissatable[offsettable[0]+1]+exponenttable[0]
return edm::bit_cast<float>(i32);
}

inline static bool isdenorm(uint16_t h) {
Expand Down
36 changes: 36 additions & 0 deletions FWCore/Utilities/interface/bit_cast.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef FWCore_Utilities_bit_cast_h
#define FWCore_Utilities_bit_cast_h
// -*- C++ -*-
//
// Package: FWCore/Utilities
// Class : bit_cast
//
/**\function edm::bit_cast bit_cast.h "FWCore/Utilities/interface/bit_cast.h"

Description: C++ 20 std::bit_cast stand-in

Usage:
See documentation on std::bit_cast in C++ 20

*/
//
// Original Author: Christopher Jones
// Created: Wed, 01 Sep 2021 19:11:41 GMT
//

// system include files
#include <cstring>
#include <type_traits>

// user include files

namespace edm {
  /**
   * Reinterpret the object representation of `src` as a value of type `To`.
   *
   * Stand-in for C++20 std::bit_cast. Unlike the standard function this one is
   * not constexpr, because the bytes are copied with std::memcpy.
   *
   * Compile-time requirements:
   *  - `To` and `From` have the same size;
   *  - `To` and `From` are trivially copyable (std::memcpy on any other type
   *    is undefined behaviour, and std::bit_cast imposes the same constraint).
   * `To` must also be default constructible, since the result is created
   * before the bytes are copied into it.
   *
   * @param src object whose bytes are copied
   * @return a `To` holding exactly the bytes of `src`
   */
  template <class To, class From>
  inline To bit_cast(const From &src) noexcept {
    static_assert(sizeof(To) == sizeof(From), "incompatible types");
    static_assert(std::is_trivially_copyable<From>::value, "bit_cast source type must be trivially copyable");
    static_assert(std::is_trivially_copyable<To>::value, "bit_cast destination type must be trivially copyable");
    To dst;
    // memcpy is the well-defined way to type-pun; no aliasing or inactive-union-member reads
    std::memcpy(&dst, &src, sizeof(To));
    return dst;
  }
}  // namespace edm
#endif