Add float8_e4m3

jax-ml · Aug 21, 2024 · 27d81a6 · 27d81a6
1 parent b157c19
commit 27d81a6
Show file tree

Hide file tree

Showing 9 changed files with 362 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,9 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`):
 
 ## [Unreleased]
 
+* Added new 8-bit float type following IEEE 754 convention:
+  `ml_dtypes.float8_e4m3`.
+
 ## [0.4.0] - 2024-04-1
 
 * Updates `ml_dtypes` for compatibility with future NumPy 2.0 release.

diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@
   an alternative to the standard [`float16`](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) format
 - `float8_*`: several experimental 8-bit floating point representations
   including:
+  * `float8_e4m3`
   * `float8_e4m3b11fnuz`
   * `float8_e4m3fn`
   * `float8_e4m3fnuz`
@@ -64,6 +65,10 @@ A `bfloat16` number is a single-precision float truncated at 16 bits.
 
 Exponent: 8, Mantissa: 7, exponent bias: 127. IEEE 754, with NaN and inf.
 
+### `float8_e4m3`
+
+Exponent: 4, Mantissa: 3, bias: 7. IEEE 754, with NaN and inf.
+
 ### `float8_e4m3b11fnuz`
 
 Exponent: 4, Mantissa: 3, bias: 11.

diff --git a/ml_dtypes/__init__.py b/ml_dtypes/__init__.py
@@ -17,6 +17,7 @@
     "__version__",
     "bfloat16",
     "finfo",
+    "float8_e4m3",
     "float8_e4m3b11fnuz",
     "float8_e4m3fn",
     "float8_e4m3fnuz",
@@ -34,6 +35,7 @@
 from ml_dtypes._finfo import finfo
 from ml_dtypes._iinfo import iinfo
 from ml_dtypes._ml_dtypes_ext import bfloat16
+from ml_dtypes._ml_dtypes_ext import float8_e4m3
 from ml_dtypes._ml_dtypes_ext import float8_e4m3b11fnuz
 from ml_dtypes._ml_dtypes_ext import float8_e4m3fn
 from ml_dtypes._ml_dtypes_ext import float8_e4m3fnuz
@@ -46,6 +48,7 @@
 import numpy as np
 
 bfloat16: Type[np.generic]
+float8_e4m3: Type[np.generic]
 float8_e4m3b11fnuz: Type[np.generic]
 float8_e4m3fn: Type[np.generic]
 float8_e4m3fnuz: Type[np.generic]

diff --git a/ml_dtypes/_finfo.py b/ml_dtypes/_finfo.py
@@ -17,6 +17,7 @@
 from typing import Dict
 
 from ml_dtypes._ml_dtypes_ext import bfloat16
+from ml_dtypes._ml_dtypes_ext import float8_e4m3
 from ml_dtypes._ml_dtypes_ext import float8_e4m3b11fnuz
 from ml_dtypes._ml_dtypes_ext import float8_e4m3fn
 from ml_dtypes._ml_dtypes_ext import float8_e4m3fnuz
@@ -25,6 +26,7 @@
 import numpy as np
 
 _bfloat16_dtype = np.dtype(bfloat16)
+_float8_e4m3_dtype = np.dtype(float8_e4m3)
 _float8_e4m3b11fnuz_dtype = np.dtype(float8_e4m3b11fnuz)
 _float8_e4m3fn_dtype = np.dtype(float8_e4m3fn)
 _float8_e4m3fnuz_dtype = np.dtype(float8_e4m3fnuz)
@@ -41,6 +43,14 @@ def __init__(self):
     self.smallest_subnormal = bfloat16(smallest_subnormal)
 
 
+class _Float8E4m3MachArLike:
+
+  def __init__(self):
+    smallest_normal = float.fromhex("0x1p-6")
+    self.smallest_normal = float8_e4m3(smallest_normal)
+    smallest_subnormal = float.fromhex("0x1p-9")
+    self.smallest_subnormal = float8_e4m3(smallest_subnormal)
+
 class _Float8E4m3b11fnuzMachArLike:
 
   def __init__(self):
@@ -135,6 +145,51 @@ def float_to_str(f):
     # pylint: enable=protected-access
     return obj
 
+  @staticmethod
+  def _float8_e4m3_finfo():
+    def float_to_str(f):
+      return "%6.2e" % float(f)
+
+    tiny = float.fromhex("0x1p-6")  # 1/64 min normal
+    resolution = 0.1
+    eps = float.fromhex("0x1p-3")  # 1/8
+    epsneg = float.fromhex("0x1p-4")  # 1/16
+    max_ = float.fromhex("0x1.Ep7")  # 240 max normal
+
+    obj = object.__new__(np.finfo)
+    obj.dtype = _float8_e4m3_dtype
+    obj.bits = 8
+    obj.eps = float8_e4m3(eps)
+    obj.epsneg = float8_e4m3(epsneg)
+    obj.machep = -3
+    obj.negep = -4
+    obj.max = float8_e4m3(max_)
+    obj.min = float8_e4m3(-max_)
+    obj.nexp = 4
+    obj.nmant = 3
+    obj.iexp = obj.nexp
+    obj.maxexp = 8
+    obj.minexp = -6
+    obj.precision = 1
+    obj.resolution = float8_e4m3(resolution)
+    # pylint: disable=protected-access
+    obj._machar = _Float8E4m3MachArLike()
+    if not hasattr(obj, "tiny"):
+      obj.tiny = float8_e4m3(tiny)
+    if not hasattr(obj, "smallest_normal"):
+      obj.smallest_normal = obj._machar.smallest_normal
+    obj.smallest_subnormal = obj._machar.smallest_subnormal
+
+    obj._str_tiny = float_to_str(tiny)
+    obj._str_smallest_normal = float_to_str(tiny)
+    obj._str_smallest_subnormal = float_to_str(obj.smallest_subnormal)
+    obj._str_max = float_to_str(max_)
+    obj._str_epsneg = float_to_str(epsneg)
+    obj._str_eps = float_to_str(eps)
+    obj._str_resolution = float_to_str(resolution)
+    # pylint: enable=protected-access
+    return obj
+
   @staticmethod
   def _float8_e4m3b11fnuz_finfo():
     def float_to_str(f):
@@ -369,6 +424,14 @@ def __new__(cls, dtype):
       if _bfloat16_dtype not in cls._finfo_cache:
         cls._finfo_cache[_bfloat16_dtype] = cls._bfloat16_finfo()
       return cls._finfo_cache[_bfloat16_dtype]
+    if (
+        isinstance(dtype, str)
+        and dtype == "float8_e4m3"
+        or dtype == _float8_e4m3_dtype
+    ):
+      if _float8_e4m3_dtype not in cls._finfo_cache:
+        cls._finfo_cache[_float8_e4m3_dtype] = cls._float8_e4m3_finfo()
+      return cls._finfo_cache[_float8_e4m3_dtype]
     if (
         isinstance(dtype, str)
         and dtype == "float8_e4m3b11fnuz"

diff --git a/ml_dtypes/_src/dtypes.cc b/ml_dtypes/_src/dtypes.cc
@@ -60,6 +60,20 @@ struct TypeDescriptor<bfloat16> : CustomFloatType<bfloat16> {
   static constexpr char kNpyDescrByteorder = '=';
 };
 
+template <>
+struct TypeDescriptor<float8_e4m3> : CustomFloatType<float8_e4m3> {
+  typedef float8_e4m3 T;
+  static constexpr bool is_floating = true;
+  static constexpr bool is_integral = false;
+  static constexpr const char* kTypeName = "float8_e4m3";
+  static constexpr const char* kQualifiedTypeName = "ml_dtypes.float8_e4m3";
+  static constexpr const char* kTpDoc = "float8_e4m3 floating-point values";
+  // Set e4m3 kind as Void since kind=f (float) with itemsize=1 is used by e5m2
+  static constexpr char kNpyDescrKind = 'V';       // Void
+  static constexpr char kNpyDescrType = '7';       // '4' is reserved for e4m3fn
+  static constexpr char kNpyDescrByteorder = '=';  // Native byte order
+};
+
 template <>
 struct TypeDescriptor<float8_e4m3b11fnuz>
     : CustomFloatType<float8_e4m3b11fnuz> {
@@ -269,6 +283,9 @@ bool Initialize() {
   if (!RegisterFloatDtype<bfloat16>(numpy.get())) {
     return false;
   }
+  if (!RegisterFloatDtype<float8_e4m3>(numpy.get())) {
+    return false;
+  }
   if (!RegisterFloatDtype<float8_e4m3b11fnuz>(numpy.get())) {
     return false;
   }
@@ -319,6 +336,12 @@ bool Initialize() {
   success &= RegisterTwoWayCustomCast<float8_e5m2fnuz, float8_e4m3fn, float>();
   success &= RegisterTwoWayCustomCast<float8_e4m3fnuz, float8_e5m2, float>();
   success &= RegisterTwoWayCustomCast<float8_e5m2fnuz, float8_e5m2, float>();
+  success &= RegisterTwoWayCustomCast<float8_e4m3, bfloat16, float>();
+  success &= RegisterTwoWayCustomCast<float8_e4m3, float8_e4m3b11fnuz, float>();
+  success &= RegisterTwoWayCustomCast<float8_e4m3, float8_e5m2fnuz, float>();
+  success &= RegisterTwoWayCustomCast<float8_e4m3, float8_e4m3fnuz, float>();
+  success &= RegisterTwoWayCustomCast<float8_e4m3, float8_e4m3fn, float>();
+  success &= RegisterTwoWayCustomCast<float8_e4m3, float8_e5m2, float>();
   success &= RegisterOneWayCustomCast<int2, int4, int8_t>();
   success &= RegisterOneWayCustomCast<uint2, uint4, uint8_t>();
   return success;
@@ -349,6 +372,11 @@ extern "C" EXPORT_SYMBOL PyObject* PyInit__ml_dtypes_ext() {
     return nullptr;
   }
 
+  if (PyObject_SetAttrString(m.get(), "float8_e4m3",
+                             reinterpret_cast<PyObject*>(
+                                 TypeDescriptor<float8_e4m3>::type_ptr)) < 0) {
+    return nullptr;
+  }
   if (PyObject_SetAttrString(
           m.get(), "float8_e4m3b11fnuz",
           reinterpret_cast<PyObject*>(