Skip to content

Commit

Permalink
fat-seer-6 (#196)
Browse files Browse the repository at this point in the history
Elo   | 14.98 +- 6.61 (95%)
SPRT  | 40.0+0.40s Threads=1 Hash=64MB
LLR   | 2.97 (-2.94, 2.94) [0.00, 5.00]
Games | N: 2600 W: 705 L: 593 D: 1302
Penta | [5, 242, 698, 346, 9]
bench: 4173348
  • Loading branch information
connormcmonigle authored Nov 23, 2024
1 parent bb4ebcf commit b04754b
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 356 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ Seer is an original, strong UCI chess engine. Seer relies on a neural network es
The latest network can be found [here](https://github.com/connormcmonigle/seer-training/releases)
```
cd build
wget -O eval.bin https://github.com/connormcmonigle/seer-training/releases/download/0x35ddef41/q0x35ddef41.bin
wget -O eval.bin https://github.com/connormcmonigle/seer-training/releases/download/0x2291e0ff/q0x2291e0ff.bin
make pgo EVALFILE=eval.bin
```
2 changes: 1 addition & 1 deletion build/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ EXE = seer
CXX = g++

CXXSTANDARD = 17
EVALFILE = weights/q0x35ddef41.bin
EVALFILE = weights/q0x2291e0ff.bin
OPSLIMIT = 1000000000

CXXSRC += $(wildcard ../src/*.cc )
Expand Down
62 changes: 43 additions & 19 deletions include/nnue/dense_relu_affine_layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,63 +22,87 @@
#include <nnue/dot_type.h>
#include <nnue/simd.h>

#include <algorithm>
#include <cstddef>

namespace nnue {

template <typename T, std::size_t dim0, std::size_t dim1>
template <std::size_t dim0, std::size_t dim1, typename T, typename I = T, typename O = dot_type<I>>
struct dense_relu_affine_layer {
static constexpr std::size_t W_numel = dim0 * dim1;
static constexpr std::size_t b_numel = dim1;

alignas(simd::alignment) T W[W_numel];
alignas(simd::alignment) dot_type<T> b[b_numel];
alignas(simd::alignment) O b[b_numel];

[[nodiscard]] constexpr std::size_t num_parameters() const noexcept { return W_numel + b_numel; }

[[nodiscard]] inline aligned_vector<dot_type<T>, dim1> forward(const aligned_vector<T, dim0>& x) const noexcept {
auto result = aligned_vector<dot_type<T>, dim1>::from(b);
[[nodiscard]] inline aligned_vector<O, dim1> forward_relu(const aligned_vector<I, dim0>& x) const noexcept {
auto result = aligned_vector<O, dim1>::from(b);
simd::relu_matrix_vector_product<dim0, dim1>(W, x.data, result.data);
return result;
}

[[nodiscard]] inline aligned_vector<dot_type<T>, dim1> forward(const aligned_slice<T, dim0>& x) const noexcept {
auto result = aligned_vector<dot_type<T>, dim1>::from(b);
[[nodiscard]] inline aligned_vector<O, dim1> forward_relu(const aligned_slice<I, dim0>& x) const noexcept {
auto result = aligned_vector<O, dim1>::from(b);
simd::relu_matrix_vector_product<dim0, dim1>(W, x.data, result.data);
return result;
}

[[nodiscard]] inline aligned_vector<O, dim1> forward_crelu255(const aligned_vector<I, dim0>& x) const noexcept {
auto result = aligned_vector<O, dim1>::from(b);
simd::crelu255_matrix_vector_product<dim0, dim1>(W, x.data, result.data);
return result;
}

[[nodiscard]] inline aligned_vector<O, dim1> forward_crelu255(const aligned_slice<I, dim0>& x) const noexcept {
auto result = aligned_vector<O, dim1>::from(b);
simd::crelu255_matrix_vector_product<dim0, dim1>(W, x.data, result.data);
return result;
}

template <typename streamer_type>
[[maybe_unused]] dense_relu_affine_layer<T, dim0, dim1>& load_(streamer_type& streamer) noexcept {
streamer.template stream<T>(W, W_numel).template stream<dot_type<T>>(b, b_numel);
[[maybe_unused]] dense_relu_affine_layer<dim0, dim1, T, I, O>& load_(streamer_type& streamer) noexcept {
streamer.template stream<T>(W, W_numel).template stream<O>(b, b_numel);
return *this;
}

template <typename exporter_type>
[[maybe_unused]] const dense_relu_affine_layer<T, dim0, dim1>& write_(exporter_type& exporter) const noexcept {
exporter.template write<T>(W, W_numel).template write<dot_type<T>>(b, b_numel);
[[maybe_unused]] const dense_relu_affine_layer<dim0, dim1, T, I, O>& write_(exporter_type& exporter) const noexcept {
exporter.template write<T>(W, W_numel).template write<O>(b, b_numel);
return *this;
}

[[nodiscard]] dense_relu_affine_layer<T, dim0, dim1> half_input_flipped() const noexcept {
[[nodiscard]] dense_relu_affine_layer<dim0, dim1, T, I, O> half_input_flipped() const noexcept {
static_assert(dim0 % 2 == 0);
constexpr std::size_t half_dim0 = dim0 / 2;

dense_relu_affine_layer<T, dim0, dim1> result = *this;
dense_relu_affine_layer<dim0, dim1, T, I, O> result = *this;

for (std::size_t i(0); i < W_numel; i += dim0) {
for (std::size_t j(0); j < half_dim0; ++j) { std::iter_swap(result.W + i + j, result.W + half_dim0 + i + j); }
}

return result;
}

template <typename U>
[[nodiscard]] dense_relu_affine_layer<U, dim0, dim1> quantized(const T& weight_scale, const T& bias_scale) const noexcept {
static_assert(std::is_floating_point_v<T> && std::is_integral_v<U>);
dense_relu_affine_layer<U, dim0, dim1> result{};
#pragma omp simd
for (std::size_t i = 0; i < W_numel; ++i) { result.W[i] = static_cast<U>(std::round(weight_scale * W[i])); }
for (std::size_t i = 0; i < b_numel; ++i) { result.b[i] = static_cast<dot_type<U>>(std::round(bias_scale * b[i])); }
template <typename Q, typename QI = Q, typename QO = dot_type<QI>>
[[nodiscard]] dense_relu_affine_layer<dim0, dim1, Q, QI, QO> quantized(const T& weight_scale, const T& bias_scale) const noexcept {
static_assert(std::is_floating_point_v<T> && std::is_integral_v<Q> && std::is_integral_v<QI> && std::is_integral_v<QO>);
dense_relu_affine_layer<dim0, dim1, Q, QI, QO> result{};

for (std::size_t i = 0; i < W_numel; ++i) {
const float lower_limit = static_cast<float>(std::numeric_limits<Q>::min());
const float upper_limit = static_cast<float>(std::numeric_limits<Q>::max());
result.W[i] = static_cast<Q>(std::clamp(std::round(weight_scale * W[i]), lower_limit, upper_limit));
}

for (std::size_t i = 0; i < b_numel; ++i) {
const float lower_limit = static_cast<float>(std::numeric_limits<QO>::min());
const float upper_limit = static_cast<float>(std::numeric_limits<QO>::max());
result.b[i] = static_cast<QO>(std::clamp(std::round(bias_scale * b[i]), lower_limit, upper_limit));
}

return result;
}
};
Expand Down
8 changes: 4 additions & 4 deletions include/nnue/eval.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ struct eval : public chess::sided<eval, feature_transformer<weights::quantized_p

// Propagate the accumulated feature vector through the fully-connected tower
// for the side-to-move perspective (pov selects white/black fc0 weights).
// fc0 uses the quantized crelu255 path and is dequantized back to
// parameter_type; the remaining layers use the float relu path with skip
// (concat) connections. Returns the encoder's view of the last hidden layer
// together with the scalar network output.
// NOTE(review): scraped diff interleaved pre-/post-commit lines; this is the
// reconstructed post-commit method.
template <typename F>
[[nodiscard]] inline propagate_data<std::invoke_result_t<F, final_output_type>> propagate(const bool pov, F&& final_output_encoder) const noexcept {
  const auto x1 = (pov ? weights_->white_fc0 : weights_->black_fc0).forward_crelu255(base_).dequantized<parameter_type>(weights::dequantization_scale);
  const auto x2 = concat(x1, weights_->fc1.forward_relu(x1));
  const auto x3 = concat(x2, weights_->fc2.forward_relu(x2));
  return propagate_data(final_output_encoder(x3), weights_->fc3.forward_relu(x3).item());
}

template <typename F = void_final_output_encoder>
Expand Down
Loading

0 comments on commit b04754b

Please sign in to comment.