From 4045540168dfc0eb871bb4e2ff141a6823478e71 Mon Sep 17 00:00:00 2001 From: Griffin Bassman Date: Fri, 10 Nov 2023 16:06:20 -0500 Subject: [PATCH] revert --- .github/workflows/vendor_build.yml | 4 +- .vscode/settings.json | 85 ++++++++++++++++++- test/run_tests.py | 4 +- .../core/include/vw/core/gd_predict.h | 16 +--- vowpalwabbit/core/src/loss_functions.cc | 29 +++---- vowpalwabbit/core/src/reductions/gd.cc | 71 +++------------- 6 files changed, 110 insertions(+), 99 deletions(-) diff --git a/.github/workflows/vendor_build.yml b/.github/workflows/vendor_build.yml index d52ba9f7f4d..e15b5ae89ab 100644 --- a/.github/workflows/vendor_build.yml +++ b/.github/workflows/vendor_build.yml @@ -22,7 +22,7 @@ jobs: build_type: ["Debug", "Release"] compiler: - { cc: "gcc", cxx: "g++"} - #- { cc: "clang", cxx: "clang++"} + - { cc: "clang", cxx: "clang++"} runs-on: ${{matrix.os}} steps: - uses: actions/checkout@v3 @@ -56,8 +56,6 @@ jobs: run: ctest --output-on-failure --no-tests=error --label-regex VWTestList --parallel 2 - name: Test run: python3 test/run_tests.py -f --clean_dirty -E 0.001 - env: - NUM_JOBS: 1 build_vendor_windows: name: core-cli.${{ matrix.os }}.amd64.${{ matrix.build_type }}.msvc.standalone runs-on: ${{matrix.os}} diff --git a/.vscode/settings.json b/.vscode/settings.json index f7a94e718de..f7ee20978da 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,5 +6,88 @@ ], "url": "./test/vwtest.schema.json" } - ] + ], + "files.associations": { + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "*.ipp": "cpp", + "any": "cpp", + "array": "cpp", + "atomic": "cpp", + "hash_map": "cpp", + "hash_set": "cpp", + "strstream": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cfenv": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "cinttypes": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "ranges": "cpp", + "semaphore": "cpp", + "span": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "valarray": "cpp", + "variant": "cpp" + } } diff --git a/test/run_tests.py b/test/run_tests.py index 23f9ae62497..ecb38118c87 100644 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -1027,7 +1027,7 @@ def main(): "-j", "--jobs", type=int, - default=1, + default=os.cpu_count(), help="Number of tests to run in parallel. 
Default is current machine core count.",
     )
     parser.add_argument(
@@ -1234,7 +1234,7 @@ def main():
     tasks: List[Future[TestOutcome]] = []
     completed_tests = Completion()
 
-    executor = ThreadPoolExecutor(max_workers=1)
+    executor = ThreadPoolExecutor(max_workers=args.jobs)
 
     for test in tests:
         tasks.append(
diff --git a/vowpalwabbit/core/include/vw/core/gd_predict.h b/vowpalwabbit/core/include/vw/core/gd_predict.h
index 886d9f7ce98..868756250f8 100644
--- a/vowpalwabbit/core/include/vw/core/gd_predict.h
+++ b/vowpalwabbit/core/include/vw/core/gd_predict.h
@@ -7,7 +7,6 @@
 #include "vw/core/example_predict.h"
 #include "vw/core/interactions_predict.h"
 #include "vw/core/v_array.h"
-#include <iostream>
 
 #undef VW_DEBUG_LOG
 #define VW_DEBUG_LOG vw_dbg::GD_PREDICT
@@ -38,15 +37,7 @@ inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& da
   for (const auto& f : fs)
   {
     VW::weight& w = weights[(f.index() + offset)];
-    //std::cout << "Upd Index: " << f.index() << "\n";
-    //std::cout << "Upd Value: " << f.value() << "\n";
-    //std::cout << "Upd Mult: " << mult << "\n";
     FuncT(dat, mult * f.value(), w);
-    //VW::weight* w_ptr = &w;
-    //std::cout << "Upd w[0]: " << w_ptr[0] << "\n";
-    //std::cout << "Upd w[1]: " << w_ptr[1] << "\n";
-    //std::cout << "Upd w[2]: " << w_ptr[2] << "\n";
-    //std::cout << "Upd w[3]: " << w_ptr[3] << "\n";
   }
 }
 
@@ -55,12 +46,7 @@ template <class DataT, void (*FuncT)(DataT&, float, float), class WeightsT>
 inline void foreach_feature(
     const WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
 {
-  for (const auto& f : fs)
-  {
-    //std::cout << "Pred Index: " << f.index() << "\n";
-    //std::cout << "Pred Weight: " << weights[static_cast<size_t>(f.index() + offset)] << "\n";
-    FuncT(dat, mult * f.value(), weights[static_cast<size_t>(f.index() + offset)]);
-  }
+  for (const auto& f : fs) { FuncT(dat, mult * f.value(), weights[static_cast<size_t>(f.index() + offset)]); }
 }
 
 template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT), class WeightsT>
diff --git a/vowpalwabbit/core/src/loss_functions.cc b/vowpalwabbit/core/src/loss_functions.cc
--- a/vowpalwabbit/core/src/loss_functions.cc
+++ b/vowpalwabbit/core/src/loss_functions.cc
 #include <cfloat>
 #include <cmath>
-#include <iostream>  // Make sure to include this at the top of your file
 
 namespace
 {
@@ -43,24 +42,16 @@ inline float squared_loss_impl_get_loss(const VW::shared_data* sd, float predict
 
 inline float squared_loss_impl_get_update(float prediction, float label, float update_scale, float pred_per_update)
 {
-  //std::cout << "Prediction: " << prediction << ", Label: " << label << ", Update Scale: " << update_scale
-  //          << ", Pred Per Update: " << pred_per_update << "\n";
-
-  if (update_scale * pred_per_update < 1e-6)
-  {
-    // std::cout << "Entering first branch (update_scale * pred_per_update < 1e-6)" << "\n";
-    float update = 2.f * (label - prediction) * update_scale;
-    // std::cout << "Update (first branch): " << update << "\n";
-    return update;
-  }
-
-  // std::cout << "Entering second branch" << "\n";
-  float exp_component = VW::details::correctedExp(-2.f * update_scale * pred_per_update);
-  // std::cout << "Exp Component: " << exp_component << "\n";
-  float update = (label - prediction) * (1.f - exp_component) / pred_per_update;
-  // std::cout << "Update (second branch): " << update << "\n";
-
-  return update;
+  if (update_scale * pred_per_update < 1e-6)
+  {
+    /* When exp(-eta_t) ~= 1 we replace 1 - exp(-eta_t)
+     * with its first-order Taylor expansion around 0
+     * to avoid catastrophic cancellation.
+     */
+    return 2.f * (label - prediction) * update_scale;
+  }
+  return (label - prediction) * (1.f - VW::details::correctedExp(-2.f * update_scale * pred_per_update)) /
+      pred_per_update;
 }
 
 inline float squared_loss_impl_get_unsafe_update(float prediction, float label, float update_scale)
diff --git a/vowpalwabbit/core/src/reductions/gd.cc b/vowpalwabbit/core/src/reductions/gd.cc
index 97027f538a2..8ac8ac6a7f4 100644
--- a/vowpalwabbit/core/src/reductions/gd.cc
+++ b/vowpalwabbit/core/src/reductions/gd.cc
@@ -16,8 +16,6 @@
 #include <cfloat>
 #include <cmath>
-#include <iostream>
-#include <iomanip>
 
 #if !defined(VW_NO_INLINE_SIMD)
 #  if !defined(__SSE2__) && (defined(_M_AMD64) || defined(_M_X64))
@@ -166,19 +164,15 @@ static inline float inv_sqrt(float x)
   // N-R iteration 2
   float32x2_t e3 = vmul_f32(e2, vrsqrts_f32(v1, vmul_f32(e2, e2)));
   // Extract result
-  std::cout << "__ARM_NEON__" << "\n";
   return vget_lane_f32(e3, 0);
 #  elif defined(__SSE2__)
   __m128 eta = _mm_load_ss(&x);
   eta = _mm_rsqrt_ss(eta);
   _mm_store_ss(&x, eta);
-  std::cout << "__SSE2__" << "\n";
 #  else
-  std::cout << "None" << "\n";
   x = quake_inv_sqrt(x);
 #  endif
 #else
-  std::cout << "VW_NO_INLINE_SIMD" << "\n";
   x = quake_inv_sqrt(x);
 #endif
 
@@ -194,12 +188,7 @@ inline void update_feature(float& update, float x, float& fw)
   bool modify = x < FLT_MAX && x > -FLT_MAX && (feature_mask_off || fw != 0.);
   if (modify)
   {
-    if VW_STD17_CONSTEXPR (spare != 0)
-    {
-      //std::cout << "Upd spare: " << w[spare] << "\n";
-      x *= w[spare];
-    }
-    //std::cout << "Upd update: " << update << "\n";
+    if VW_STD17_CONSTEXPR (spare != 0) { x *= w[spare]; }
     w[0] += update * x;
   }
 }
@@ -630,42 +619,22 @@ class power_data
 template <bool sqrt_rate, size_t adaptive, size_t normalized>
 inline float compute_rate_decay(power_data& s, float& fw)
 {
-  std::cout << std::fixed << std::setprecision(10);  // Set high precision for floating-point output
-
   VW::weight* w = &fw;
-  //std::cout << "Input fw: " << fw << std::endl;
-
   float rate_decay = 1.f;
   if (adaptive)
   {
-    if (sqrt_rate)
-    {
-      rate_decay = inv_sqrt(w[adaptive]);
-      //std::cout << "Rate decay after inv_sqrt: " << rate_decay << " (inv_sqrt of " << w[adaptive] << ")" << std::endl;
-    }
-    else
-    {
-      rate_decay = powf(w[adaptive], s.minus_power_t);
-      //std::cout << "Rate decay after powf (adaptive): " << rate_decay << " (powf of " << w[adaptive] << " ^ " << s.minus_power_t << ")" << std::endl;
-    }
+    if (sqrt_rate) { rate_decay = inv_sqrt(w[adaptive]); }
+    else { rate_decay = powf(w[adaptive], s.minus_power_t); }
   }
   if VW_STD17_CONSTEXPR (normalized != 0)
   {
     if (sqrt_rate)
     {
       float inv_norm = 1.f / w[normalized];
-      //std::cout << "Intermediate inv_norm: " << inv_norm << std::endl;
-
       if (adaptive) { rate_decay *= inv_norm; }
       else { rate_decay *= inv_norm * inv_norm; }
-
-      //std::cout << "Rate decay after normalization (sqrt_rate): " << rate_decay << std::endl;
-    }
-    else
-    {
-      rate_decay *= powf(w[normalized] * w[normalized], s.neg_norm_power);
-      //std::cout << "Rate decay after powf (normalized): " << rate_decay << " (powf of " << w[normalized] << " * " << w[normalized] << " ^ " << s.neg_norm_power << ")" << std::endl;
     }
+    else { rate_decay *= powf(w[normalized] * w[normalized], s.neg_norm_power); }
   }
   return rate_decay;
 }
@@ -745,37 +714,27 @@ template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
 float get_pred_per_update(VW::reductions::gd& g, VW::example& ec)
 {
-  std::cout << std::fixed << std::setprecision(10);  // Set high precision for floating-point output
-
+  // We must traverse the features in _precisely_ the same order as during training.
   auto& ld = ec.l.simple;
   VW::workspace& all = *g.all;
 
   float grad_squared = ec.weight;
-  if (!adax)
-  {
-    grad_squared *= all.loss_config.loss->get_square_grad(ec.pred.scalar, ld.label);
-    //std::cout << "Grad Squared: " << grad_squared << std::endl;
-  }
+  if (!adax) { grad_squared *= all.loss_config.loss->get_square_grad(ec.pred.scalar, ld.label); }
 
-  if (grad_squared == 0 && !stateless)
-  {
-    std::cout << "Returning early due to grad_squared == 0 and stateless == false" << std::endl;
-    return 1.;
-  }
+  if (grad_squared == 0 && !stateless) { return 1.; }
 
   norm_data nd = {grad_squared, 0., 0., {g.neg_power_t, g.neg_norm_power}, {0}, &g.all->logger};
-
-  // Print values in nd here if needed
-
-  VW::foreach_feature<norm_data, pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare, stateless>>(all, ec, nd);
-  // Add prints inside VW::foreach_feature if possible to check individual feature contributions
-
+  VW::foreach_feature<norm_data, pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare, stateless>>(all, ec, nd);
 
   if VW_STD17_CONSTEXPR (normalized != 0)
   {
     if (!stateless)
     {
       g.current_model_state->normalized_sum_norm_x += (static_cast<double>(ec.weight)) * nd.norm_x;
       g.current_model_state->total_weight += ec.weight;
-      g.update_multiplier = average_update(
-          static_cast<float>(g.current_model_state->total_weight),
-          static_cast<float>(g.current_model_state->normalized_sum_norm_x), g.neg_norm_power);
+      g.update_multiplier =
+          average_update(static_cast<float>(g.current_model_state->total_weight),
+              static_cast<float>(g.current_model_state->normalized_sum_norm_x), g.neg_norm_power);
     }
     else
     {
       float nsnx = static_cast<float>(g.current_model_state->normalized_sum_norm_x) + ec.weight * nd.norm_x;
       float tw = static_cast<float>(g.current_model_state->total_weight) + ec.weight;
       g.update_multiplier = average_update(tw, nsnx, g.neg_norm_power);
     }
-    //std::cout << "Update Multiplier: " << g.update_multiplier << std::endl;
     nd.pred_per_update *= g.update_multiplier;
   }
-
-  //std::cout << "Pred Per Update: " << nd.pred_per_update << std::endl;
   return nd.pred_per_update;
 }
-
 template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
 float sensitivity(VW::reductions::gd& g, VW::example& ec)
@@ -889,8 +844,6 @@ void update(VW::reductions::gd& g, VW::example& ec)
   if ((update = compute_update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(
           g, ec)) != 0.)
   {
-    //std::cout << "Mult: " << g.update_multiplier << "\n";
-    //std::cout << "Update: " << update << "\n";
    train<sparse_l2, invariant, sqrt_rate, feature_mask_off, adaptive, normalized, spare>(g, ec, update);
  }
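
For reference, here is a minimal standalone sketch of the cancellation guard
that the loss_functions.cc hunk restores. This is not VW code: std::exp stands
in for VW::details::correctedExp (which, as far as I know, only adds overflow
clamping), and the values in main are illustrative only.

    // Guarded squared-loss update; update_scale and pred_per_update play the
    // same roles as in squared_loss_impl_get_update above.
    #include <cmath>
    #include <cstdio>

    float get_update(float prediction, float label, float update_scale, float pred_per_update)
    {
      float eta_t = update_scale * pred_per_update;
      if (eta_t < 1e-6f)
      {
        // For tiny eta_t, 1 - exp(-2 * eta_t) ~= 2 * eta_t. Substituting the
        // Taylor term and cancelling pred_per_update leaves 2 * update_scale,
        // sidestepping the catastrophic cancellation in 1.f - exp(...).
        return 2.f * (label - prediction) * update_scale;
      }
      return (label - prediction) * (1.f - std::exp(-2.f * eta_t)) / pred_per_update;
    }

    int main()
    {
      // With eta_t near 1e-8, float rounds exp(-2 * eta_t) to exactly 1.0f, so
      // the unguarded formula returns 0 while the guarded one returns ~1.
      float naive = (1.f - 0.f) * (1.f - std::exp(-2.f * 0.5f * 2e-8f)) / 2e-8f;
      float guarded = get_update(0.f, 1.f, 0.5f, 2e-8f);
      std::printf("naive=%g guarded=%g\n", naive, guarded);
      return 0;
    }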
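
Similarly, for the inv_sqrt dispatch in gd.cc (which only loses its debug
prints above): the SSE2 branch keeps _mm_rsqrt_ss, a roughly 12-bit-accurate
reciprocal square-root estimate, while the NEON branch refines its estimate
with two Newton-Raphson steps via vrsqrts_f32. The scalar refine() below is
only an illustration of that step, not part of the patch, and quake_inv_sqrt
is assumed to behave like the classic bit-trick estimate.

    #include <xmmintrin.h>  // SSE: _mm_load_ss / _mm_rsqrt_ss / _mm_store_ss
    #include <cmath>
    #include <cstdio>

    // The SSE2 branch of inv_sqrt, as kept by the patch: a fast estimate of
    // 1/sqrt(x) with no refinement step.
    static float inv_sqrt_sse(float x)
    {
      __m128 eta = _mm_load_ss(&x);
      eta = _mm_rsqrt_ss(eta);
      _mm_store_ss(&x, eta);
      return x;
    }

    // One Newton-Raphson step for 1/sqrt(x): y <- y * (1.5 - 0.5 * x * y * y).
    // This is the correction the NEON branch applies twice via vrsqrts_f32.
    static float refine(float x, float y) { return y * (1.5f - 0.5f * x * y * y); }

    int main()
    {
      float x = 0.15625f;
      float est = inv_sqrt_sse(x);
      std::printf("estimate=%.7f refined=%.7f exact=%.7f\n", est, refine(x, est), 1.f / std::sqrt(x));
      return 0;
    }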