3 changes: 3 additions & 0 deletions hist/histv7/benchmark/CMakeLists.txt
@@ -1,3 +1,6 @@
add_executable(hist_benchmark_atomic hist_benchmark_atomic.cxx)
target_link_libraries(hist_benchmark_atomic ROOTHist benchmark::benchmark)

add_executable(hist_benchmark_axes hist_benchmark_axes.cxx)
target_link_libraries(hist_benchmark_axes ROOTHist benchmark::benchmark)

125 changes: 125 additions & 0 deletions hist/histv7/benchmark/hist_benchmark_atomic.cxx
@@ -0,0 +1,125 @@
#include <ROOT/RBinWithError.hxx>
#include <ROOT/RHistUtils.hxx>

// Note: In principle, the benchmark library supports multithreaded benchmarks. However, the output has historically
// been confusing (wrong): https://github.com/google/benchmark/issues/1834 This was changed / fixed in version 1.9.0:
// https://github.com/google/benchmark/pull/1836 Even then, the minimum benchmark time is not correctly respected (in
// the way I would expect): https://github.com/google/benchmark/issues/2117
// Moreover, measuring contention heavily depends on the machine, cache coherency, thread placement / binding, and other
// factors. Reproducibility is very hard to achieve with the limited control that the benchmark library offers.
// For these reasons, the following benchmarks are not automatically run with multiple threads to benchmark contention.
// Care should be taken when manually changing this and when interpreting the results!
// To keep this possibility though, the benchmarks MUST NOT use benchmark::DoNotOptimize() on non-const references to
// shared variables: Older compilers (GCC before version 12.1.0) will load and store the value through the reference,
// which leads to data races!
#include <benchmark/benchmark.h>

#include <cstddef>

struct RHistAtomic_int : public benchmark::Fixture {
int fAtomic = 0;
};

BENCHMARK_DEFINE_F(RHistAtomic_int, Add)(benchmark::State &state)
{
for (auto _ : state) {
fAtomic += 1;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_int, Add);

BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAdd);

struct RHistAtomic_float : public benchmark::Fixture {
float fAtomic = 0;
};

BENCHMARK_DEFINE_F(RHistAtomic_float, Add)(benchmark::State &state)
{
for (auto _ : state) {
fAtomic += 1.0f;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_float, Add);

BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1.0f);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAdd);

struct RHistAtomic_double : public benchmark::Fixture {
double fAtomic = 0;
};

BENCHMARK_DEFINE_F(RHistAtomic_double, Add)(benchmark::State &state)
{
for (auto _ : state) {
fAtomic += 1.0;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_double, Add);

BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1.0);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAdd);

struct RBinWithError : public benchmark::Fixture {
ROOT::Experimental::RBinWithError fBin;
};

BENCHMARK_DEFINE_F(RBinWithError, Inc)(benchmark::State &state)
{
for (auto _ : state) {
fBin++;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, Inc);

BENCHMARK_DEFINE_F(RBinWithError, AtomicInc)(benchmark::State &state)
{
for (auto _ : state) {
fBin.AtomicInc();
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, AtomicInc);

BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
{
for (auto _ : state) {
fBin += 1.0;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, Add);

BENCHMARK_DEFINE_F(RBinWithError, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
fBin.AtomicAdd(1.0);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, AtomicAdd);

BENCHMARK_MAIN();
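
As an aside on the header comment in this file: the sketch below is not part of the patch (the benchmark name AtomicAddRacy is made up for illustration) and shows the pattern that comment warns against. On GCC before 12.1.0, benchmark::DoNotOptimize() on a non-const reference loads and stores the value through the reference, so applying it to a shared fixture member would reintroduce a data race if the benchmark were run with multiple threads; the benchmarks above use benchmark::ClobberMemory() instead.

BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAddRacy)(benchmark::State &state)
{
   for (auto _ : state) {
      ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1);
      // Do NOT do this on a shared variable: older GCC compiles it to a plain
      // load + store of fAtomic, which races with other threads.
      benchmark::DoNotOptimize(fAtomic);
   }
}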
45 changes: 32 additions & 13 deletions hist/histv7/inc/ROOT/RBinWithError.hxx
@@ -7,6 +7,8 @@

#include "RHistUtils.hxx"

#include <cmath>

namespace ROOT {
namespace Experimental {

@@ -58,26 +60,43 @@ struct RBinWithError final {
return *this;
}

void AtomicInc()
private:
void AtomicAdd(double a, double a2)
@pcanal (Member) commented on Feb 2, 2026:

Suggested change:
-    void AtomicAdd(double a, double a2)
+    void SumAtomicAdd(double a, double a2)

As this specifically adds atomically to the sum/sum2. Or:

Suggested change:
-    void AtomicAdd(double a, double a2)
+    void AtomicAddToSum(double a, double a2)

The PR author (Member) replied:

I would prefer to keep the current naming scheme: user-facing functions, such as FillAtomic, are implemented in terms of increasingly lower-level AtomicAdd functions. Given that this is a private function used to implement the other AtomicAdd overloads just below, I think the current name is sufficient.

{
Internal::AtomicInc(&fSum);
Internal::AtomicInc(&fSum2);
// The sum of squares is always non-negative. Use the sign bit to implement a cheap spin lock.
double origSum2;
Internal::AtomicLoad(&fSum2, &origSum2);

while (true) {
// Repeat loads from memory until we see a non-negative value.
// NB: do not use origSum2 < 0, it does not work for -0.0!
while (std::signbit(origSum2)) {
Internal::AtomicLoad(&fSum2, &origSum2);
}

// The variable appears to be unlocked, confirm with a compare-exchange.
double negated = std::copysign(origSum2, -1.0);
if (Internal::AtomicCompareExchangeAcquire(&fSum2, &origSum2, &negated)) {
break;
}
}

// By using a spin lock, we do not need atomic operations for fSum.
fSum += a;

double sum2 = origSum2 + a2;
Internal::AtomicStoreRelease(&fSum2, &sum2);
}

void AtomicAdd(double w)
{
Internal::AtomicAdd(&fSum, w);
Internal::AtomicAdd(&fSum2, w * w);
}
public:
void AtomicInc() { AtomicAdd(1.0, 1.0); }

void AtomicAdd(double w) { AtomicAdd(w, w * w); }

/// Add another bin content using atomic instructions.
///
/// \param[in] rhs another bin content that must not be modified during the operation
void AtomicAdd(const RBinWithError &rhs)
{
Internal::AtomicAdd(&fSum, rhs.fSum);
Internal::AtomicAdd(&fSum2, rhs.fSum2);
}
void AtomicAdd(const RBinWithError &rhs) { AtomicAdd(rhs.fSum, rhs.fSum2); }
};

} // namespace Experimental
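
For orientation, here is a standalone sketch of the sign-bit spin lock that the private AtomicAdd(double, double) above implements, written against plain std::atomic instead of the Internal:: wrappers used in the patch (all names and types in this sketch are illustrative only):

#include <atomic>
#include <cmath>

struct BinSketch {
   double fSum = 0.0;
   std::atomic<double> fSum2{0.0};

   void Add(double a, double a2)
   {
      // fSum2 is always non-negative, so a set sign bit can serve as the "locked" flag.
      double orig = fSum2.load(std::memory_order_relaxed);
      while (true) {
         // Spin until the stored value is non-negative (unlocked). Use std::signbit
         // instead of (orig < 0) so that -0.0 also counts as locked.
         while (std::signbit(orig))
            orig = fSum2.load(std::memory_order_relaxed);
         // Try to take the lock by flipping the sign bit.
         double negated = std::copysign(orig, -1.0);
         if (fSum2.compare_exchange_strong(orig, negated, std::memory_order_acquire, std::memory_order_relaxed))
            break;
      }
      // The lock protects fSum, so a plain non-atomic update suffices.
      fSum += a;
      // Publish the new sum of squares; the release store also releases the lock.
      fSum2.store(orig + a2, std::memory_order_release);
   }
};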
42 changes: 42 additions & 0 deletions hist/histv7/inc/ROOT/RHistUtils.hxx
@@ -5,6 +5,7 @@
#ifndef ROOT_RHistUtils
#define ROOT_RHistUtils

#include <atomic>
#include <type_traits>

#ifdef _MSC_VER
@@ -33,6 +34,10 @@ struct AtomicOps<1> {
{
*static_cast<char *>(ret) = __iso_volatile_load8(static_cast<const char *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store8(static_cast<char *>(ptr), *static_cast<char *>(val));
}
static void Add(void *ptr, const void *val)
{
_InterlockedExchangeAdd8(static_cast<char *>(ptr), *static_cast<const char *>(val));
@@ -57,6 +62,10 @@ struct AtomicOps<2> {
{
*static_cast<short *>(ret) = __iso_volatile_load16(static_cast<const short *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store16(static_cast<short *>(ptr), *static_cast<short *>(val));
}
static void Add(void *ptr, const void *val)
{
_InterlockedExchangeAdd16(static_cast<short *>(ptr), *static_cast<const short *>(val));
@@ -81,6 +90,10 @@ struct AtomicOps<4> {
{
*static_cast<int *>(ret) = __iso_volatile_load32(static_cast<const int *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store32(static_cast<int *>(ptr), *static_cast<int *>(val));
}
static void Add(void *ptr, const void *val)
{
_InterlockedExchangeAdd(static_cast<long *>(ptr), *static_cast<const long *>(val));
@@ -105,6 +118,10 @@ struct AtomicOps<8> {
{
*static_cast<__int64 *>(ret) = __iso_volatile_load64(static_cast<const __int64 *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store64(static_cast<__int64 *>(ptr), *static_cast<__int64 *>(val));
}
static void Add(void *ptr, const void *val);
static bool CompareExchange(void *ptr, void *expected, const void *desired)
{
@@ -132,6 +149,18 @@ void AtomicLoad(const T *ptr, T *ret)
#endif
}

template <typename T>
void AtomicStoreRelease(T *ptr, T *val)
{
#ifndef _MSC_VER
__atomic_store(ptr, val, __ATOMIC_RELEASE);
#else
// Cannot specify the memory order directly, use a fence.
std::atomic_thread_fence(std::memory_order_release);
MSVC::AtomicOps<sizeof(T)>::Store(ptr, val);
#endif
}

template <typename T>
bool AtomicCompareExchange(T *ptr, T *expected, T *desired)
{
@@ -142,6 +171,19 @@ bool AtomicCompareExchange(T *ptr, T *expected, T *desired)
#endif
}

template <typename T>
bool AtomicCompareExchangeAcquire(T *ptr, T *expected, T *desired)
{
#ifndef _MSC_VER
return __atomic_compare_exchange(ptr, expected, desired, /*weak=*/false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
#else
bool success = MSVC::AtomicOps<sizeof(T)>::CompareExchange(ptr, expected, desired);
// Cannot specify the memory order directly, use an unconditional fence to avoid branching code.
std::atomic_thread_fence(std::memory_order_acquire);
return success;
#endif
}
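// For orientation (sketch only, not part of the patch): with std::atomic, the two new
// primitives would correspond to
//    x.store(v, std::memory_order_release);                    // AtomicStoreRelease
//    x.compare_exchange_strong(expected, desired,
//                              std::memory_order_acquire,
//                              std::memory_order_relaxed);     // AtomicCompareExchangeAcquire
// On MSVC the intrinsics take no memory-order argument, hence the separate fences above,
// which provide at least the required ordering.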

template <typename T>
void AtomicAddCompareExchangeLoop(T *ptr, T val)
{
28 changes: 28 additions & 0 deletions hist/histv7/test/hist_atomic.cxx
@@ -53,3 +53,31 @@ TEST(AtomicAdd, FloatDouble)
ROOT::Experimental::Internal::AtomicAdd(&a, b);
EXPECT_EQ(a, 3);
}

TEST(RBinWithError, AtomicAdd)
{
RBinWithError bin;
bin.fSum = 1;
bin.fSum2 = 2;
bin.AtomicAdd(1.5);
EXPECT_EQ(bin.fSum, 2.5);
EXPECT_EQ(bin.fSum2, 4.25);
}

TEST(RBinWithError, StressAtomicAdd)
{
static constexpr double Addend = 2.0;
static constexpr std::size_t NThreads = 4;
static constexpr std::size_t NAddsPerThread = 8000;
static constexpr std::size_t NAdds = NThreads * NAddsPerThread;

RBinWithError bin;
StressInParallel(NThreads, [&] {
for (std::size_t i = 0; i < NAddsPerThread; i++) {
bin.AtomicAdd(Addend);
}
});

EXPECT_EQ(bin.fSum, NAdds * Addend);
EXPECT_EQ(bin.fSum2, NAdds * Addend * Addend);
}