3 changes: 3 additions & 0 deletions hist/histv7/benchmark/CMakeLists.txt
@@ -1,3 +1,6 @@
add_executable(hist_benchmark_atomic hist_benchmark_atomic.cxx)
target_link_libraries(hist_benchmark_atomic ROOTHist benchmark::benchmark)

add_executable(hist_benchmark_axes hist_benchmark_axes.cxx)
target_link_libraries(hist_benchmark_axes ROOTHist benchmark::benchmark)

125 changes: 125 additions & 0 deletions hist/histv7/benchmark/hist_benchmark_atomic.cxx
@@ -0,0 +1,125 @@
#include <ROOT/RBinWithError.hxx>
#include <ROOT/RHistUtils.hxx>

// Note: In principle, the benchmark library supports multithreaded benchmarks. However, the output has historically
// been confusing (wrong): https://github.com/google/benchmark/issues/1834 This was changed / fixed in version 1.9.0:
// https://github.com/google/benchmark/pull/1836 Even then, the minimum benchmark time is not correctly respected (in
// the way I would expect): https://github.com/google/benchmark/issues/2117
// Moreover, measuring contention heavily depends on the machine, cache coherency, thread placement / binding, and other
// factors. Reproducibility is very hard to achieve with the limited control that the benchmark library offers.
// For these reasons, the following benchmarks are not automatically run with multiple threads to benchmark contention.
// Care should be taken when manually changing this and when interpreting the results!
// To keep this possibility though, the benchmarks MUST NOT use benchmark::DoNotOptimize() on non-const references to
// shared variables: Older compilers (GCC before version 12.1.0) will load and store the value through the reference,
// which leads to data races!
#include <benchmark/benchmark.h>

#include <cstddef>

struct RHistAtomic_int : public benchmark::Fixture {
int fAtomic = 0;
};

BENCHMARK_DEFINE_F(RHistAtomic_int, Add)(benchmark::State &state)
{
for (auto _ : state) {
fAtomic += 1;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_int, Add);

BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAdd);

struct RHistAtomic_float : public benchmark::Fixture {
float fAtomic = 0;
};

BENCHMARK_DEFINE_F(RHistAtomic_float, Add)(benchmark::State &state)
{
for (auto _ : state) {
fAtomic += 1.0f;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_float, Add);

BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1.0f);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAdd);

struct RHistAtomic_double : public benchmark::Fixture {
double fAtomic = 0;
};

BENCHMARK_DEFINE_F(RHistAtomic_double, Add)(benchmark::State &state)
{
for (auto _ : state) {
fAtomic += 1.0;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_double, Add);

BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1.0);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAdd);

struct RBinWithError : public benchmark::Fixture {
ROOT::Experimental::RBinWithError fBin;
};

BENCHMARK_DEFINE_F(RBinWithError, Inc)(benchmark::State &state)
{
for (auto _ : state) {
fBin++;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, Inc);

BENCHMARK_DEFINE_F(RBinWithError, AtomicInc)(benchmark::State &state)
{
for (auto _ : state) {
fBin.AtomicInc();
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, AtomicInc);

BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
{
for (auto _ : state) {
fBin += 1.0;
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, Add);

BENCHMARK_DEFINE_F(RBinWithError, AtomicAdd)(benchmark::State &state)
{
for (auto _ : state) {
fBin.AtomicAdd(1.0);
benchmark::ClobberMemory();
}
}
BENCHMARK_REGISTER_F(RBinWithError, AtomicAdd);

BENCHMARK_MAIN();
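
As an aside on the header comment in this file: the sketch below is not part of the patch (the benchmark name AtomicAddRacy is made up for illustration) and shows the pattern that comment warns against. On GCC before 12.1.0, benchmark::DoNotOptimize() on a non-const reference loads and stores the value through the reference, so applying it to a shared fixture member would reintroduce a data race if the benchmark were run with multiple threads; the benchmarks above use benchmark::ClobberMemory() instead.

BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAddRacy)(benchmark::State &state)
{
   for (auto _ : state) {
      ROOT::Experimental::Internal::AtomicAdd(&fAtomic, 1);
      // Do NOT do this on a shared variable: older GCC compiles it to a plain
      // load + store of fAtomic, which races with other threads.
      benchmark::DoNotOptimize(fAtomic);
   }
}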
45 changes: 32 additions & 13 deletions hist/histv7/inc/ROOT/RBinWithError.hxx
@@ -7,6 +7,8 @@

#include "RHistUtils.hxx"

#include <cmath>

namespace ROOT {
namespace Experimental {

@@ -58,26 +60,43 @@ struct RBinWithError final {
return *this;
}

void AtomicInc()
private:
void AtomicAdd(double a, double a2)
@pcanal (Member) commented on Feb 2, 2026:

Suggested change:
-    void AtomicAdd(double a, double a2)
+    void SumAtomicAdd(double a, double a2)

As this specifically adds atomically to the sum/sum2. Or:

Suggested change:
-    void AtomicAdd(double a, double a2)
+    void AtomicAddToSum(double a, double a2)

The PR author (Member) replied:

I would prefer to keep the current naming scheme: user-facing functions, such as FillAtomic, are implemented in terms of increasingly lower-level AtomicAdd functions. Given that this is a private function used to implement the other AtomicAdd overloads just below, I think the current name is sufficient.

{
Internal::AtomicInc(&fSum);
Internal::AtomicInc(&fSum2);
// The sum of squares is always non-negative. Use the sign bit to implement a cheap spin lock.
double origSum2;
Internal::AtomicLoad(&fSum2, &origSum2);

while (true) {
// Repeat loads from memory until we see a non-negative value.
// NB: do not use origSum2 < 0, it does not work for -0.0!
while (std::signbit(origSum2)) {
Internal::AtomicLoad(&fSum2, &origSum2);
}

// The variable appears to be unlocked, confirm with a compare-exchange.
double negated = std::copysign(origSum2, -1.0);
if (Internal::AtomicCompareExchangeAcquire(&fSum2, &origSum2, &negated)) {
break;
}
}

// By using a spin lock, we do not need atomic operations for fSum.
fSum += a;

double sum2 = origSum2 + a2;
Internal::AtomicStoreRelease(&fSum2, &sum2);
}

void AtomicAdd(double w)
{
Internal::AtomicAdd(&fSum, w);
Internal::AtomicAdd(&fSum2, w * w);
}
public:
void AtomicInc() { AtomicAdd(1.0, 1.0); }

void AtomicAdd(double w) { AtomicAdd(w, w * w); }

/// Add another bin content using atomic instructions.
///
/// \param[in] rhs another bin content that must not be modified during the operation
void AtomicAdd(const RBinWithError &rhs)
{
Internal::AtomicAdd(&fSum, rhs.fSum);
Internal::AtomicAdd(&fSum2, rhs.fSum2);
}
void AtomicAdd(const RBinWithError &rhs) { AtomicAdd(rhs.fSum, rhs.fSum2); }
};

} // namespace Experimental
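
For orientation, here is a standalone sketch of the sign-bit spin lock that the private AtomicAdd(double, double) above implements, written against plain std::atomic instead of the Internal:: wrappers used in the patch (all names and types in this sketch are illustrative only):

#include <atomic>
#include <cmath>

struct BinSketch {
   double fSum = 0.0;
   std::atomic<double> fSum2{0.0};

   void Add(double a, double a2)
   {
      // fSum2 is always non-negative, so a set sign bit can serve as the "locked" flag.
      double orig = fSum2.load(std::memory_order_relaxed);
      while (true) {
         // Spin until the stored value is non-negative (unlocked). Use std::signbit
         // instead of (orig < 0) so that -0.0 also counts as locked.
         while (std::signbit(orig))
            orig = fSum2.load(std::memory_order_relaxed);
         // Try to take the lock by flipping the sign bit.
         double negated = std::copysign(orig, -1.0);
         if (fSum2.compare_exchange_strong(orig, negated, std::memory_order_acquire, std::memory_order_relaxed))
            break;
      }
      // The lock protects fSum, so a plain non-atomic update suffices.
      fSum += a;
      // Publish the new sum of squares; the release store also releases the lock.
      fSum2.store(orig + a2, std::memory_order_release);
   }
};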
42 changes: 42 additions & 0 deletions hist/histv7/inc/ROOT/RHistUtils.hxx
@@ -5,6 +5,7 @@
#ifndef ROOT_RHistUtils
#define ROOT_RHistUtils

#include <atomic>
#include <type_traits>

#ifdef _MSC_VER
@@ -33,6 +34,10 @@ struct AtomicOps<1> {
{
*static_cast<char *>(ret) = __iso_volatile_load8(static_cast<const char *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store8(static_cast<char *>(ptr), *static_cast<char *>(val));
}
static void Add(void *ptr, const void *val)
{
_InterlockedExchangeAdd8(static_cast<char *>(ptr), *static_cast<const char *>(val));
@@ -57,6 +62,10 @@ struct AtomicOps<2> {
{
*static_cast<short *>(ret) = __iso_volatile_load16(static_cast<const short *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store16(static_cast<short *>(ptr), *static_cast<short *>(val));
}
static void Add(void *ptr, const void *val)
{
_InterlockedExchangeAdd16(static_cast<short *>(ptr), *static_cast<const short *>(val));
@@ -81,6 +90,10 @@ struct AtomicOps<4> {
{
*static_cast<int *>(ret) = __iso_volatile_load32(static_cast<const int *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store32(static_cast<int *>(ptr), *static_cast<int *>(val));
}
static void Add(void *ptr, const void *val)
{
_InterlockedExchangeAdd(static_cast<long *>(ptr), *static_cast<const long *>(val));
@@ -105,6 +118,10 @@ struct AtomicOps<8> {
{
*static_cast<__int64 *>(ret) = __iso_volatile_load64(static_cast<const __int64 *>(ptr));
}
static void Store(void *ptr, void *val)
{
__iso_volatile_store64(static_cast<__int64 *>(ptr), *static_cast<__int64 *>(val));
}
static void Add(void *ptr, const void *val);
static bool CompareExchange(void *ptr, void *expected, const void *desired)
{
@@ -132,6 +149,18 @@ void AtomicLoad(const T *ptr, T *ret)
#endif
}

template <typename T>
void AtomicStoreRelease(T *ptr, T *val)
{
#ifndef _MSC_VER
__atomic_store(ptr, val, __ATOMIC_RELEASE);
#else
// Cannot specify the memory order directly, use a fence.
std::atomic_thread_fence(std::memory_order_release);
MSVC::AtomicOps<sizeof(T)>::Store(ptr, val);
#endif
}

template <typename T>
bool AtomicCompareExchange(T *ptr, T *expected, T *desired)
{
@@ -142,6 +171,19 @@ bool AtomicCompareExchange(T *ptr, T *expected, T *desired)
#endif
}

template <typename T>
bool AtomicCompareExchangeAcquire(T *ptr, T *expected, T *desired)
{
#ifndef _MSC_VER
return __atomic_compare_exchange(ptr, expected, desired, /*weak=*/false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
#else
bool success = MSVC::AtomicOps<sizeof(T)>::CompareExchange(ptr, expected, desired);
// Cannot specify the memory order directly, use an unconditional fence to avoid branching code.
std::atomic_thread_fence(std::memory_order_acquire);
return success;
#endif
}
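// For orientation (sketch only, not part of the patch): with std::atomic, the two new
// primitives would correspond to
//    x.store(v, std::memory_order_release);                    // AtomicStoreRelease
//    x.compare_exchange_strong(expected, desired,
//                              std::memory_order_acquire,
//                              std::memory_order_relaxed);     // AtomicCompareExchangeAcquire
// On MSVC the intrinsics take no memory-order argument, hence the separate fences above,
// which provide at least the required ordering.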

template <typename T>
void AtomicAddCompareExchangeLoop(T *ptr, T val)
{
28 changes: 28 additions & 0 deletions hist/histv7/test/hist_atomic.cxx
@@ -53,3 +53,31 @@ TEST(AtomicAdd, FloatDouble)
ROOT::Experimental::Internal::AtomicAdd(&a, b);
EXPECT_EQ(a, 3);
}

TEST(RBinWithError, AtomicAdd)
{
RBinWithError bin;
bin.fSum = 1;
bin.fSum2 = 2;
bin.AtomicAdd(1.5);
EXPECT_EQ(bin.fSum, 2.5);
EXPECT_EQ(bin.fSum2, 4.25);
}

TEST(RBinWithError, StressAtomicAdd)
{
static constexpr double Addend = 2.0;
static constexpr std::size_t NThreads = 4;
static constexpr std::size_t NAddsPerThread = 8000;
static constexpr std::size_t NAdds = NThreads * NAddsPerThread;

RBinWithError bin;
StressInParallel(NThreads, [&] {
for (std::size_t i = 0; i < NAddsPerThread; i++) {
bin.AtomicAdd(Addend);
}
});

EXPECT_EQ(bin.fSum, NAdds * Addend);
EXPECT_EQ(bin.fSum2, NAdds * Addend * Addend);
}