Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 40 additions & 24 deletions folly/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -714,24 +714,61 @@ folly_add_library(
)

folly_add_library(
NAME memcpy
NAME memset-impl
SRCS
FollyMemset.cpp
$<$<BOOL:${IS_AARCH64_ARCH}>:memset_select_aarch64.cpp>
DEPS
$<$<BOOL:${IS_AARCH64_ARCH}>:folly_external_aor_memset_aarch64>
)

folly_add_library(
NAME memset
HEADERS
FollyMemcpy.h
FollyMemset.h
EXPORTED_DEPS
folly_memcpy-impl
folly_memset-impl
)

# memset-use: separate build of the FollyMemset sources.
# Every $<$<BOOL:${IS_AARCH64_ARCH}>:...> entry below is a generator
# expression that expands to its payload only when IS_AARCH64_ARCH is true
# and to an empty string otherwise, so the extra source, dependency and
# define apply to AArch64 builds only.
# NOTE(review): EXCLUDE_FROM_MONOLITH presumably keeps this target out of the
# combined folly library -- confirm against the folly_add_library definition.
folly_add_library(
NAME memset-use
EXCLUDE_FROM_MONOLITH
SRCS
FollyMemset.cpp
$<$<BOOL:${IS_AARCH64_ARCH}>:memset_select_aarch64.cpp>
DEPS
# AArch64 only: links the "-use" flavour of the external AOR memset target.
$<$<BOOL:${IS_AARCH64_ARCH}>:folly_external_aor_memset_aarch64-use>
COMPILE_OPTIONS
# AArch64 only: defines FOLLY_MEMSET_IS_MEMSET for these sources.
# NOTE(review): the macro's effect is decided in the C++ sources -- verify there.
$<$<BOOL:${IS_AARCH64_ARCH}>:-DFOLLY_MEMSET_IS_MEMSET>
)

# memcpy-impl: compiles FollyMemcpy.cpp. When IS_AARCH64_ARCH is true the
# generator expressions additionally add memcpy_select_aarch64.cpp and a
# dependency on folly_external_aor_memcpy_aarch64; on other architectures
# both expressions expand to an empty string.
folly_add_library(
NAME memcpy-impl
SRCS
FollyMemcpy.cpp
$<$<BOOL:${IS_AARCH64_ARCH}>:memcpy_select_aarch64.cpp>
DEPS
$<$<BOOL:${IS_AARCH64_ARCH}>:folly_external_aor_memcpy_aarch64>
)

# memcpy: target with no sources of its own; it lists FollyMemcpy.h as its
# header and names folly_memcpy-impl under EXPORTED_DEPS.
# NOTE(review): EXPORTED_DEPS presumably propagates the dependency to
# consumers of this target -- confirm against the folly_add_library definition.
folly_add_library(
NAME memcpy
HEADERS
FollyMemcpy.h
EXPORTED_DEPS
folly_memcpy-impl
)

# memcpy-use: separate build of the same sources listed for memcpy-impl.
# Every $<$<BOOL:${IS_AARCH64_ARCH}>:...> entry below expands to its payload
# only when IS_AARCH64_ARCH is true, so the extra source, dependency and
# define apply to AArch64 builds only.
# NOTE(review): EXCLUDE_FROM_MONOLITH presumably keeps this target out of the
# combined folly library -- confirm against the folly_add_library definition.
folly_add_library(
NAME memcpy-use
EXCLUDE_FROM_MONOLITH
SRCS
FollyMemcpy.cpp
$<$<BOOL:${IS_AARCH64_ARCH}>:memcpy_select_aarch64.cpp>
DEPS
# AArch64 only: links the "-use" flavour of the external AOR memcpy target.
$<$<BOOL:${IS_AARCH64_ARCH}>:folly_external_aor_memcpy_aarch64-use>
COMPILE_OPTIONS
# AArch64 only: defines FOLLY_MEMCPY_IS_MEMCPY for these sources.
# NOTE(review): the macro's effect is decided in the C++ sources -- verify there.
$<$<BOOL:${IS_AARCH64_ARCH}>:-DFOLLY_MEMCPY_IS_MEMCPY>
)

# x86 assembly memcpy implementation (not supported on MSVC)
Expand Down Expand Up @@ -767,27 +804,6 @@ folly_add_library(
folly_utility
)

# memset: target with no sources of its own; it lists FollyMemset.h as its
# header and names folly_memset-impl under EXPORTED_DEPS.
# NOTE(review): EXPORTED_DEPS presumably propagates the dependency to
# consumers of this target -- confirm against the folly_add_library definition.
folly_add_library(
NAME memset
HEADERS
FollyMemset.h
EXPORTED_DEPS
folly_memset-impl
)

# memset-impl: compiles FollyMemset.cpp unconditionally, with no
# architecture-specific sources or dependencies.
folly_add_library(
NAME memset-impl
SRCS
FollyMemset.cpp
)

# memset-use: second build of FollyMemset.cpp, flagged EXCLUDE_FROM_MONOLITH.
# NOTE(review): the flag presumably keeps this target out of the combined
# folly library -- confirm against the folly_add_library definition.
folly_add_library(
NAME memset-use
EXCLUDE_FROM_MONOLITH
SRCS
FollyMemset.cpp
)

folly_add_library(
NAME micro_lock
SRCS
Expand Down
88 changes: 37 additions & 51 deletions folly/random/xoshiro256pp.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <array>
#include <cstdint>
#include <cstring>
#include <limits>
#include <ostream>
#include <random>
Expand All @@ -31,17 +32,7 @@

namespace folly {

namespace detail {
#if defined(__AVX2__) && defined(__GNUC__)
using DefaultVectorType = __v4du; // GCC-specific unsigned vector type
#else
using DefaultVectorType = uint64_t; // Fallback for other compilers
#endif
} // namespace detail

using DefaultVectorType = detail::DefaultVectorType;

template <typename ResType, typename VectorType = DefaultVectorType>
template <typename ResType>
class xoshiro256pp {
public:
using result_type = ResType;
Expand Down Expand Up @@ -78,10 +69,10 @@ class xoshiro256pp {
}

void seed(uint64_t pSeed = default_seed) noexcept {
uint64_t seed = pSeed;
for (uint64_t re = 0; re < VecResCount; re++) {
for (uint64_t stat = 0; stat < StateSize; stat++) {
state[re][stat] = seed_vec<vector_type>(seed);
uint64_t seed_val = pSeed;
for (uint64_t result_idx = 0; result_idx < VecResCount; result_idx++) {
for (uint64_t state_idx = 0; state_idx < StateSize; state_idx++) {
state[idx(state_idx, result_idx)] = splitmix64(seed_val);
}
}
cur = ResultCount;
Expand All @@ -95,35 +86,25 @@ class xoshiro256pp {
}

private:
using vector_type = VectorType;
using vector_type = uint64_t;
static constexpr uint64_t StateSize = 4;
static constexpr uint64_t VecResCount = 8;
static constexpr uint64_t ResultCount =
VecResCount * (sizeof(vector_type) / sizeof(result_type));
union {
vector_type vecRes[VecResCount]{};
result_type res[ResultCount];
};
vector_type state[VecResCount][StateSize]{};
#if FOLLY_AARCH64
static constexpr uint64_t VecResCount = 16;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This, plus the elimination of the vector_type choice, means it's actually cutting the buffer size in half under x86_64, which we don't want.

Any idea what the auto-vectorization looks like for this new version on x86_64? IIRC the original was a bit annoying to get right when I first wrote this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what autovectorisation looks like, but I believe Lukas had mentioned performance was better on x86_64 as well. I'll run some numbers to check and post back.

Would you rather we keep the old buffer size for x86_64 in any case?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll likely need to take another look at the buffer size after the final state of this diff lands. We should probably keep the old buffer size for x86_64, at least for now, since it's mostly going to depend on the width of the underlying vectors and how many vector ALU operations the CPU can execute per cycle.

#else
static constexpr uint64_t VecResCount = 32;
#endif
static constexpr uint64_t size_ratio = sizeof(vector_type) / sizeof(result_type);
static constexpr uint64_t ResultCount = VecResCount * size_ratio;

alignas(64) vector_type state[StateSize * VecResCount]{};

result_type res[ResultCount];
uint64_t cur = ResultCount;

template <typename Size, typename VType, typename CharT, typename Traits>
template <typename Size, typename CharT, typename Traits>
friend std::basic_ostream<CharT, Traits>& operator<<(
std::basic_ostream<CharT, Traits>& os,
const xoshiro256pp<Size, VType>& rng);

// Produces one seed value of type T from the running splitmix64 state `seed`
// (advanced in place by every splitmix64 call).
//  - T the same size as uint64_t: a single splitmix64 draw converted to T.
//  - otherwise: T is value-initialised and filled element by element through
//    operator[], one draw per 64-bit slot of vector_type.
template <typename T>
static inline T seed_vec(uint64_t& seed) {
  if constexpr (sizeof(T) == sizeof(uint64_t)) {
    return T(splitmix64(seed));
  } else {
    // Number of 64-bit slots to fill; derived from vector_type, not from T.
    constexpr uint64_t lane_count = sizeof(vector_type) / sizeof(uint64_t);
    T lanes{};
    for (uint64_t lane = 0; lane < lane_count; ++lane) {
      lanes[lane] = splitmix64(seed);
    }
    return lanes;
  }
}
const xoshiro256pp<Size>& rng);

static inline uint64_t splitmix64(uint64_t& cur) noexcept {
uint64_t z = (cur += 0x9e3779b97f4a7c15);
Expand All @@ -132,22 +113,27 @@ class xoshiro256pp {
return z ^ (z >> 31);
}

constexpr uint64_t FOLLY_ALWAYS_INLINE idx(uint64_t state_idx, uint64_t result_idx) noexcept {
return state_idx * VecResCount + result_idx;
}

// Rotates `x` left by `k` bit positions, treating the value as 64 bits wide.
// The complementary shift is (64 - k), so `k` must stay strictly between 0
// and 64 for both shifts to be well defined.
FOLLY_ALWAYS_INLINE static vector_type rotl(
    const vector_type x, int k) noexcept {
  // Bits moved towards the top ...
  const vector_type shifted_up = x << k;
  // ... and the top k bits that wrap around to the bottom.
  const vector_type wrapped = x >> (64 - k);
  return shifted_up | wrapped;
}

void calc() noexcept {
for (uint64_t i = 0; i < VecResCount; i++) {
auto& curState = state[i];
vecRes[i] = rotl(curState[0] + curState[3], 23) + curState[0];
const auto t = curState[1] << 17;
curState[2] ^= curState[0];
curState[3] ^= curState[1];
curState[1] ^= curState[2];
curState[0] ^= curState[3];
curState[2] ^= t;
curState[3] = rotl(curState[3], 45);
for (unsigned int i = 0; i < VecResCount; ++i) {
const vector_type vec_res = rotl(state[idx(0, i)] + state[idx(3, i)], 23) + state[idx(0, i)];
std::memcpy(&res[i * size_ratio], &vec_res, sizeof(vector_type));

const auto t = state[idx(1, i)] << 17;
state[idx(2, i)] ^= state[idx(0, i)];
state[idx(3, i)] ^= state[idx(1, i)];
state[idx(1, i)] ^= state[idx(2, i)];
state[idx(0, i)] ^= state[idx(3, i)];
state[idx(2, i)] ^= t;
state[idx(3, i)] = rotl(state[idx(3, i)], 45);
}
cur = 0;
}
Expand All @@ -160,10 +146,10 @@ class xoshiro256pp {
}
};

template <typename Size, typename VectorType, typename CharT, typename Traits>
template <typename Size, typename CharT, typename Traits>
std::basic_ostream<CharT, Traits>& operator<<(
std::basic_ostream<CharT, Traits>& os,
const xoshiro256pp<Size, VectorType>& rng) {
const xoshiro256pp<Size>& rng) {
for (auto i2 : rng.res) {
os << i2 << " ";
}
Expand Down
Loading