9#include <batmat/config.hpp>
11#include <batmat/openmp.h>
13#include <batmat/thread-pool.hpp>
15#include <guanaqo/trace.hpp>
24template <
class SC = SharedContext>
32#if GUANAQO_WITH_TRACING
42#if !BATMAT_WITH_OPENMP
54 static_assert(std::is_same_v<
decltype(
barrier.spin_count),
decltype(spin_count)>);
55 return std::exchange(
barrier.spin_count, spin_count);
66#if GUANAQO_WITH_TRACING && !GUANAQO_WITH_PERFETTO
68 using token_t =
typename shared_context_type::barrier_type::arrival_token;
73 using arrival_token =
typename shared_context_type::barrier_type::arrival_token;
92#if GUANAQO_WITH_TRACING && !GUANAQO_WITH_PERFETTO
94 return {
shared.barrier.arrive(
static_cast<uint32_t
>(
index)), std::move(trace)};
96 return shared.barrier.arrive(
static_cast<uint32_t
>(
index));
101#if GUANAQO_WITH_TRACING && !GUANAQO_WITH_PERFETTO
102 auto trace = std::move(token.trace);
103 shared.barrier.wait(std::move(token.token));
105 shared.barrier.wait(std::move(token));
113#if !GUANAQO_WITH_PERFETTO
116 shared.barrier.arrive_and_wait(
static_cast<uint32_t
>(
index));
123#if !GUANAQO_WITH_PERFETTO
126 shared.barrier.arrive_and_wait(
static_cast<uint32_t
>(
index), line);
132 return shared.barrier.broadcast(
static_cast<uint32_t
>(
index), std::move(x),
133 static_cast<uint32_t
>(src));
138 template <
class F,
class... Args>
140 using T = std::invoke_result_t<F, Args...>;
145 return broadcast(std::invoke(std::forward<F>(f), std::forward<Args>(args)...), 0);
153 template <
class T,
class F>
155 return shared.barrier.arrive_reduce(
static_cast<uint32_t
>(
index), std::move(x),
162 T
wait_reduce(shared_context_type::barrier_type::template arrival_token_typed<T> &&token) {
163 return shared.barrier.wait_reduce(std::move(token));
168 template <
class T,
class F>
170 return shared.barrier.reduce(
static_cast<uint32_t
>(
index), std::move(x), std::move(func));
177 return reduce(std::move(x), std::plus<>{});
185 shared.barrier.arrive_and_wait_with_completion(
static_cast<uint32_t
>(
index),
192#if !BATMAT_WITH_OPENMP
193 thread_pool.sync_run_n(
num_thr, [
this, &f](index_t i, index_t) {
199 for (index_t i = 0; i <
num_thr; ++i) {
Barrier synchronization primitive.
Fairly vanilla combining tree barrier.
TraceLogger & get_trace_logger()
#define GUANAQO_TRACE(name, instance,...)
#define GUANAQO_TRACE_INSTANT(name, instance)
No-op completion function for the TreeBarrier.
Thread context for parallel execution.
T broadcast(T x, index_t src=0)
Broadcast a value x from the thread with index src to all threads.
T reduce(T x)
Reduction with std::plus, i.e., summation across all threads.
auto call_broadcast(F &&f, Args &&...args) -> std::invoke_result_t< F, Args... >
Call a function f with the given args on a single thread and broadcast the return value to all threads.
void run_single_sync(F &&f)
Wait for all threads to reach this point, then run the given function on a single thread before releasing the waiting threads.
void arrive_and_wait()
Arrive at the barrier and wait for the barrier phase to complete.
bool is_master() const
Check if this thread is the master thread (thread index 0).
typename shared_context_type::barrier_type::arrival_token arrival_token
auto arrive_reduce(T x, F func)
Perform a reduction of x across all threads using the given binary function func.
T wait_reduce(shared_context_type::barrier_type::template arrival_token_typed< T > &&token)
Wait for the reduction initiated by arrive_reduce() to complete and obtain the reduced value.
void arrive_and_wait(int line)
Debug version of arrive_and_wait() that performs a sanity check to ensure that all threads are arriving at the same line.
T reduce(T x, F func)
Perform a reduction of x across all threads using the given binary function func, and wait for the result.
void wait(arrival_token &&token)
Await a token returned by arrive(), waiting for the barrier phase to complete.
friend constexpr bool operator==(const Context &a, const Context &b)
arrival_token arrive()
Arrive at the barrier and obtain a token that can be used to wait for completion of the current barrier phase.
shared_context_type & shared
Abstraction for a parallel execution context: a set of threads that can synchronize and communicate with each other.
void run(F &&)
Execute the given function in parallel on all threads, blocking until completion.
uint32_t set_barrier_spin_count(uint32_t spin_count)
Configure the barrier spin count used in parallel synchronization before falling back to a futex wait.
EmptyCompletion completion_type
TreeBarrier< completion_type, uint16_t > barrier_type
ScopedLog trace(const char *name, int64_t instance, int64_t flop_count=-1)