3#include <batmat/assume.hpp>
5namespace CYQLONE_NS(cyqlone) {
7template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
10 const index_t c =
ceil_p();
16 return a < 0 ? a + c : a;
18template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
21 const index_t c =
ceil_p();
27 return a >= c ? a - c : a;
29template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
32 auto ui =
static_cast<std::make_unsigned_t<index_t>
>(bi);
33 return static_cast<index_t
>(std::countr_zero(ui));
35template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
38 return bi == 0 ?
lp() :
ν2(bi);
41template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
44 const index_t N =
ceil_N();
50 return a >= N ? a - N : a;
52template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
55 const index_t N =
ceil_N();
61 return a < 0 ? a + N : a;
63template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
70 return a < 0 ? a +
p : a;
72template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
79 return a >=
p ? a -
p : a;
81template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
84 return tricyqle.sub_wrap_ceil_p(a, b);
86template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
89 return tricyqle.add_wrap_ceil_p(a, b);
91template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
94 const index_t c =
ceil_P();
99 return a < 0 ? a + c : a;
101template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
104 const index_t c =
ceil_P();
109 return a >= c ? a - c : a;
111template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
115 const auto levP =
lp();
117 return (((1 << levP) - 1) << (
lp() +
lv() - levP)) + (biA >> levP);
120template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
124template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
The main header for the Cyqlone and Tricyqle linear solvers.
constexpr index_t get_level(index_t i)
constexpr index_t get_index_in_level(index_t i)
index_t ceil_N() const
Horizon length, rounded up to a multiple of the number of parallel execution units.
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
constexpr index_t ceil_P() const
The number of parallel execution units P rounded up to the next power of two.
index_t add_wrap_ceil_N(index_t a, index_t b) const
Add b to a modulo ceil_N() (the horizon length N_horiz rounded up to a multiple of the number of parallel execution units).
index_t ν2(index_t i) const
2-adic valuation ν₂(i), computed as the number of trailing zero bits of i.
index_t sub_wrap_ceil_N(index_t a, index_t b) const
Subtract b from a modulo ceil_N() (the horizon length N_horiz rounded up to a multiple of the number of parallel execution units).
constexpr index_t lv() const
log₂(v), logarithm of the vector length.
index_t get_linear_batch_offset(index_t biA) const
index_t add_wrap_p(index_t a, index_t b) const
Add b to a modulo p.
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
index_t add_wrap_ceil_P(index_t a, index_t b) const
index_t sub_wrap_p(index_t a, index_t b) const
Subtract b from a modulo p.
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
const index_t p
Number of processors/threads.
tricyqle_t tricyqle
Block-tridiagonal solver (CR/PCR/PCG).
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads, rounded up.
index_t sub_wrap_ceil_P(index_t a, index_t b) const
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads p, rounded up.
index_t ν2(index_t i) const
2-adic valuation ν₂(i), computed as the number of trailing zero bits of i.
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
constexpr index_t ceil_p() const
The number of processors p rounded up to the next power of two.