5#include <batmat/assume.hpp>
6#include <batmat/linalg/gemm.hpp>
7#include <batmat/linalg/gemv.hpp>
8#include <batmat/linalg/potrf.hpp>
9#include <batmat/linalg/shift.hpp>
10#include <batmat/linalg/trsm.hpp>
12namespace CYQLONE_NS(cyqlone) {
14using namespace linalg;
15using namespace batmat::linalg;
22template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
36template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
49template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
57 if (
ν2p(i_prev) >
ν2p(i_next)) {
70template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
72 const index_t offset = 1 << l;
78 const bool factor_next =
ν2p(i) == l + 1;
79 if constexpr (
v == 1) {
90 auto U =
cr_U.batch(iU);
106 auto Y =
cr_Y.batch(iY);
114 auto U =
cr_U.batch(iU), Y =
cr_Y.batch(iY);
123 if (factor_next && i != 0) {
140 else if constexpr (
v > 1)
146 if (factor_next && i == 0) {
162template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
164 index_t stride)
const {
165 if constexpr (
v == 1)
169 const index_t diU = iU * stride, diL = iL * stride;
176template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
179 if constexpr (
v == 1)
183 const index_t diY = iY * stride;
189template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
191 view<> w, index_t stride)
const {
192 const index_t diL = iL * stride;
202 if (
ν2p(iL) == l + 1 && iL != 0) {
209template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
212 index_t stride)
const {
213 if constexpr (
v == 1)
217 const index_t diL = iL * stride;
224template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
226 index_t stride)
const {
227 if constexpr (
v == 1)
231 const index_t diL = iL * stride, diY = iY * stride;
232 auto Y =
cr_Y.batch(iY);
240template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
242 index_t stride)
const {
243 const index_t diL = iL * stride;
258template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
259template <StorageOrder O>
261 if (!
params.enable_prefetching)
263 const auto inner_stride = std::max<index_t>(64 /
sizeof(
value_type) /
v, 1);
264 if constexpr (O == StorageOrder::RowMajor)
265 for (index_t r = 0; r < X.
rows(); ++r)
267 __builtin_prefetch(&X(0, r, c), 0, 2);
269 for (index_t c = 0; c < X.
cols(); ++c)
271 __builtin_prefetch(&X(0, r, c), 0, 2);
274template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
275template <StorageOrder O>
277 if (!
params.enable_prefetching)
279 const auto inner_stride = std::max<index_t>(64 /
sizeof(
value_type) /
v, 1);
280 if constexpr (O == StorageOrder::RowMajor)
281 for (index_t r = 0; r < X.
rows(); ++r)
283 __builtin_prefetch(&X(0, r, c), 0, 2);
285 for (index_t c = 0; c < X.
cols(); ++c)
287 __builtin_prefetch(&X(0, r, c), 0, 2);
290template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
296template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
299 if (
v == 1 && iU >=
p)
305template <index_t VL,
class T, StorageOrder DefaultOrder,
class Ctx>
307 if (
v == 1 && iY + (1 << l) >=
p && !
circular)
The main header for the Cyqlone and Tricyqle linear solvers.
void syrk_sub(VA &&A, Structured< VC, SD > C, Structured< VD, SD > D, Opts... opts)
void trsm(Structured< VA, SA > A, VB &&B, VD &&D, with_rotate_B_t< RotB >={})
void gemm_neg(VA &&A, VB &&B, VD &&D, TilingOptions packing={}, Opts... opts)
void gemv(VA &&A, VB &&B, VD &&D, Opts... opts)
void syrk_sub_potrf(VA &&A, Structured< VC, SC > C, Structured< VD, SC > D, simdified_value_t< VA > regularization=0)
void potrf(Structured< VC, SC > C, Structured< VD, SC > D, simdified_value_t< VC > regularization=0)
void gemv_sub(VA &&A, VB &&B, VC &&C, VD &&D, Opts... opts)
void sub(VA &&A, VB &&B, VC &&C, with_rotate_t< Rotate >={})
Subtract two matrices or vectors C = A - B. Rotate affects B.
constexpr auto tril(M &&m)
#define GUANAQO_TRACE(name, instance,...)
constexpr with_rotate_D_t< I > with_rotate_D
constexpr with_rotate_t< I > with_rotate
constexpr with_rotate_B_t< I > with_rotate_B
constexpr with_rotate_C_t< I > with_rotate_C
constexpr index_type cols() const
constexpr index_type rows() const
batch_view_type batch(index_type b) const
void factor_L(index_t l, index_t i)
Update and factorize a block L in the Cholesky factor for CR level l+1 and column index i,...
void prefetch_L(batch_view< O > X) const
void solve_y_forward(index_t l, index_t iY, mut_view<> λ, mut_view<> w, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iY of λ at le...
void solve_u_backward(index_t l, index_t iU, mut_view<> λ, mut_view<> w, index_t stride) const
batmat::matrix::View< const value_type, index_t, vl_t, index_t, index_t, O > view
Non-owning immutable view type for matrix.
batmat::matrix::View< value_type, index_t, vl_t, index_t, index_t, O > mut_view
Non-owning mutable view type for matrix.
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
bool circular
Whether the block-tridiagonal system is circular (nonzero top-right & bottom-left corners).
matrix< default_order > pcr_L
Diagonal blocks of the PCR Cholesky factorizations.
matrix< default_order > cr_Y
Subdiagonal blocks Y of the Cholesky factor of the Schur complement (used during CR).
void solve_λ_backward(index_t biL, mut_view<> λ, view<> w, index_t stride) const
void solve_λ_forward(index_t l, index_t iL, mut_view<> λ, view<> w, index_t stride) const
Apply the updates to block iL of the right-hand side from solve_u_forward and solve_y_forward,...
void solve_y_backward(index_t l, index_t iY, mut_view<> λ, index_t stride) const
void factor_U(index_t l, index_t iU)
Compute a block U in the Cholesky factor for the given CR level l and column index iU.
batmat::matrix::View< const value_type, index_t, vl_t, vl_t, layer_stride, O > batch_view
Non-owning immutable view type for a single batch of v matrices.
static constexpr index_t v
Vector length.
void prefetch_U(index_t l, index_t iU) const
void prefetch(batch_view< O > X) const
[Cyqlone solve CR helper]
Params params
Solver parameters for Tricyqle-specific settings.
void solve_u_forward(index_t l, index_t iU, mut_view<> λ, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iU of λ at le...
const index_t p
Number of processors/threads.
void update_K(index_t l, index_t i)
Compute a subdiagonal block K of the Schur complement for CR level l+1 and column index i,...
matrix< default_order > cr_U
Subdiagonal blocks U of the Cholesky factor of the Schur complement (used during CR).
matrix< default_order > cr_L
Diagonal blocks of the Cholesky factor of the Schur complement (used during CR).
void prefetch_Y(index_t l, index_t iY) const
void factor_Y(index_t l, index_t iY)
Compute a block Y in the Cholesky factor for the given CR level l and column index iY.
#define CYQ_TRACE_WRITE(...)
#define CYQ_TRACE_READ(...)
#define BATMAT_UNROLLED_IVDEP_FOR(N,...)