4#include <batmat/linalg/simdify.hpp>
5#include <batmat/linalg/structure.hpp>
6#include <batmat/linalg/uview.hpp>
7#include <batmat/loop.hpp>
8#include <batmat/lut.hpp>
9#include <batmat/ops/transpose.hpp>
10#include <guanaqo/mat-view.hpp>
11#include <guanaqo/trace.hpp>
16using namespace batmat::linalg;
27template <
class T, StorageOrder O>
29 batmat::matrix::View<T, index_t, scalar_simd_size<std::remove_cv_t<T>>,
index_t,
index_t, O>;
// Alias template for a `batmat::matrix::View` over elements of type `T`, using
// `index_t` as the second template argument and the compile-time constant
// `index_constant<1>` as the third, with `D`, `L` and the storage order `O`
// forwarded unchanged.
//
// NOTE(review): the third argument presumably is the view's batch size (other
// code in this file compares `batch_size_type()` against 1 for such views), so
// this would describe a non-SIMD, one-matrix-per-layer view — confirm against
// the `View` declaration.
// NOTE(review): `D` and `L` presumably are the depth and layer-stride types —
// confirm against the `View` declaration.
// NOTE(review): the leading `31` / `32` on the lines below look like line
// numbers left over from a source-listing extraction, not part of the code.
31template <
class T,
class D,
class L, StorageOrder O>
32using scalar_view = batmat::matrix::View<T, index_t, index_constant<1>, D, L, O>;
37 requires(OB == StorageOrder::ColMajor)
39 static_assert(std::is_const_v<TA> ^ std::is_const_v<TB>);
40 static_assert(
typename decltype(B)::batch_size_type() == 1);
41 static constexpr bool Pack = std::is_const_v<TB>;
47 using T = std::remove_const_t<TA>;
48 static constexpr index_t v =
typename decltype(A)::batch_size_type();
52 const auto cstrA = A.col_stride() *
v;
53 const auto rstrA = A.row_stride() *
v;
54 const auto cstrB = B.col_stride();
55 const auto rstrB = B.row_stride();
56 const auto bstrB = B.layer_stride();
57 static_assert(rstrB == 1);
59 const auto pAend = pA + A.cols() * cstrA;
61 auto inner_count = Struc ==
LowerTriangular ? std::max(A.rows(), A.cols())
69 0,
clamp(inner_count, index_t{0}, A.rows()),
v,
80 lut[nr - 1](pB_, bstrB, pA_, rstrA);
82 lut[nr - 1](pA_, rstrA, pB_, bstrB);
88 if (inner_count < A.rows()) {
100 requires(OB == StorageOrder::RowMajor)
102 return unpack_full<TA, Abi, transpose(Struc)>(A.transposed(), B.transposed());
109template <
class T,
class Abi, StorageOrder OA,
class DB,
class LB, StorageOrder OB>
112 if (B.depth() >= A.depth())
113 return unpack_full(A, B.first_layers(A.depth()));
114 static_assert(OA == StorageOrder::ColMajor);
116 static constexpr index_t v = A.batch_size();
119 for (index_t c = 0; c < A.cols(); ++c)
123 batmat::ops::transpose_dyn<v, v>(&A(0, r, c), v, &B(0, r, c), B.layer_stride(),
126 [&](index_t r, index_t nr) {
127 lut[nr - 1](A.block(r, c, nr, 1).data, v, B.block(r, c, nr, 1).data,
128 B.layer_stride(), B.depth());
144template <simdifiable VA,
class VB>
145 requires(std::is_same_v<simdified_value_t<VA>,
typename std::remove_cvref_t<VB>::value_type> &&
146 typename std::remove_cvref_t<VB>::batch_size_type() == 1)
149 simdify(A).as_const(), B.first_layers(A.depth()));
154template <
class VA, simdifiable VB>
156 typename std::remove_cvref_t<VA>::batch_size_type() == 1)
159 simdify(B), A.first_layers(B.depth()).as_const());
void unpack(VA &&A, VB &&B)
Copy a compact batch of matrices A to multiple scalar matrices B.
void clamp(Vx &&x, Vlo &&lo, Vhi &&hi, Vz &&z)
Elementwise clamping z = max(lo, min(x, hi)).
void pack(VA &&A, VB &&B)
Copy multiple scalar matrices A to a compact batch of matrices B.
void transpose(const T *pa, index_t lda, T *pb, index_t ldb)
void transpose_dyn(const T *pa, index_t lda, T *pb, index_t ldb, index_t d=R)
void foreach_chunked(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, auto func_rem, LoopDir dir=LoopDir::Forward)
consteval auto make_1d_lut(F f)
#define GUANAQO_TRACE(name, instance,...)
stdx::simd_size< Tp, Abi > simd_size
typename detail::simdified_value< V >::type simdified_value_t
typename detail::simdified_abi< V >::type simdified_abi_t
constexpr auto simdify(simdifiable auto &&a) -> simdified_view_t< decltype(a)>
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
constexpr auto rows(const Matrix< T, I, S, D, O, A > &v)
std::integral_constant< index_t, I > index_constant