cyqlone develop
Fast, parallel and vectorized solver for linear systems with optimal control structure.
Loading...
Searching...
No Matches
packing.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <cyqlone/config.hpp>
4#include <batmat/linalg/simdify.hpp>
5#include <batmat/linalg/structure.hpp>
6#include <batmat/linalg/uview.hpp>
7#include <batmat/loop.hpp>
8#include <batmat/lut.hpp>
9#include <batmat/ops/transpose.hpp>
10#include <guanaqo/mat-view.hpp>
11#include <guanaqo/trace.hpp>
12#include <type_traits>
13
14namespace cyqlone::linalg {
15
16using namespace batmat::linalg;
17
18/// @cond DETAIL
19
20namespace detail {
21
23
24#if 0
25template <class T>
27template <class T, StorageOrder O>
28using scalar_view =
29 batmat::matrix::View<T, index_t, scalar_simd_size<std::remove_cv_t<T>>, index_t, index_t, O>;
30#else
31template <class T, class D, class L, StorageOrder O>
32using scalar_view = batmat::matrix::View<T, index_t, index_constant<1>, D, L, O>;
33#endif
34
/// Copy data between a SIMD-batched matrix view @p A and a scalar (batch size
/// 1, column-major) view @p B, one column at a time, using v-by-v transposes
/// for full chunks and a LUT of remainder kernels for the last partial chunk.
///
/// The direction is encoded in const-ness (exactly one of TA/TB is const, see
/// the first static_assert): if @p TB is const, data flows B -> A ("pack"),
/// otherwise A -> B ("unpack"). @p Struc restricts the copy to the lower or
/// upper triangle when the matrix is known to be triangular.
///
/// NOTE(review): this listing was extracted from generated documentation and
/// is missing some original source lines (22, 26, 50, 68) — e.g. the body of
/// the remainder-LUT lambda and the `batmat::foreach_chunked(` call header —
/// so the code below is incomplete as shown; recover them from the repository.
35template <class TA, class Abi, MatrixStructure Struc = MatrixStructure::General, StorageOrder OA,
36 class TB, class DB, class LB, StorageOrder OB>
37 requires(OB == StorageOrder::ColMajor)
38inline void unpack_full(view<TA, Abi, OA> A, scalar_view<TB, DB, LB, OB> B) {
// Exactly one side is const: that side is the source of the copy.
39 static_assert(std::is_const_v<TA> ^ std::is_const_v<TB>);
// B must be a scalar (non-batched) view.
40 static_assert(typename decltype(B)::batch_size_type() == 1);
// Pack == true: read from scalar B, write into batched A.
41 static constexpr bool Pack = std::is_const_v<TB>;
42 BATMAT_ASSERT(B.rows() == A.rows());
43 BATMAT_ASSERT(B.cols() == A.cols());
44 BATMAT_ASSERT(static_cast<index_t>(B.depth()) == static_cast<index_t>(A.depth()));
45 GUANAQO_TRACE("unpack", 0, A.rows() * A.cols() * A.depth());
46 using enum MatrixStructure;
47 using T = std::remove_const_t<TA>;
// v is the SIMD batch size of A (number of interleaved matrices).
48 static constexpr index_t v = typename decltype(A)::batch_size_type();
// Compile-time table of remainder kernels, indexed by (rows remaining - 1).
// NOTE(review): the lambda body (original line 50) is missing from this
// extraction; presumably it returns a transpose kernel for R rows — confirm.
49 static constexpr auto lut = batmat::make_1d_lut<v>([]<index_t R>(index_constant<R>) {
51 });
// Strides of A are expressed in batches; scale by v to get element strides.
52 const auto cstrA = A.col_stride() * v;
53 const auto rstrA = A.row_stride() * v;
54 const auto cstrB = B.col_stride();
55 const auto rstrB = B.row_stride();
56 const auto bstrB = B.layer_stride();
// Scalar B must be contiguous along rows (unit row stride, known statically).
57 static_assert(rstrB == 1);
58 TA *pA = A.data();
59 const auto pAend = pA + A.cols() * cstrA;
60 TB *pB = B.data();
// Number of rows to touch in the current column: shrinks per column for
// lower-triangular, grows per column for upper-triangular, fixed otherwise.
61 auto inner_count = Struc == LowerTriangular ? std::max(A.rows(), A.cols())
62 : Struc == UpperTriangular ? index_t{1}
63 : A.rows();
64 using std::clamp;
// Walk the columns of A; pA/pB track the start of the current column.
65 while (pA < pAend) {
66 TA *pA_ = pA;
67 TB *pB_ = pB;
// NOTE(review): original line 68 (the `batmat::foreach_chunked(` call
// opening) is missing from this extraction; the arguments below belong to it.
// Full chunks use the static v-by-v transpose; the remainder (nr < v rows)
// dispatches through the LUT. Argument order selects the copy direction.
69 0, clamp(inner_count, index_t{0}, A.rows()), v,
70 [&](index_t) {
71 if constexpr (Pack)
72 batmat::ops::transpose<v, v>(pB_, bstrB, pA_, rstrA);
73 else
74 batmat::ops::transpose<v, v>(pA_, rstrA, pB_, bstrB);
75 pA_ += v * rstrA;
76 pB_ += v * rstrB;
77 },
78 [&](index_t, index_t nr) {
79 if constexpr (Pack)
80 lut[nr - 1](pB_, bstrB, pA_, rstrA);
81 else
82 lut[nr - 1](pA_, rstrA, pB_, bstrB);
83 });
// Advance to the next column.
84 pA += cstrA;
85 pB += cstrB;
// Triangular bookkeeping: for the lower triangle, one fewer row per column
// and (once below the top row) step past the diagonal element; for the
// upper triangle, one more row per column.
86 if constexpr (Struc == LowerTriangular) {
87 --inner_count;
88 if (inner_count < A.rows()) {
89 pA += rstrA;
90 pB += rstrB;
91 }
92 } else if (Struc == UpperTriangular) {
93 ++inner_count;
94 }
95 }
96}
97
98template <class TA, class Abi, MatrixStructure Struc = MatrixStructure::General, StorageOrder OA,
99 class TB, class DB, class LB, StorageOrder OB>
100 requires(OB == StorageOrder::RowMajor)
101inline void unpack_full(view<TA, Abi, OA> A, scalar_view<TB, DB, LB, OB> B) {
102 return unpack_full<TA, Abi, transpose(Struc)>(A.transposed(), B.transposed());
103}
104
105// TODO: mismatched storage order is currently not supported.
106
107/// @todo Test this function. Then benchmark it against unpack_full: if this variant is not much
108/// slower, we should consider using it for all unpacking so we don't need two variants.
/// Unpack only as many layers of batched @p A as fit in the scalar view @p B.
/// If @p B has at least A.depth() layers, this reduces to a full unpack of
/// those layers; otherwise each v-by-v tile is transposed with a runtime depth
/// of B.depth() (transpose_dyn), so trailing batch lanes are simply skipped.
///
/// NOTE(review): extracted from generated documentation; original source lines
/// 118 and 120 are missing (the make_1d_lut lambda and the foreach_chunked
/// call header), so the code below is incomplete as shown.
109template <class T, class Abi, StorageOrder OA, class DB, class LB, StorageOrder OB>
110 requires(OA == OB)
111inline void unpack_partial(view<const T, Abi, OA> A, scalar_view<T, DB, LB, OB> B) {
// Fast path: B can hold every layer of A, so no per-tile depth limiting needed.
112 if (B.depth() >= A.depth())
113 return unpack_full(A, B.first_layers(A.depth()));
114 static_assert(OA == StorageOrder::ColMajor); // TODO: row major
115 GUANAQO_TRACE("unpack", 0, A.rows() * A.cols() * A.depth());
116 static constexpr index_t v = A.batch_size();
// Remainder kernels for the last partial row chunk; the generating lambda
// (original line 118) is missing from this extraction.
117 static constexpr auto lut = batmat::make_1d_lut<v>(
// Column-by-column; within each column, rows are processed in chunks of v.
// NOTE(review): the `foreach_chunked(` call opening (original line 120) is
// missing here; the arguments below belong to it.
119 for (index_t c = 0; c < A.cols(); ++c)
121 0, A.rows(), v,
122 [&](index_t r) {
// Full chunk: transpose a v-by-v tile but only write B.depth() output layers.
123 batmat::ops::transpose_dyn<v, v>(&A(0, r, c), v, &B(0, r, c), B.layer_stride(),
124 B.depth());
125 },
126 [&](index_t r, index_t nr) {
// Remainder chunk of nr (< v) rows, dispatched through the LUT.
127 lut[nr - 1](A.block(r, c, nr, 1).data, v, B.block(r, c, nr, 1).data,
128 B.layer_stride(), B.depth());
129 });
130}
131
132} // namespace detail
133
134/// @endcond
135
136/// @addtogroup topic-linalg
137/// @{
138
139/// @name Packing and unpacking
140/// @{
141
142/// Copy a compact batch of matrices @p A to multiple scalar matrices @p B.
143/// @post A(l, r, c) == B(l, r, c) for all valid l, r, c.
144template <simdifiable VA, class VB>
145 requires(std::is_same_v<simdified_value_t<VA>, typename std::remove_cvref_t<VB>::value_type> &&
146 typename std::remove_cvref_t<VB>::batch_size_type() == 1)
147void unpack(VA &&A, VB &&B) {
148 detail::unpack_full<const simdified_value_t<VA>, simdified_abi_t<VA>>(
149 simdify(A).as_const(), B.first_layers(A.depth()));
150}
151
152/// Copy multiple scalar matrices @p A to a compact batch of matrices @p B.
153/// @post A(l, r, c) == B(l, r, c) for all valid l, r, c.
154template <class VA, simdifiable VB>
155 requires(std::is_same_v<typename std::remove_cvref_t<VA>::value_type, simdified_value_t<VB>> &&
156 typename std::remove_cvref_t<VA>::batch_size_type() == 1)
157void pack(VA &&A, VB &&B) {
158 detail::unpack_full<simdified_value_t<VB>, simdified_abi_t<VB>>(
159 simdify(B), A.first_layers(B.depth()).as_const());
160}
161
162/// @}
163
164/// @}
165
166} // namespace cyqlone::linalg
#define BATMAT_ASSERT(x)
std::ptrdiff_t index_t
void unpack(VA &&A, VB &&B)
Copy a compact batch of matrices A to multiple scalar matrices B.
Definition packing.hpp:147
void clamp(Vx &&x, Vlo &&lo, Vhi &&hi, Vz &&z)
Elementwise clamping z = max(lo, min(x, hi)).
Definition linalg.hpp:325
void pack(VA &&A, VB &&B)
Copy multiple scalar matrices A to a compact batch of matrices B.
Definition packing.hpp:157
void transpose(const T *pa, index_t lda, T *pb, index_t ldb)
void transpose_dyn(const T *pa, index_t lda, T *pb, index_t ldb, index_t d=R)
void foreach_chunked(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, auto func_rem, LoopDir dir=LoopDir::Forward)
consteval auto make_1d_lut(F f)
#define GUANAQO_TRACE(name, instance,...)
stdx::simd_size< Tp, Abi > simd_size
typename detail::simdified_value< V >::type simdified_value_t
typename detail::simdified_abi< V >::type simdified_abi_t
constexpr auto simdify(simdifiable auto &&a) -> simdified_view_t< decltype(a)>
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
constexpr auto rows(const Matrix< T, I, S, D, O, A > &v)
std::integral_constant< index_t, I > index_constant
constexpr index_t v