cyqlone develop
Fast, parallel and vectorized solver for linear systems with optimal control structure.
Loading...
Searching...
No Matches
sparse.tpp
Go to the documentation of this file.
1#include <cyqlone/cyqlone.hpp>
2
3#include <batmat/assume.hpp>
4#include <guanaqo/blas/hl-blas-interface.hpp>
5#include <cmath>
6#include <limits>
7
8#include <batmat/linalg/copy.hpp>
9#include <batmat/linalg/trtri.hpp>
10
11namespace CYQLONE_NS(cyqlone) {
12using namespace batmat::linalg;
13
14template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
16 std::span<const value_type> Σ) const
17 -> SparseMatrix {
20 using std::sqrt;
21 const index_t nux = nu + nx, nuxx = nux + nx;
22 // stride between stages in the same interval
23 const index_t stage_stride = nuxx;
24 // stride between intervals in the thread partitioning
25 const index_t interval_stride = nuxx * n - nx;
26 // stride between vector lanes (largest stride)
27 const index_t vector_stride = p * interval_stride;
28 // total matrix size
29 const index_t nn = ceil_N() * nuxx;
30 // first index of the CR block
31 const index_t sλ = nn - (nx * p * v);
32
33 SparseMatrixBuilder mat{.rows = nn, .cols = nn, .symmetry = Symmetry::Unsymmetric};
34
35 // Add the ALM penalty terms to the cost Hessians
36 batmat::matrix::Matrix<value_type, index_t> H{{.depth = N_horiz, .rows = nux, .cols = nux}};
37 const auto nyM = std::max(ny, ny_0 + ny_N);
38 batmat::matrix::Matrix<value_type, index_t> DC{{.depth = N_horiz, .rows = nyM, .cols = nux}};
39 auto R = H.top_left(nu, nu);
40 auto Q = H.bottom_right(nx, nx);
41 auto Sᵀ = H.bottom_left(nx, nu);
42 for (index_t j = 0; j < N_horiz; ++j) {
43 H(j) = ocp.data_H(j);
44 if (j > 0) {
45 DC.top_rows(ny)(j) = ocp.data_G(j - 1);
46 for (index_t r = 0; r < ny; ++r)
47 for (index_t c = 0; c < nux; ++c)
48 DC(j, r, c) *= sqrt(Σ[ny_0 + (j - 1) * ny + r]);
50 } else {
51 DC.top_rows(ny_0 + ny_N)(j) = ocp.data_G0N(0);
52 for (index_t r = 0; r < ny_0; ++r)
53 for (index_t c = 0; c < nu; ++c)
54 DC(0, r, c) *= sqrt(Σ[r]);
55 for (index_t r = 0; r < ny_N; ++r)
56 for (index_t c = 0; c < nx; ++c)
57 DC(0, ny_0 + r, nu + c) *= sqrt(Σ[ny_0 + (N_horiz - 1) * ny + r]);
59 }
60 }
61
62 // Populate the sparse matrix
63 for (index_t l = 0; l < v; ++l) { // vector lane
64 for (index_t c = 0; c < p; ++c) {
65 const index_t j0 = c * n + l * p * n;
66 const auto biA = c + l * p;
67 const auto biI = sub_wrap_ceil_P(biA, 1);
68 const auto sλA = sλ + nx * get_linear_batch_offset(biA);
69 const auto sλI = sλ + nx * get_linear_batch_offset(biI);
70 // TODO: handle case if lev > or >= lp()
71 for (index_t i = 0; i < n; ++i) {
72 const index_t j = sub_wrap_ceil_N(j0, i);
73 // index of current diagonal block
74 index_t s = l * vector_stride + c * interval_stride + stage_stride * i;
75 // Padding
76 if (j >= N_horiz) {
77 mat.add_diag(s, s, 1, nux); // H = I
78 i + 1 == n ? mat.add_diag(sλI, s + nu, -1, nx) // E = -I
79 : mat.add_diag(s + nux, s + nu, -1, nx); // E = -I
80 continue;
81 }
82 // H
83 mat.add(s, s, R(j), 1, LowerTriangular);
84 mat.add(s + nu, s + nu, Q(j), 1, LowerTriangular);
85 mat.add(s + nu, s, Sᵀ(j));
86 // beginning of subinterval: coupling constraints at the bottom (row sλA)
87 if (i == 0) {
88 mat.add(sλA, s, ocp.data_F(j).left_cols(nu)); // B(j)
89 j > 0 ? mat.add(sλA, s + nu, ocp.data_F(j).right_cols(nx)) // A(j)
90 : void(); // A(0) = 0
91 } else {
92 mat.add(s, s - nx, ocp.data_F(j).left_cols(nu).transposed()); // B(j)
93 mat.add(s + nu, s - nx, ocp.data_F(j).right_cols(nx).transposed()); // A(j)
94 }
95 // end of subinterval: coupling constraints at the bottom (row sλI)
96 i + 1 == n ? mat.add_diag(sλI, s + nu, -1, nx) // E = -I
97 : mat.add_diag(s + nux, s + nu, -1, nx); // E = -I
98 }
99 }
100 }
101 return std::move(mat).build();
102}
103
104template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
106 value_type scale_b) const
107 -> std::vector<T> {
109 const index_t nux = nu + nx, nuxx = nux + nx;
110 // stride between stages in the same interval
111 const index_t stage_stride = nuxx;
112 // stride between intervals in the thread partitioning
113 const index_t interval_stride = nuxx * n - nx;
114 // stride between vector lanes (largest stride)
115 const index_t vector_stride = p * interval_stride;
116 // total vector size
117 const index_t nn = ceil_N() * nuxx;
118 // first index of the CR block
119 const index_t sλ = nn - (nx * p * v);
120
121 std::vector<value_type> rhs(nn);
122 std::ranges::fill(rhs, std::numeric_limits<value_type>::quiet_NaN());
123
124 for (index_t l = 0; l < v; ++l) {
125 for (index_t t = 0; t < p; ++t) {
126 const index_t di0 = t * n;
127 for (index_t i = 0; i < n; ++i) {
128 const index_t di = di0 + i;
129 index_t s = l * vector_stride + t * interval_stride + stage_stride * i;
130 if (i > 0)
131 for (index_t c = 0; c < nx; ++c)
132 rhs[s - nx + c] = scale_b * b.batch(di)(l)(c, 0);
133 for (index_t c = 0; c < nux; ++c)
134 rhs[s + c] = scale_rq * rq.batch(di)(l)(c, 0);
135 }
136 }
137 }
138 index_t s = sλ;
139 const auto cyclic_block = [&](index_t i) {
140 const index_t t = i % p, l = i / p;
141 const index_t di = t * n;
142 for (index_t c = 0; c < nx; ++c)
143 rhs[s + c] = scale_b * b.batch(di)(l)(c, 0);
144 s += nx;
145 };
146 for (index_t l = 0; l < lp(); ++l) {
147 index_t offset = 1 << l;
148 index_t stride = offset << 1;
149 for (index_t i = offset; i < v * p; i += stride)
150 cyclic_block(i);
151 }
152 for (index_t i = 0; i < v * p; i += p) {
153 cyclic_block(i);
154 }
155 return rhs;
156}
157
158template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
162 const index_t nux = nu + nx, nuxx = nux + nx;
163 // stride between stages in the same interval
164 const index_t stage_stride = nuxx;
165 // stride between intervals in the thread partitioning
166 const index_t interval_stride = nuxx * n - nx;
167 // stride between vector lanes (largest stride)
168 const index_t vector_stride = p * interval_stride;
169 // total matrix size
170 const index_t nn = ceil_N() * nuxx;
171 // first index of the CR block
172 const index_t sλ = nn - (nx * p * v);
173
174 SparseMatrixBuilder mat{.rows = nn, .cols = nn, .symmetry = Symmetry::Unsymmetric};
175
176 // Compute blocks that are not computed by the Cyqlone factorization (because they are not
177 // needed for the solve)
178 matrix<> LA{{.depth = v * p, .rows = nx, .cols = n * nx}};
179 matrix<> invQᵀ{{.depth = v * p, .rows = nx, .cols = n * nx}};
180 matrix<> LBA{{.depth = v * p, .rows = nu + nx, .cols = (n - 1) * nx}};
181 for (index_t t = 0; t < p; ++t) {
182 const index_t di0 = t * n; // data batch index
183 auto RSQ = riccati_LH.batch(t);
184 auto LBAt = LBA.batch(t);
185 auto invQᵀt = invQᵀ.batch(t);
186 auto  = riccati_LAB.batch(t).left_cols(n * nx);
187 auto LAt = LA.batch(t);
188 for (index_t i = 0; i < n; ++i) {
189 const auto di = di0 + i;
190 auto RSQi = RSQ.middle_cols(i * nux, nux);
191 auto Qi = tril(RSQi.bottom_right(nx, nx));
192 auto Qi_inv = triu(invQᵀt.middle_cols(i * nx, nx));
193 auto Âi = Â.middle_cols(i * nx, nx);
194 auto LAi = LAt.middle_cols(i * nx, nx);
195
196 copy(Âi, LAi);
197 if (i + 1 < n) // Final block already inverted
198 trtri(Qi, Qi_inv.transposed());
199 else
200 copy(triu(RSQi.block(nu - 1, nu, nx, nx)), Qi_inv);
201 if (i + 1 < n) // Final block is already  LQ⁻ᵀ
202 trsm(LAi, Qi.transposed());
203 if (i > 0) {
204 auto LBAi = LBAt.middle_cols((i - 1) * nx, nx);
205 auto RSQ_prev = RSQ.middle_cols((i - 1) * nux, nux);
206 auto Q_prev = tril(RSQ_prev.bottom_right(nx, nx));
207 auto BA = data_F.batch(di);
208 copy(BA.transposed(), LBAi);
209 trmm(LBAi, Q_prev);
210 }
211 }
212 }
213
214 // Populate the sparse matrix
215 for (index_t l = 0; l < v; ++l) {
216 for (index_t t = 0; t < p; ++t) {
217 const index_t j0 = t * n + l * n * p;
218 const auto biA = t + l * p;
219 const auto biI = sub_wrap_ceil_P(biA, 1);
220 const auto sλA = sλ + nx * get_linear_batch_offset(biA);
221 const auto sλI = sλ + nx * get_linear_batch_offset(biI);
222 auto B̂t = riccati_LAB.batch(t).right_cols(n * nu);
223 auto LHt = riccati_LH.batch(t);
224 auto LBAt = LBA.batch(t);
225 auto LAt = LA.batch(t);
226 auto invQᵀt = invQᵀ.batch(t);
227 for (index_t i = 0; i < n; ++i) {
228 [[maybe_unused]] const index_t j = sub_wrap_ceil_N(j0, i);
229 index_t s = l * vector_stride + t * interval_stride + i * stage_stride;
230 auto B̂i = B̂t.middle_cols(i * nu, nu);
231 auto LHi = LHt.middle_cols(i * nux, nux);
232 auto iQiᵀ = invQᵀt.middle_cols(i * nx, nx);
233 auto AiQiᵀ = LAt.middle_cols(i * nx, nx);
234 if (i > 0) {
235 auto LBAi = LBAt.middle_cols((i - 1) * nx, nx);
236 auto iQᵀprev = invQᵀt.middle_cols((i - 1) * nx, nx);
237 auto LA_prev = LAt.middle_cols((i - 1) * nx, nx);
238 mat.add(s - nx, s - nx, iQᵀprev(l), -1, UpperTriangular);
239 mat.add(s, s - nx, LBAi(l));
240 mat.add(sλA, s - nx, LA_prev(l));
241 }
242 mat.add(s, s, LHi(l), 1, LowerTriangular);
243 mat.add(sλA, s, B̂i(l));
244 if (i + 1 < n)
245 mat.add(s + nux, s + nu, iQiᵀ(l), -1, UpperTriangular);
246 else
247 mat.add(sλI, s + nu, iQiᵀ(l), -1, UpperTriangular);
248 mat.add(sλA, s + nu, AiQiᵀ(l));
249 }
250 }
251 }
252 index_t s = sλ;
253 const auto cyclic_block = [&](index_t i, index_t offset) {
254 const index_t sY = sλ + nx * get_linear_batch_offset(i + offset);
255 const index_t sU = sλ + nx * get_linear_batch_offset(i - offset);
256 const index_t t = i % p, l = i / p;
257 mat.add(s, s, tricyqle.cr_L.batch(t)(l), 1, LowerTriangular);
258 if (i + offset < v * p)
259 mat.add(sY, s, tricyqle.cr_Y.batch(t)(l));
260 mat.add(sU, s, tricyqle.cr_U.batch(t)(l));
261 s += nx;
262 };
263 const auto cyclic_block_final = [&](index_t i, index_t offset) {
264 const index_t sY = sλ + nx * get_linear_batch_offset(i + offset);
265 const index_t t = i % p, l = i / p;
266 mat.add(s, s, tricyqle.pcr_L.batch(0)(l), 1, LowerTriangular);
267 if (i + offset < v * p)
268 mat.add(sY, s, tricyqle.cr_Y.batch(t)(l));
269 s += nx;
270 };
271 for (index_t l = 0; l < lp(); ++l) {
272 index_t offset = 1 << l;
273 index_t stride = offset << 1;
274 for (index_t i = offset; i < v * p; i += stride)
275 cyclic_block(i, offset);
276 }
277 for (index_t i = 0; i < v * p; i += p)
278 cyclic_block_final(i, p);
279 return std::move(mat).build();
280}
281
282template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
285 const index_t nux = nu + nx, nuxx = nux + nx;
286 // stride between stages in the same interval
287 const index_t stage_stride = nuxx;
288 // stride between intervals in the thread partitioning
289 const index_t interval_stride = nuxx * n - nx;
290 // stride between vector lanes (largest stride)
291 const index_t vector_stride = p * interval_stride;
292 // total matrix size
293 const index_t nn = ceil_N() * nuxx;
294 // first index of the CR block
295 const index_t sλ = nn - (nx * p * v);
296
297 SparseMatrixBuilder mat{.rows = nn, .cols = nn, .symmetry = Symmetry::Lower};
298
299 for (index_t l = 0; l < v; ++l) {
300 for (index_t t = 0; t < p; ++t) {
301 for (index_t i = 0; i < n; ++i) {
302 index_t s = l * vector_stride + t * interval_stride + i * stage_stride;
303 if (i > 0)
304 for (index_t c = 0; c < nx; ++c)
305 mat.add(s - nx + c, s - nx + c, -1);
306 for (index_t c = 0; c < nu; ++c)
307 mat.add(s + c, s + c, 1);
308 for (index_t c = 0; c < nx; ++c)
309 mat.add(s + c + nu, s + c + nu, 1);
310 }
311 }
312 }
313 for (index_t i = 0; i < p * v; ++i)
314 for (index_t r = 0; r < nx; ++r)
315 mat.add(sλ + nx * i + r, sλ + nx * i + r, -1);
316 return std::move(mat).build();
317}
318
319} // namespace CYQLONE_NS(cyqlone)
#define BATMAT_ASSERT(x)
The main header for the Cyqlone and Tricyqle linear solvers.
void xsyrk_LT(T alpha, std::type_identity_t< MatrixView< const T, I > > A, T beta, MatrixView< T, I > C)
void trsm(Structured< VA, SA > A, VB &&B, VD &&D, with_rotate_B_t< RotB >={})
void trtri(Structured< VA, MatrixStructure::LowerTriangular > A, Structured< VD, MatrixStructure::LowerTriangular > D)
void trmm(Structured< VA, SA > A, Structured< VB, SB > B, Structured< VD, SD > D, Opts... opts)
void copy(VA &&A, VB &&B, Opts... opts)
constexpr auto triu(M &&m)
constexpr auto tril(M &&m)
constexpr bool is_pow_2(index_t n)
Definition cyqlone.hpp:32
auto top_rows(index_type n)
auto bottom_left(index_type nr, index_type nc)
auto top_left(index_type nr, index_type nc)
auto bottom_right(index_type nr, index_type nc)
const index_t n
Number of stages per thread per vector lane (rounded up).
Definition cyqlone.hpp:605
SparseMatrix build_sparse_factor() const
Definition sparse.tpp:159
index_t ceil_N() const
Horizon length, rounded up to a multiple of the number of parallel execution units.
Definition cyqlone.hpp:653
typename tricyqle_t::template view< O > view
Non-owning immutable view type for matrix.
Definition cyqlone.hpp:693
matrix< default_order > data_F
Stage-wise dynamics matrices F(j) = [ B(j) A(j) ] of the OCP.
Definition cyqlone.hpp:766
SparseMatrix build_sparse(const CyqloneStorage< value_type > &ocp, std::span< const value_type > Σ) const
Definition sparse.tpp:15
index_t sub_wrap_ceil_N(index_t a, index_t b) const
Subtract b from a modulo N_horiz.
Definition indexing.tpp:53
index_t get_linear_batch_offset(index_t biA) const
Definition indexing.tpp:112
typename tricyqle_t::template matrix< O > matrix
Owning type for a batch of matrices (with batch size v).
Definition cyqlone.hpp:690
const index_t N_horiz
Horizon length of the optimal control problem.
Definition cyqlone.hpp:567
const index_t ny
Number of general constraints of the OCP per stage.
Definition cyqlone.hpp:570
const index_t ny_0
Number of general constraints at stage 0, D(0) u(0).
Definition cyqlone.hpp:571
const index_t nu
Number of controls of the OCP.
Definition cyqlone.hpp:569
matrix< default_order > riccati_LH
Cholesky factors of the Hessian blocks for the Riccati recursion.
Definition cyqlone.hpp:782
SparseMatrix build_sparse_diag() const
Definition sparse.tpp:283
const index_t p
Number of processors/threads.
Definition cyqlone.hpp:601
tricyqle_t tricyqle
Block-tridiagonal solver (CR/PCR/PCG).
Definition cyqlone.hpp:747
std::vector< value_type > build_rhs(view<> rq, view<> b, value_type scale_rq=-1, value_type scale_b=-1) const
Definition sparse.tpp:105
const index_t ny_N
Number of general constraints at the final stage, C(N) x(N).
Definition cyqlone.hpp:572
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads, rounded up.
Definition cyqlone.hpp:610
static constexpr index_t v
Vector length.
Definition cyqlone.hpp:603
index_t sub_wrap_ceil_P(index_t a, index_t b) const
Definition indexing.tpp:92
const index_t nx
Number of states of the OCP.
Definition cyqlone.hpp:568
matrix< default_order > riccati_LAB
Storage for the matrices LB(j), Acl(j) and LA(j₁) for the Riccati recursion.
Definition cyqlone.hpp:788
Storage for a linear-quadratic OCP with the initial states x₀ eliminated.
A builder for constructing a SparseMatrix incrementally.
Definition sparse.hpp:37
void add(index_t row, index_t col, real_t value)
Definition sparse.hpp:43
A sparse matrix in COO format.
Definition sparse.hpp:26