cyqlone develop
Fast, parallel and vectorized solver for linear systems with optimal control structure.
Loading...
Searching...
No Matches
cyqlone.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file
4/// The main header for the Cyqlone and Tricyqle linear solvers.
5/// @ingroup topic-lin-solvers
6
7#include <cyqlone/config.hpp>
10#include <cyqlone/parallel.hpp>
11#include <cyqlone/sparse.hpp>
12#include <cyqlone/timing.hpp>
13#include <batmat/assume.hpp>
14#include <batmat/config.hpp>
15#include <batmat/linalg/hyhound.hpp> // TODO: isolate size functions
16#include <batmat/matrix/layout.hpp>
17#include <batmat/matrix/matrix.hpp>
18#include <batmat/openmp.h>
19#include <batmat/simd.hpp>
20#include <batmat/unroll.h>
21#include <guanaqo/trace.hpp>
22
23#include <algorithm>
24#include <bit>
25#include <cassert>
26#include <utility>
27
28namespace CYQLONE_NS(cyqlone) {
29
31
32[[nodiscard]] constexpr bool is_pow_2(index_t n) {
33 BATMAT_ASSUME(n > 0);
34 auto un = static_cast<std::make_unsigned_t<index_t>>(n);
35 return std::has_single_bit(un);
36}
37
38[[nodiscard]] constexpr index_t ceil_log2(index_t n) {
39 BATMAT_ASSUME(n > 0);
40 auto un = static_cast<std::make_unsigned_t<index_t>>(n);
41 return static_cast<index_t>(std::bit_width(un - 1));
42}
43
44// TODO: replace by ν2
45[[nodiscard]] constexpr index_t get_level(index_t i) {
46 BATMAT_ASSUME(i > 0);
47 auto ui = static_cast<std::make_unsigned_t<index_t>>(i);
48 return static_cast<index_t>(std::countr_zero(ui));
49}
50// TODO: move to indexing.tpp or data.tpp?
51[[nodiscard]] constexpr index_t get_index_in_level(index_t i) {
52 if (i == 0)
53 return 0;
54 auto l = get_level(i);
55 return i >> (l + 1);
56}
57
58/// Solver for block-tridiagonal systems using cyclic reduction (CR), parallel cyclic reduction
59/// (PCR), and preconditioned conjugate gradient (PCG) methods.
60/// @tparam VL Vector length.
61/// @tparam T Scalar type.
62/// @tparam DefaultOrder Storage order for the matrix workspaces (row/column major).
63/// @ingroup topic-block-tridiag-solvers
64template <index_t VL = 4, class T = real_t, StorageOrder DefaultOrder = StorageOrder::ColMajor,
65 class Ctx = parallel::Context<>>
67 using value_type = T;
69 using Context = Ctx;
70 using SharedContext = typename Context::shared_context_type;
71
72 /// @name Problem dimensions
73 /// @{
74
75 const index_t block_size; ///< Block size of the block-tridiagonal system.
76 const index_t max_rank = 0; ///< Maximum update rank.
77
78 /// Whether the block-tridiagonal system is circular (nonzero top-right & bottom-left corners).
79 bool circular = false;
80
81 /// @}
82
83 /// @name Solver parameters
84 /// @{
85
86 /// Solver parameters for Tricyqle-specific settings.
88
89 /// Get the current solver parameters.
90 [[nodiscard]] Params get_params() const { return params; }
91
92 /// Update the solver parameters.
93 void update_params(const Params &new_params) { params = new_params; }
94
95 /// @}
96
97 /// @name Parallelization and vectorization
98 /// @{
99
100 /// Number of processors/threads.
101 const index_t p = 8;
102 /// Vector length.
103 static constexpr index_t v = VL;
104 /// log₂(p), logarithm of the number of processors/threads @ref p, rounded up.
105 [[nodiscard]] constexpr index_t lp() const { return ceil_log2(p); }
106 /// The number of processors @ref p rounded up to the next power of two.
107 [[nodiscard]] constexpr index_t ceil_p() const { return 1 << lp(); }
108 /// The number of parallel execution units `P = p * v` rounded up to the next power of two.
109 [[nodiscard]] constexpr index_t ceil_P() const { return 1 << (lp() + lv()); }
110 /// log₂(v), logarithm of the vector length @ref v.
111 [[nodiscard]] static constexpr index_t lv() { return ceil_log2(v); }
112
113 /// Represents a SIMD vector of width @ref v storing values of type @ref value_type.
115 /// Integral constant type for the vector length.
116 using vl_t = std::integral_constant<index_t, v>;
117 /// Integral constant type for the alignment of the batched matrix data structures.
118 using align_t = std::integral_constant<index_t, v * alignof(value_type)>;
119
120 /// Create a new parallel execution context, storing synchronization primitives and shared data
121 /// for the parallel algorithms.
122 std::unique_ptr<SharedContext> create_parallel_context() const {
123 return std::make_unique<SharedContext>(p);
124 }
125
126 /// @}
127
128 /// @name Indexing utilities
129 /// @{
130
131 /// 2-adic valuation ν₂.
132 [[nodiscard]] index_t ν2(index_t i) const;
133 /// 2-adic valuation modulo p, i.e. `ν2p(0) = ν2p(p) = lp()`.
134 [[nodiscard]] index_t ν2p(index_t i) const;
135 /// Add @p b to @p a modulo @ref ceil_p().
136 [[nodiscard]] index_t add_wrap_ceil_p(index_t a, index_t b) const;
137 /// Subtract @p b from @p a modulo @ref ceil_p().
138 [[nodiscard]] index_t sub_wrap_ceil_p(index_t a, index_t b) const;
139
140 /// @}
141
142 /// @name Matrix data structures
143 /// @{
144
145 /// Default storage order for most matrices.
146 static constexpr auto default_order = DefaultOrder;
147 /// Column-major storage order for column vectors and update matrices.
148 static constexpr auto column_major = StorageOrder::ColMajor;
149
150 /// Owning type for a batch of matrices (with batch size v).
151 template <StorageOrder O = column_major>
153 /// Non-owning immutable view type for @ref matrix.
154 template <StorageOrder O = column_major>
156 /// Non-owning mutable view type for @ref matrix.
157 template <StorageOrder O = column_major>
160 /// Non-owning immutable view type for a single batch of v matrices.
161 template <StorageOrder O = column_major>
163 /// Non-owning mutable view type for a single batch of v matrices.
164 template <StorageOrder O = column_major>
166
167 /// @}
168
169 /// @name Factorization and solve routines
170 /// @{
171
172 /// Initialize the diagonal blocks M of the block tridiagonal system using a user-provided
173 /// function. The function is called with an index `i` in `[0, p)` and a mutable view to the
174 /// compact workspace of depth `v` where the diagonal blocks `M(i+l*p)` for `l` in `[0, v)`
175 /// should be stored. Copying non-compact data to this workspace can be achieved using the
176 /// @ref cyqlone::linalg::pack function.
177 decltype(auto) init_diag(Context &ctx, auto &&func) {
178 return func(ctx.index, cr_L.batch(ctx.index));
179 }
180 /// Initialize the subdiagonal blocks K of the block tridiagonal system using a user-provided
181 /// function. The function is called with an index `i` in `[0, p)` and a mutable view to the
182 /// compact workspace of depth `v` where the subdiagonal blocks `K(i+l*p)` for `l` in `[0, v)`
183 /// should be stored. Copying non-compact data to this workspace can be achieved using the
184 /// @ref cyqlone::linalg::pack function.
185 decltype(auto) init_subdiag(Context &ctx, auto &&func) {
186 return (p == 1 || ctx.index & 1) ? func(ctx.index, cr_Y.batch(ctx.index))
187 : func(ctx.index, cr_U.batch(ctx.index + 1).transposed());
188 }
189 /// Initialize the right-hand side of the linear system using a user-provided function. The
190 /// function is called with an index `i` in `[0, p)` and a mutable view of the batch of @p b
191 /// of depth `v` where the right-hand side blocks `b(i+l*p)` for `l` in `[0, v)` should be
192 /// stored. Copying non-compact data to this batch can be achieved using the
193 /// @ref cyqlone::linalg::pack function.
194 decltype(auto) init_rhs(Context &ctx, mut_view<> b, auto &&func) const {
195 return func(ctx.index, b.batch(ctx.index));
196 }
197 /// Get access to the solution computed by this thread using a user-provided function. The
198 /// function is called with an index `i` in `[0, p)` and a view to the compact batch of @p λ
199 /// of depth `v` where the solution blocks `λ(i+l*p)` for `l` in `[0, v)` are stored. Copying
200 /// this batch to a non-compact layout can be achieved using the @ref cyqlone::linalg::unpack
201 /// function.
202 decltype(auto) get_solution(Context &ctx, view<> λ, auto &&func) const {
203 return func(ctx.index, λ.batch(ctx.index));
204 }
205 /// @copydoc get_solution
206 decltype(auto) get_solution(Context &ctx, mut_view<> λ, auto &&func) const {
207 return func(ctx.index, λ.batch(ctx.index));
208 }
209 /// @copydoc get_solution
210 template <class Λ, class F>
211 decltype(auto) get_solution(Context &ctx, Λ &&λ, F &&func) const {
212 return func(ctx.index, λ.batch(ctx.index));
213 }
214
215 /// Fused factorization and forward solve. Performs CR to reduce the system down to a single
216 /// block tridiagonal system of size @ref v which is then solved using PCR or PCG.
217 ///
218 /// @pre The workspaces `cr_L.batch(i)` contain the diagonal blocks `M(i:v:p)` of the block
219 /// tridiagonal system (see @ref init_diag).
220 /// @pre If `p > 1`, the workspaces `cr_Y.batch(i)` with odd i contain the subdiagonal blocks
221 /// `K(i:v:p)` of the block tridiagonal system (see @ref init_subdiag).
222 /// @pre If `p > 1`, the workspaces `cr_U.batch(i)` with odd i contain the superdiagonal blocks
223 /// `K(i-1:v:p)ᵀ` of the block tridiagonal system (see @ref init_subdiag).
224 /// @pre If `p = 1`, the workspace `cr_Y.batch(0)` contains the subdiagonal blocks `K(0:v:1)`
225 /// of the block tridiagonal system, and `cr_U.batch(0)` is not used.
226 /// @post The workspaces `cr_L.batch(i)` with `i > 0` contain the diagonal blocks of the CR
227 /// Cholesky factor. `cr_Y.batch(i)` and `cr_U.batch(i)` contain the subdiagonal blocks.
228 /// The final batch `cr_L.batch(0)` contains the diagonal blocks of the Schur complement
229 /// of all other blocks, which is the input to the PCR or PCG solver. The batch
230 /// `pcr_L.batch(0)` contains its Cholesky factorizations. The subdiagonal blocks of this
231 /// Schur complement are stored in `pcr_Y.batch(0)`.
232 /// @post If the PCR solver was selected, the full PCR factorization is stored in @ref pcr_L,
233 /// @ref pcr_U and @ref pcr_Y.
234 ///
235 /// @param ctx Parallel execution context for communication/synchronization between threads.
236 /// @param λ On entry, the right-hand side of the linear system. On exit, the solution of the
237 /// forward solve phase.
238 /// @param stride Stride (in number of batches) between batches of @p λ. In total, @p λ
239 /// contains `stride * p` batches, but only every `stride`-th batch is accessed.
240 ///
241 /// Note that a backward solve of the final block `λ.batch(0)` is performed during the forward
242 /// solve phase, so `λ.batch(0)` contains (part of) the final solution. Performing the forward
243 /// and backward solves separately for this block is not possible, because it is solved using
244 /// PCR or PCG. Both methods solve the full block tridiagonal system at once, rather than using
245 /// a single explicit CR Cholesky factorization with distinct forward and backward solves.
246 void factor_solve(Context &ctx, mut_view<> λ, index_t stride = 1);
247 /// Perform only the factorization as described by @ref factor_solve.
248 void factor(Context &ctx);
249 /// Perform only the forward solve as described by @ref factor_solve.
250 void solve_forward(Context &ctx, mut_view<> λ, index_t stride = 1);
251
252 /// Perform the backward solve phase, after the forward solve phase has been performed by
253 /// @ref factor_solve.
254 /// @param ctx Parallel execution context for communication/synchronization between threads.
255 /// @param λ On entry, the solution of the forward solve phase. On exit, the solution of the
256 /// full linear system.
257 /// @param work Workspace of `p * v` column vectors of size @ref block_size.
258 /// @param stride Stride (in number of batches) between batches of @p λ. In total, @p λ
259 /// contains `stride * p` batches, but only every `stride`-th batch is accessed.
260 void solve_reverse(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride = 1) const;
261 void solve_reverse(Context &ctx, mut_view<> λ, index_t stride = 1) {
262 solve_reverse(ctx, λ, work_cr, stride);
263 }
264
265 /// @}
266
267 /// @name Cyclic reduction data structures
268 /// @{
269
270 /// Diagonal blocks of the Cholesky factor of the Schur complement (used during CR).
271 /// Batch indices correspond to block column indices of the block tridiagonal system.
273 return matrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};
274 }();
275 /// Subdiagonal blocks U of the Cholesky factor of the Schur complement (used during CR).
276 /// Batch indices correspond to block column indices of the block tridiagonal system.
278 return matrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};
279 }();
280 /// Subdiagonal blocks Y of the Cholesky factor of the Schur complement (used during CR).
281 /// Batch indices correspond to block column indices of the block tridiagonal system.
283 return matrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};
284 }();
285 /// Temporary workspace for the CR solve phase.
287 return matrix<column_major>{{.depth = p * v, .rows = block_size, .cols = 1}};
288 }();
289
290 /// @}
291
292 /// @name Parallel cyclic reduction data structures
293 /// @{
294
295 /// Diagonal blocks of the PCR Cholesky factorizations.
298 {.depth = v * (lv() + 1), .rows = block_size, .cols = block_size}};
299 }();
300 /// Subdiagonal blocks Y of the PCR Cholesky factorizations.
302 return matrix<default_order>{{.depth = v * lv(), .rows = block_size, .cols = block_size}};
303 }();
304 /// Subdiagonal blocks U of the PCR Cholesky factorizations.
306 return matrix<default_order>{{.depth = v * lv(), .rows = block_size, .cols = block_size}};
307 }();
308 /// Workspace to store the diagonal blocks during the PCR factorization.
310 return matrix<default_order>{{.depth = v, .rows = block_size, .cols = block_size}};
311 }();
312 /// Temporary workspace for CG vectors.
314 return matrix<column_major>{{.depth = v, .rows = block_size, .cols = 4}};
315 }();
316
317 /// @}
318
319 /// @name Factorization update data structures
320 /// @{
321
322 /// Update rank (number of changing constraints) per thread.
323 std::vector<index_t> m_update = std::vector<index_t>(p);
324 /// Update rank from D(0). Negative if D(0) is not handled separately.
325 index_t m_update_u0 = -1;
326
327 /// Compressed reprentation of the nonzero diagonal elements of the matrix Σ.
329 return matrix<column_major>{{.depth = v, .rows = max_rank, .cols = 1}};
330 }();
331 /// Workspace to store the update matrices Ξ(Υ) for the factorization update.
333 return matrix<column_major>{{.depth = 4 * v, .rows = block_size, .cols = max_rank}};
334 }();
335 /// Storage for the hyperbolic Householder transformations.
337 using namespace batmat::linalg;
338 const auto [r, c] = hyhound_size_W(tril(cr_L.batch(0)));
339 return matrix<column_major>{{.depth = p * v, .rows = r, .cols = c}};
340 }();
341
342 /// Two copies of @ref work_update_Σ for PCR updates.
344 return matrix<column_major>{{.depth = v, .rows = 2 * max_rank, .cols = 1}};
345 }();
346 /// Update matrices to apply to the diagonal blocks L during PCR updates.
348 return matrix<column_major>{{.depth = v, .rows = block_size, .cols = max_rank}};
349 }();
350 /// Update matrices to apply to the subdiagonal blocks U and Y during PCR updates.
352 return matrix<column_major>{{.depth = v, .rows = block_size, .cols = 2 * max_rank}};
353 }();
354
355 /// @}
356
357 /// @name Low-level factorization and solve routines
358 /// @{
359
360 /// Fused factorization and forward solve. Unlike @ref factor_solve, this function assumes that
361 /// the odd diagonal blocks in the first level of CR have already been factorized and solved.
362 /// This allows the user to fuse the evaluation and factorization of these blocks, possibly
363 /// enabling higher performance by avoiding an additional trip to memory.
364 /// @pre If `p > 1`, the workspaces `cr_L.batch(i)` contain the diagonal blocks M(i) for even i,
365 /// and the Cholesky factors L(i) of M(i) for odd i.
366 /// @pre If `p = 1`, the workspace `cr_L.batch(0)` contains the diagonal block M(0), and the
367 /// workspace `pcr_L.batch(0)` contains its Cholesky factor L(0).
368 /// @pre If `p > 1`, the workspaces `cr_Y.batch(i)` contain the subdiagonal blocks K(i) for
369 /// odd i (uninitialized for even i).
370 /// @pre If `p = 1`, the workspace `cr_Y.batch(0)` contains the subdiagonal block K(0).
371 /// @pre If `p > 1`, the workspace `cr_U.batch(i)` contains the superdiagonal blocks
372 /// K(i-1)ᵀ for odd i (uninitialized for even i).
373 /// @pre If `p = 1`, the workspace `cr_U.batch(0)` is not used.
374 /// @pre If `p > 1`, the right-hand sides `λ.batch(stride * i)` contain the right-hand sides of
375 /// the system for even i, and the right-hand sides multiplied by L(i)⁻¹ for odd i.
376 /// @pre If `p = 1`, the right-hand side `λ.batch(0)` contains the right-hand side of the system.
377 template <bool Factor = true, bool Solve = true>
378 void factor_solve_skip_first(Context &ctx, mut_view<> λ, index_t stride = 1);
379 /// Factorization-only variant of @ref factor_solve_skip_first.
381 /// Solution-only variant of @ref factor_solve_skip_first.
382 void solve_forward_skip_first(Context &ctx, mut_view<> λ, index_t stride = 1) {
384 }
385 /// Implementation of @ref factor_solve.
386 template <bool Factor = true, bool Solve = true>
387 void factor_solve_impl(Context &ctx, mut_view<> λ, index_t stride = 1);
388
389 /// @}
390
391 /// @name Low-level CR factorization and solve routines
392 /// @{
393
394 [[nodiscard]] index_t cr_thread_assignment(index_t l, index_t c) const;
395 /// Compute a block U in the Cholesky factor for the given CR level @p l and column index @p iU.
396 void factor_U(index_t l, index_t iU);
397 /// Compute a block Y in the Cholesky factor for the given CR level @p l and column index @p iY.
398 void factor_Y(index_t l, index_t iY);
399 /// Update and factorize a block L in the Cholesky factor for CR level @p l+1 and column index
400 /// @p i, using the previously computed blocks U and Y in the same row at level @p l.
401 void factor_L(index_t l, index_t i);
402 /// Compute a subdiagonal block K of the Schur complement for CR level @p l+1 and column index
403 /// @p i, using the previously computed blocks U and Y in the same row at level @p l.
404 void update_K(index_t l, index_t i);
405
406 /// Update the right-hand side @p λ during the forward solve phase of CR after computing
407 /// block @p iU of @p λ at level @p l, subtracting the product `U(iU) λ(iU)` from the block
408 /// of @p λ in the same row as `U(iU)`.
409 /// The @p stride parameter specifies the stride between consecutive blocks of @p λ.
410 void solve_u_forward(index_t l, index_t iU, mut_view<> λ, index_t stride) const;
411 /// Update the right-hand side @p λ during the forward solve phase of CR after computing
412 /// block @p iY of @p λ at level @p l, subtracting the product `Y(iY) λ(iY)` from the block
413 /// of @p λ in the same row as `Y(iY)`.
414 /// The product `Y(iY) λ(iY)` is stored in the workspace @p w to allow it to be computed
415 /// concurrently with @ref solve_u_forward, which updates the same block of @p λ.
416 /// The @p stride parameter specifies the stride between consecutive blocks of @p λ.
417 void solve_y_forward(index_t l, index_t iY, mut_view<> λ, mut_view<> w, index_t stride) const;
418 /// Apply the updates to block @p iL of the right-hand side from @ref solve_u_forward and
419 /// @ref solve_y_forward, and then solve with the diagonal block L for the next level @p l+1.
420 /// The @p stride parameter specifies the stride between consecutive blocks of @p λ.
421 void solve_λ_forward(index_t l, index_t iL, mut_view<> λ, view<> w, index_t stride) const;
422
423 /// @}
424
425 /// @name Low-level PCR factorization and solve routines
426 /// @{
427
428 static constexpr bool merge_last_level_pcr = true;
429 /// Compute the parallel cyclic reduction factorization of the final block tridiagonal system
430 /// of size @ref v.
431 /// The function assumes that the Cholesky factors of the diagonal blocks are stored in
432 /// `cr_L.batch(0)`, and the subdiagonal blocks are stored in `cr_Y.batch(0)`.
433 /// This variant computes both subdiagonal blocks on a single thread.
435 /// Perform a single level of the PCR factorization.
436 /// This variant computes both subdiagonal blocks on a single thread.
437 template <index_t Level>
439 /// Compute the parallel cyclic reduction factorization of the final block tridiagonal system
440 /// of size @ref v.
441 /// This variant computes the subdiagonal blocks in parallel on different threads.
443 /// Perform a single level of the PCR factorization.
444 /// This variant computes the subdiagonal blocks in parallel on different threads.
445 template <index_t Level>
447
448 /// Solve a linear system with the final block tridiagonal system of size @ref v using the PCR
449 /// factorization.
451 /// @copydoc solve_pcr
452 void solve_pcr(mut_batch_view<> λ) { solve_pcr(λ, work_pcg.batch(0).left_cols(1)); }
453 /// Perform a single level of the PCR solve.
454 template <index_t Level>
456
457 /// @}
458
459 /// @name Low-level PCG routines
460 /// @{
461
462 /// Multiply a vector by the final block tridiagonal matrix of size @ref v.
463 /// The matrix is represented by the Cholesky factors of the diagonal blocks L and the
464 /// subdiagonal blocks K.
465 /// ~~~
466 /// M = [ M(0) Kᵀ(0) ] where M(i) = L(i) L(i)ᵀ
467 /// [ K(0) M(1) Kᵀ(1) ]
468 /// [ K(1) M(2) Kᵀ(2) ]
469 /// [ K(2) M(3) ]
470 /// ~~~
473 /// Multiply a vector by the preconditioner for the final block tridiagonal system of size
474 /// @ref v.
475 /// @see mul_Mv
478 /// Solve a linear system with the final block tridiagonal system of size @ref v using the
479 /// preconditioned conjugate gradient method.
480 /// The function assumes that the Cholesky factors of the diagonal blocks are stored in
481 /// `cr_L.batch(0)`, and the subdiagonal blocks are stored in `cr_Y.batch(0)`.
483 /// @copydoc solve_pcg
484 void solve_pcg(mut_batch_view<> λ) { solve_pcg(λ, work_pcg.batch(0)); }
485
486 /// @}
487
488 /// @name Low-level reverse solve routines
489 /// @{
490
491 void solve_reverse_parallel(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride) const;
492 void solve_reverse_serial(mut_view<> λ, mut_view<> work, index_t stride) const;
493 void solve_u_backward(index_t l, index_t iU, mut_view<> λ, mut_view<> w, index_t stride) const;
494 void solve_y_backward(index_t l, index_t iY, mut_view<> λ, index_t stride) const;
495 void solve_λ_backward(index_t biL, mut_view<> λ, view<> w, index_t stride) const;
496
497 /// @}
498
499 /// @name Low-level factorization update routines
500 /// @{
501
502 /// @todo properly define semantics and indices.
503 void set_thread_update_rank(Context &ctx, index_t c, index_t m);
504 /// @todo properly define semantics and indices.
505 void set_update_rank_extra(index_t m);
506 /// @todo properly define semantics and indices.
508
509 [[nodiscard]] std::pair<index_t, index_t> cols_Ups_fwd(index_t l, index_t i) const;
510 [[nodiscard]] std::pair<index_t, index_t> cols_Ups_bwd(index_t l, index_t i) const;
511 [[nodiscard]] std::pair<index_t, index_t> cols_Q_cr(index_t l, index_t i) const;
512 [[nodiscard]] index_t work_Ups_fwd_w(index_t l, index_t i) const;
513 [[nodiscard]] index_t work_Ups_bwd_w(index_t l, index_t i) const;
514 [[nodiscard]] mut_batch_view<column_major> work_Ups_fwd(index_t l, index_t i);
515 [[nodiscard]] mut_batch_view<column_major> work_Ups_bwd(index_t l, index_t i);
516 [[nodiscard]] mut_batch_view<column_major> work_Q_cr(index_t l, index_t i);
517 [[nodiscard]] mut_batch_view<column_major> work_Σ_fwd(index_t l, index_t i);
518 [[nodiscard]] mut_batch_view<column_major> work_Σ_bwd(index_t l, index_t i);
519 [[nodiscard]] mut_batch_view<column_major> work_Σ_Q(index_t l, index_t i);
526
527 template <bool Solve = true>
528 void update_solve_cr(Context &ctx, mut_view<> λ, index_t stride);
529 void update_L(index_t l, index_t i);
530 void update_U(index_t l, index_t i);
531 void update_Y(index_t l, index_t i);
532
533 template <index_t Level>
536
537 /// @}
538
539 /// @name Low-level prefetching
540 /// @{
541
542 template <StorageOrder O>
543 void prefetch(batch_view<O> X) const;
544 template <StorageOrder O>
546 void prefetch_L(index_t bi) const;
547 void prefetch_U(index_t l, index_t iU) const;
548 void prefetch_Y(index_t l, index_t iY) const;
549
550 /// @}
551};
552
553/// Linear solver for systems with optimal control structure.
554/// @tparam VL Vector length.
555/// @tparam T Scalar type.
556/// @tparam DefaultOrder Storage order for the matrix workspaces (row/column major).
557/// @tparam Ctx Parallel execution context type, see @ref parallel::Context.
558/// @ingroup topic-ocp-solvers
559template <index_t VL = 4, class T = real_t, StorageOrder DefaultOrder = StorageOrder::ColMajor,
560 class Ctx = parallel::Context<>>
562 using value_type = T;
563
564 /// @name Problem dimensions
565 /// @{
566
567 const index_t N_horiz; ///< Horizon length of the optimal control problem.
568 const index_t nx; ///< Number of states of the OCP.
569 const index_t nu; ///< Number of controls of the OCP.
570 const index_t ny; ///< Number of general constraints of the OCP per stage.
571 const index_t ny_0; ///< Number of general constraints at stage 0, D(0) u(0).
572 const index_t ny_N; ///< Number of general constraints at the final stage, C(N) x(N).
573
574 /// Get the total number of primal variables in the OCP.
575 /// @note The actual number of variable stored in Cyqlone's internal data structures may be
576 /// larger.
577 [[nodiscard]] index_t num_variables() const { return N_horiz * (nu + nx); }
578 /// Get the total number of dynamics constraints in the OCP.
579 /// @note The actual number of constraints stored in Cyqlone's internal data structures may be
580 /// larger.
581 [[nodiscard]] index_t num_dynamics_constraints() const { return N_horiz * nx; }
582 /// Get the total number of general constraints in the OCP.
583 /// @note The actual number of constraints stored in Cyqlone's internal data structures may be
584 /// larger.
585 [[nodiscard]] index_t num_general_constraints() const {
586 return (N_horiz - 1) * ny + ny_0 + ny_N;
587 }
588
589 /// @}
590
591 /// @name Parallelization and vectorization
592 /// @{
593
594 /// Tricyqle solver type for solving block-tridiagonal systems in parallel.
599
600 /// Number of processors/threads.
601 const index_t p;
602 /// Vector length.
603 static constexpr index_t v = VL;
604 /// Number of stages per thread per vector lane (rounded up).
605 const index_t n = (N_horiz + p * v - 1) / (p * v);
606
607 /// log₂(v), logarithm of the vector length.
608 [[nodiscard]] constexpr index_t lv() const { return tricyqle.lv(); }
609 /// log₂(p), logarithm of the number of processors/threads, rounded up.
610 [[nodiscard]] constexpr index_t lp() const { return tricyqle.lp(); }
611 /// The number of processors @ref p rounded up to the next power of two.
612 [[nodiscard]] constexpr index_t ceil_p() const { return tricyqle.ceil_p(); }
613 /// The number of parallel execution units P rounded up to the next power of two.
614 [[nodiscard]] constexpr index_t ceil_P() const { return tricyqle.ceil_P(); }
615
616 std::unique_ptr<SharedContext> create_parallel_context() const {
617 return tricyqle.create_parallel_context();
618 }
619
620 /// Call a function for each stage in the horizon, passing the stage index, the data batch
621 /// index, and optionally the corresponding batches of the given arrays.
622 /// Iterates backwards in time (decreasing stage index j).
623 void foreach_stage(Context &ctx, auto &&func, auto &&...xs) const {
624 BATMAT_ASSERT(((xs.batch_size() == v) && ...));
625 BATMAT_ASSERT(((xs.depth() == ceil_N()) && ...));
626 const index_t ti = riccati_thread_assignment(ctx);
627 for (index_t i = 0; i < n; ++i) {
628 const index_t di = ti * n + i;
629 const index_t j = sub_wrap_ceil_N(ti * n, i);
630 func(j, di, xs.batch(di)...);
631 }
632 }
633 /// Call a function for each stage in the horizon, passing the stage index, the data batch
634 /// index, and optionally the corresponding batches of the given arrays.
635 /// Iterates forward in time (increasing stage index j).
636 void foreach_stage_fwd(Context &ctx, auto &&func, auto &&...xs) const {
637 BATMAT_ASSERT(((xs.batch_size() == v) && ...));
638 BATMAT_ASSERT(((xs.depth() == ceil_N()) && ...));
639 const index_t ti = riccati_thread_assignment(ctx);
640 for (index_t i = n; i --> 0;) {
641 const index_t di = ti * n + i;
642 const index_t j = sub_wrap_ceil_N(ti * n, i);
643 func(j, di, xs.batch(di)...);
644 }
645 }
646
647 /// @}
648
649 /// @name Indexing utilities
650 /// @{
651
652 /// Horizon length, rounded up to a multiple of the number of parallel execution units.
653 [[nodiscard]] index_t ceil_N() const { return n * p * v; }
654 /// 2-adic valuation ν₂.
655 [[nodiscard]] index_t ν2(index_t i) const;
656 /// 2-adic valuation modulo p, i.e. `ν2p(0) = ν2p(p) = lp()`.
657 [[nodiscard]] index_t ν2p(index_t i) const;
658 /// Add @p b to @p a modulo @ref N_horiz.
659 [[nodiscard]] index_t add_wrap_ceil_N(index_t a, index_t b) const;
660 /// Subtract @p b from @p a modulo @ref N_horiz.
661 [[nodiscard]] index_t sub_wrap_ceil_N(index_t a, index_t b) const;
662 /// Add @p b to @p a modulo @ref p.
663 [[nodiscard]] index_t add_wrap_p(index_t a, index_t b) const;
664 /// Subtract @p b from @p a modulo @ref p.
665 [[nodiscard]] index_t sub_wrap_p(index_t a, index_t b) const;
666 /// Add @p b to @p a modulo @ref ceil_p().
667 [[nodiscard]] index_t add_wrap_ceil_p(index_t a, index_t b) const;
668 /// Subtract @p b from @p a modulo @ref ceil_p().
669 [[nodiscard]] index_t sub_wrap_ceil_p(index_t a, index_t b) const;
670
671 /// @todo refactor sparse.tpp
672 [[nodiscard]] index_t sub_wrap_ceil_P(index_t a, index_t b) const;
673 /// @todo refactor sparse.tpp
674 [[nodiscard]] index_t add_wrap_ceil_P(index_t a, index_t b) const;
675 /// @todo refactor sparse.tpp
676 [[nodiscard]] index_t get_linear_batch_offset(index_t biA) const;
677
678 /// @}
679
680 /// @name Matrix data structures
681 /// @{
682
683 /// Default storage order for most matrices.
685 /// Column-major storage order for column vectors and update matrices.
686 static constexpr auto column_major = tricyqle_t::column_major;
687
688 /// Owning type for a batch of matrices (with batch size v).
689 template <StorageOrder O = column_major>
690 using matrix = typename tricyqle_t::template matrix<O>;
691 /// Non-owning immutable view type for @ref matrix.
692 template <StorageOrder O = column_major>
693 using view = typename tricyqle_t::template view<O>;
694 /// Non-owning mutable view type for @ref matrix.
695 template <StorageOrder O = column_major>
696 using mut_view = typename tricyqle_t::template mut_view<O>;
698 /// Non-owning immutable view type for a single batch of v matrices.
699 template <StorageOrder O = column_major>
700 using batch_view = typename tricyqle_t::template batch_view<O>;
701 /// Non-owning mutable view type for a single batch of v matrices.
702 template <StorageOrder O = column_major>
703 using mut_batch_view = typename tricyqle_t::template mut_batch_view<O>;
704
705 /// @}
706
707 /// @name Solver parameters
708 /// @{
709
710 /// Solver parameters and settings.
712
713 /// Get the current Cyqlone solver parameters.
714 [[nodiscard]] CyqloneParams<value_type> get_params() const { return params; }
715
716 /// Update the Cyqlone solver parameters.
717 void update_params(const CyqloneParams<value_type> &new_params) { params = new_params; }
718
719 /// Get the current Tricyqle solver parameters.
721 return tricyqle.get_params();
722 }
723
724 /// Update the Tricyqle solver parameters.
726 tricyqle.update_params(new_params);
727 }
728
729 /// Get a string representation of the main solver parameters. Used mainly for file names.
730 [[nodiscard]] std::string get_params_string() const {
731 const auto &tricyqle_params = get_tricyqle_params();
732 std::string_view solve = tricyqle_params.solve_method == SolveMethod::PCR ? "pcr"
733 : tricyqle_params.solve_method == SolveMethod::StairPCG
734 ? "pcg=stair"
735 : "pcg=jacobi";
736 std::string_view order = default_order == StorageOrder::RowMajor ? "rm" : "cm";
737 return std::format("nx={}-nu={}-ny={}-N={}-p={}-v={}-{}-{}", nx, nu, ny, N_horiz, p, v,
738 solve, order);
739 }
740
741 /// @}
742
743 /// @name Tricyqle solver for block-tridiagonal systems
744 /// @{
745
746 /// Block-tridiagonal solver (CR/PCR/PCG).
748 .block_size = nx,
749 .max_rank = std::max(ny, ny_0 + ny_N) * N_horiz,
750 .p = p,
751 };
752
753 /// @}
754
755 // Note: the cumbersome IILE initialization syntax is to work around a GCC bug
756 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116015
757
758 /// @name OCP data (reordered for use during the Cyqlone algorithm)
759 /// @{
760
761 /// Stage-wise Hessian blocks H(j) = [ R(j) S(j); S(j)ᵀ Q(j) ] of the OCP cost function.
763 return matrix<default_order>{{.depth = ceil_N(), .rows = nu + nx, .cols = nu + nx}};
764 }();
765 /// Stage-wise dynamics matrices F(j) = [ B(j) A(j) ] of the OCP.
767 return matrix<default_order>{{.depth = ceil_N(), .rows = nx, .cols = nu + nx}};
768 }();
769 /// Stage-wise constraint Jacobians G(j)ᵀ = [ D(j) C(j) ]ᵀ of the OCP.
771 const auto nyM = std::max(ny, ny_0 + ny_N);
772 return matrix<default_order>{{.depth = ceil_N(), .rows = nu + nx, .cols = nyM}};
773 }();
774
775 /// @}
776
777 /// @name Modified Riccati data structures
778 /// @{
779
780 /// Cholesky factors of the Hessian blocks for the Riccati recursion.
781 /// LH(j) = [ LR(j) 0; LS(j) LQ(j) ]
783 return matrix<default_order>{{.depth = p * v, .rows = nu + nx, .cols = n * (nu + nx)}};
784 }();
785 /// Storage for the matrices LB(j), Acl(j) and LA(j₁) for the Riccati recursion.
786 /// Grouped per thread, with layout [ Acl(jₙ) ... Acl(j₂) LA(j₁) | LB(jₙ) ... LB(j₁) ], so that
787 /// LA(j₁) and LB(j) are contiguous (useful when evaluating the Schur complement).
789 return matrix<default_order>{{.depth = p * v, .rows = nx, .cols = n * nx + n * nu}};
790 }();
791 /// Temporary storage for the V(j) = [ B(j)ᵀ LQ(j); A(j)ᵀ LQ(j) ] matrices during the Riccati
792 /// recursion. The workspace is wider than just V to also accommodate the active constraint
793 /// Jacobians, since both are used to update the Hessian blocks during the Riccati recursion.
795 const auto nyM = std::max(ny, ny_0 + ny_N);
796 return matrix<default_order>{{.depth = p * v, .rows = nu + nx, .cols = nx + nyM}};
797 }();
798 /// Temporary workspace for the Riccati solve phase.
800 return matrix<column_major>{{.depth = p * v, .rows = nx, .cols = 1}};
801 }();
802
803 /// @name Riccati factorization update data structures
804 /// @{
805
806 /// Compressed representation of the nonzero diagonal elements of the matrix Σ, populated
807 /// for each thread separately during the factorization update of the Riccati recursion.
809 const auto nyM = std::max(ny, ny_0 + ny_N);
810 return matrix<column_major>{{.depth = p * v, .rows = n * nyM, .cols = 1}};
811 }();
812 /// Workspace to store the update matrices Υu, Υx, Υλ, Φu, Φx and Φλ during the factorization
813 /// update of the Riccati recursion.
814 /// Both @ref riccati_Υ1 and @ref riccati_Υ2 are used alternately.
816 const auto nyM = std::max(ny, ny_0 + ny_N);
817 return matrix<column_major>{{.depth = p * v, .rows = nu + nx + nx, .cols = n * nyM}};
818 }();
819 /// Alternate workspace to @ref riccati_Υ1.
821 const auto nyM = std::max(ny, ny_0 + ny_N);
822 return matrix<column_major>{{.depth = p * v, .rows = nu + nx + nx, .cols = n * nyM}};
823 }();
824
825 /// @}
826
827 /// @name Packing and unpacking of OCP data to Cyqlone storage format
828 /// @{
829
830 /// @}
831
832 /// @name Packing and unpacking of OCP data to Cyqlone storage format
833 /// @{
834
835 /// Initialize a Cyqlone solver for the given OCP.
836 ///
837 /// Note: constraints on u(0) and x(N) should be independent.
838 /// ~~~
839 /// nx nu
840 /// ocp.CD(0) = [ 0 | D ] ny₀
841 /// [ 0 | 0 ] ny - ny₀
842 /// ~~~
843 ///
844 /// Since ocp.D(0) and ocp.C(N) will be merged, the top ny₀ rows of ocp.C(N)
845 /// should be zero.
846 ///
847 /// @todo Create documentation page about the different OCP representations and storage formats.
848 static CyqloneSolver build(const CyqloneStorage<value_type> &ocp, index_t p);
849 /// Update the internal data structures to reflect changes in the OCP data (without changing
850 /// the problem size).
852 /// Initialize the right-hand side vector for the dynamics constraints of the OCP, using the
853 /// custom Cyqlone storage format.
855 /// @copydoc initialize_rhs
857 /// Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage
858 /// format.
860 /// @copydoc initialize_gradient
862 /// Initialize the lower and upper bounds for the general constraints of the OCP, using the
863 /// custom Cyqlone storage format.
865 mut_view<> b_max) const;
866 /// @copydoc initialize_bounds
867 std::pair<matrix<>, matrix<>> initialize_bounds(const CyqloneStorage<value_type> &ocp) const;
868
869 /// @todo check and document behavior when `N_horiz != ceil_N()`.
870 void pack_variables(std::span<const value_type> ux_lin, mut_view<> ux) const;
871 matrix<> pack_variables(std::span<const value_type> ux_lin) const;
872 void unpack_variables(view<> ux, std::span<value_type> ux_lin) const;
873 std::vector<value_type> unpack_variables(view<> ux) const;
874 void pack_dynamics(std::span<const value_type> λ_lin, mut_view<> λ) const;
875 matrix<> pack_dynamics(std::span<const value_type> λ_lin) const;
876 void unpack_dynamics(view<> λ, std::span<value_type> λ_lin) const;
877 std::vector<value_type> unpack_dynamics(view<> λ) const;
878 void pack_constraints(std::span<const value_type> y_lin, mut_view<> y,
879 value_type fill = 0) const;
880 matrix<> pack_constraints(std::span<const value_type> y_lin, value_type fill = 0) const;
881 void unpack_constraints(view<> y, std::span<value_type> y_lin) const;
882 std::vector<value_type> unpack_constraints(view<> y) const;
883
884 /// Get a zero-initialized matrix for the primal variables u and x.
886 /// Get a zero-initialized matrix for the dynamics constraints (or their multipliers).
888 /// Get a zero-initialized matrix for the general constraints (or their multipliers).
890
891 /// @}
892
893 /// @name OCP cost gradient and constraints evaluation
894 /// @{
895
896 /// Compute Mx + b, where M is the dynamics constraint Jacobian matrix of the OCP.
897 /// In other words, evaluate the residuals for all stages, i.e.,
898 /// @f$ A_j x^j + B_j u^j - x^{j+1} + b^j @f$.
900 /// Compute Mᵀλ, where M is the dynamics constraint Jacobian matrix of the OCP.
901 /// Optionally add the result to the existing contents of Mᵀλ by setting @p accum to true.
903 bool accum = false) const;
904 /// Compute the general constraints Gx, where G is the general constraint Jacobian matrix of the
905 /// OCP. In other words, evaluate the constraints for all stages, i.e.,
906 /// @f$ C_j x^j + D_j u^j @f$.
907 void general_constr(Context &ctx, view<> ux, mut_view<> DCux) const;
908 /// Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.
910 /// @copydoc transposed_general_constr
911 /// @todo Remove.
913 /// Compute the cost gradient, with optional scaling factors.
914 /// grad_f ← Q ux + α q + β grad_f
916 mut_view<> grad_f) const;
917 /// Compute the regularized cost gradient, with regularization parameter γ⁻¹, with respect to
918 /// the point @p ux0.
920 mut_view<> grad_f) const;
921 /// Subtract the regularization term from the cost gradient.
923 mut_view<> grad_f) const;
924
925 /// @}
926
927 /// @name Factorization and solve routines
928 /// @{
929
930 /// Compute the Cyqlone factorization of the KKT matrix of the OCP and perform a forward solve
931 /// (fused for improved locality).
932 /// @param ctx Parallel context.
933 /// @param γ Reciprocal primal regularization.
934 /// @param Σ ALM penalty factors.
935 /// @param[in,out] ux Negative augmented Lagrangian gradient on entry; solution of forward solve
936 /// on exit.
937 /// @param[in,out] λ Constant term of the dynamics constraints on entry; solution of forward
938 /// solve on exit.
939 /// To obtain the solution of the KKT system, a reverse solve with the same factorization must
940 /// be performed afterwards.
941 /// @see @ref solve_reverse
943 /// Compute the Cyqlone factorization of the KKT matrix of the OCP.
944 /// @see @ref factor_solve
945 void factor(Context &ctx, value_type γ, view<> Σ);
946 /// Perform a forward solve with the Cyqlone factorization.
947 /// @see @ref factor_solve
949 /// Perform a reverse solve with the Cyqlone factorization.
950 /// @param ctx Parallel context.
951 /// @param ux On entry, the result of the forward solve; on exit the solution of the primal
952 /// variables of the KKT system.
953 /// @param λ On entry, the result of the forward solve; on exit the solution of the dual
954 /// variables corresponding to the dynamics constraints of the KKT system.
956 /// Fused variant of @ref solve_reverse and @ref transposed_dynamics_constr (for improved
957 /// locality of the dynamics Jacobians).
959 /// Perform factorization updates of the Cyqlone factorization as described by
960 /// Algorithm 4 in the paper.
961 /// @param ctx Parallel context.
962 /// @param ΔΣ Changes to the ALM penalty factors Σ.
963 void update(Context &ctx, view<> ΔΣ);
964 /// Fused variant of @ref update and @ref solve_forward.
966
967 /// @}
968
969 /// @name Low-level factorization and forward solve routines
970 /// @{
971
972 index_t riccati_thread_assignment(Context &ctx) const { return add_wrap_p(ctx.index, 1); }
973 template <bool Factor = true, bool Solve = true>
975 template <bool Factor = true, bool Solve = true>
977
978 /// @}
979
980 /// @name Low-level factorization and forward solve routines for parallel cyclic reduction
981 /// @{
982
983 template <bool Factor = true, bool Solve = true>
985
986 /// @}
987
988 /// @name Low-level reverse solve routines
989 /// @{
990
992 std::optional<mut_view<>> Mᵀλ) const;
994 std::optional<mut_view<>> Mᵀλ = std::nullopt) const;
995
996 /// @}
997
998 /// @name Low-level factorization update routines
999 /// @{
1000
1001 /// Update the modified Riccati factorization of a single block column as described by
1002 /// Algorithm 3 in the paper.
1003 template <bool Solve = true>
1005 void update_riccati(Context &ctx, view<> ΔΣ) { update_riccati_solve<false>(ctx, ΔΣ, {}, {}); }
1006 template <bool Solve = true>
1008
1009 /// @}
1010
1011 /// @name Build sparse representations for debugging and testing
1012 /// @{
1013
1015 std::span<const value_type> Σ) const;
1016 [[nodiscard]] std::vector<value_type> build_rhs(view<> rq, view<> b, value_type scale_rq = -1,
1017 value_type scale_b = -1) const;
1018 [[nodiscard]] std::vector<value_type> build_sol(view<> ux, view<> λ) const {
1019 return build_rhs(ux, λ, 1, 1);
1020 }
1021 [[nodiscard]] SparseMatrix build_sparse_factor() const;
1022 [[nodiscard]] SparseMatrix build_sparse_diag() const;
1023
1024 /// @}
1025};
1026
1027} // namespace CYQLONE_NS(cyqlone)
#define BATMAT_ASSUME(x)
#define BATMAT_ASSERT(x)
std::string_view order(qp::StorageOrder o)
Data structure for optimal control problems where the initial states are eliminated.
@ StairPCG
Preconditioned Conjugate Gradient with staircase preconditioner (iterative).
@ PCR
Parallel Cyclic Reduction (direct).
Parameters and settings for the Tricyqle block-tridiagonal solver.
auto hyhound_size_W(Structured< VL, SL > L)
void fill(simdified_value_t< VB > a, VB &&B)
constexpr auto tril(M &&m)
Parameters and settings for the Cyqlone solver.
simd< Tp, deduced_abi< Tp, Np > > deduced_simd
constexpr index_t get_level(index_t i)
Definition cyqlone.hpp:45
constexpr index_t get_index_in_level(index_t i)
Definition cyqlone.hpp:51
constexpr index_t ceil_log2(index_t n)
Definition cyqlone.hpp:38
constexpr bool is_pow_2(index_t n)
Definition cyqlone.hpp:32
Parallel execution context and synchronization primitives.
Sparse matrix utilities.
batch_view_type batch(index_type b) const
Linear solver for systems with optimal control structure.
Definition cyqlone.hpp:561
index_t num_variables() const
Get the total number of primal variables in the OCP.
Definition cyqlone.hpp:577
std::vector< value_type > build_sol(view<> ux, view<> λ) const
Definition cyqlone.hpp:1018
tricyqle_t::simd simd
Definition cyqlone.hpp:598
matrix pack_constraints(std::span< const value_type > y_lin, value_type fill=0) const
Definition data.tpp:485
SparseMatrix build_sparse_factor() const
Definition sparse.tpp:159
void update(Context &ctx, view<> ΔΣ)
Perform factorization updates of the Cyqlone factorization as described by Algorithm 4 in the paper.
Definition update.tpp:284
void transposed_general_constr(view<> y, mut_view<> DCᵀy) const
Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.
matrix initialize_variables() const
Get a zero-initialized matrix for the primal variables u and x.
Definition data.tpp:501
std::vector< value_type > unpack_constraints(view<> y) const
Definition data.tpp:493
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
Definition indexing.tpp:125
constexpr index_t ceil_P() const
The number of parallel execution units P rounded up to the next power of two.
Definition cyqlone.hpp:614
index_t add_wrap_ceil_N(index_t a, index_t b) const
Add b to a modulo N_horiz.
Definition indexing.tpp:42
typename tricyqle_t::template view< O > view
Non-owning immutable view type for matrix.
Definition cyqlone.hpp:693
void update_data(const CyqloneStorage< value_type > &ocp)
Update the internal data structures to reflect changes in the OCP data (without changing the problem ...
Definition data.tpp:94
void update_params(const CyqloneParams< value_type > &new_params)
Update the Cyqlone solver parameters.
Definition cyqlone.hpp:717
void solve_reverse_mul(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> Mᵀλ)
Fused variant of solve_reverse and transposed_dynamics_constr (for improved locality of the dynamics ...
Definition factor.tpp:261
void factor(Context &ctx, value_type γ, view<> Σ)
Compute the Cyqlone factorization of the KKT matrix of the OCP.
Definition factor.tpp:137
void solve_reverse(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work, std::optional< mut_view<> > Mᵀλ=std::nullopt) const
Definition factor.tpp:155
typename tricyqle_t::template batch_view< O > batch_view
Non-owning immutable view type for a single batch of v matrices.
Definition cyqlone.hpp:700
void update_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Fused variant of update and solve_forward.
Definition update.tpp:289
void pack_variables(std::span< const value_type > ux_lin, mut_view<> ux) const
Definition data.tpp:242
void factor_solve(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
Compute the Cyqlone factorization of the KKT matrix of the OCP and perform a forward solve (fused for...
Definition factor.tpp:132
void pack_constraints(std::span< const value_type > y_lin, mut_view<> y, value_type fill=0) const
Definition data.tpp:366
typename tricyqle_t::layer_stride layer_stride
Definition cyqlone.hpp:697
void cost_gradient_remove_regularization(Context &ctx, value_type γ, view<> x, view<> x0, mut_view<> grad_f) const
Subtract the regularization term from the cost gradient.
Definition mat-vec.tpp:147
void initialize_gradient(const CyqloneStorage< value_type > &ocp, mut_view<> grad) const
Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage format.
Definition data.tpp:170
void unpack_variables(view<> ux, std::span< value_type > ux_lin) const
Definition data.tpp:281
matrix initialize_rhs(const CyqloneStorage< value_type > &ocp) const
Initialize the right-hand side vector for the dynamics constraints of the OCP, using the custom Cyqlo...
Definition data.tpp:429
void transposed_dynamics_constr(Context &ctx, view<> λ, mut_view<> Mᵀλ, bool accum=false) const
Compute Mᵀλ, where M is the dynamics constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:52
void residual_dynamics_constr(Context &ctx, view<> x, view<> b, mut_view<> Mxb) const
Compute Mx + b, where M is the dynamics constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:17
index_t ν2(index_t i) const
2-adic valuation ν₂.
Definition indexing.tpp:121
void update_solve_impl(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
[PCR update]
Definition update.tpp:263
void update_riccati(Context &ctx, view<> ΔΣ)
Definition cyqlone.hpp:1005
TricyqleParams< value_type > get_tricyqle_params() const
Get the current Tricyqle solver parameters.
Definition cyqlone.hpp:720
SparseMatrix build_sparse(const CyqloneStorage< value_type > &ocp, std::span< const value_type > Σ) const
Definition sparse.tpp:15
index_t sub_wrap_ceil_N(index_t a, index_t b) const
constexpr index_t lv() const
log₂(v), logarithm of the vector length.
Definition cyqlone.hpp:608
std::unique_ptr< SharedContext > create_parallel_context() const
Definition cyqlone.hpp:616
std::string get_params_string() const
Get a string representation of the main solver parameters. Used mainly for file names.
Definition cyqlone.hpp:730
void unpack_constraints(view<> y, std::span< value_type > y_lin) const
Definition data.tpp:401
index_t get_linear_batch_offset(index_t biA) const
Definition indexing.tpp:112
index_t add_wrap_p(index_t a, index_t b) const
Add b to a modulo p.
Definition indexing.tpp:73
void cost_gradient_regularized(Context &ctx, value_type γ, view<> ux, view<> ux0, view<> q, mut_view<> grad_f) const
Compute the regularized cost gradient, with regularization parameter γ⁻¹, with respect to the point u...
Definition mat-vec.tpp:129
typename tricyqle_t::template matrix< O > matrix
Owning type for a batch of matrices (with batch size v).
Definition cyqlone.hpp:690
tricyqle_t::Context Context
Definition cyqlone.hpp:596
void solve_forward(Context &ctx, mut_view<> ux, mut_view<> λ)
Perform a forward solve with the Cyqlone factorization.
Definition factor.tpp:141
index_t num_dynamics_constraints() const
Get the total number of dynamics constraints in the OCP.
Definition cyqlone.hpp:581
static CyqloneSolver build(const CyqloneStorage< value_type > &ocp, index_t p)
Initialize a Cyqlone solver for the given OCP.
Definition data.tpp:41
void cost_gradient(Context &ctx, view<> ux, value_type α, view<> q, value_type β, mut_view<> grad_f) const
Compute the cost gradient, with optional scaling factors.
Definition mat-vec.tpp:115
matrix initialize_dynamics_constraints() const
Get a zero-initialized matrix for the dynamics constraints (or their multipliers).
Definition data.tpp:506
void transposed_general_constr(Context &ctx, view<> y, mut_view<> DCᵀy) const
Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:105
void foreach_stage(Context &ctx, auto &&func, auto &&...xs) const
Call a function for each stage in the horizon, passing the stage index, the data batch index,...
Definition cyqlone.hpp:623
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
Definition indexing.tpp:82
void initialize_rhs(const CyqloneStorage< value_type > &ocp, mut_view<> rhs) const
Initialize the right-hand side vector for the dynamics constraints of the OCP, using the custom Cyqlo...
Definition data.tpp:147
void solve_reverse(Context &ctx, mut_view<> ux, mut_view<> λ)
Perform a reverse solve with the Cyqlone factorization.
Definition factor.tpp:255
std::vector< value_type > unpack_variables(view<> ux) const
Definition data.tpp:461
TricyqleSolver< VL, T, DefaultOrder, Ctx > tricyqle_t
Tricyqle solver type for solving block-tridiagonal systems in parallel.
Definition cyqlone.hpp:595
void solve_riccati_reverse(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work, std::optional< mut_view<> > Mᵀλ) const
[Modified Riccati factorization and fused forward solve]
Definition riccati.tpp:145
index_t riccati_thread_assignment(Context &ctx) const
Definition cyqlone.hpp:972
void general_constr(Context &ctx, view<> ux, mut_view<> DCux) const
Compute the general constraints Gx, where G is the general constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:95
index_t add_wrap_ceil_P(index_t a, index_t b) const
Definition indexing.tpp:102
void compute_schur(Context &ctx, mut_view<> ux, mut_view<> λ)
[Cyqlone compute Schur]
Definition schur.tpp:31
matrix initialize_general_constraints() const
Get a zero-initialized matrix for the general constraints (or their multipliers).
Definition data.tpp:511
matrix initialize_gradient(const CyqloneStorage< value_type > &ocp) const
Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage format.
Definition data.tpp:437
typename tricyqle_t::template mut_batch_view< O > mut_batch_view
Non-owning mutable view type for a single batch of v matrices.
Definition cyqlone.hpp:703
void update_riccati_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Update the modified Riccati factorization of a single block column as described by Algorithm 3 in the...
Definition update.tpp:352
index_t sub_wrap_p(index_t a, index_t b) const
Subtract b from a modulo p.
Definition indexing.tpp:64
void initialize_bounds(const CyqloneStorage< value_type > &ocp, mut_view<> b_min, mut_view<> b_max) const
Initialize the lower and upper bounds for the general constraints of the OCP, using the custom Cyqlon...
Definition data.tpp:204
std::vector< value_type > unpack_dynamics(view<> λ) const
Definition data.tpp:477
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
Definition indexing.tpp:87
typename tricyqle_t::template mut_view< O > mut_view
Non-owning mutable view type for matrix.
Definition cyqlone.hpp:696
void pack_dynamics(std::span< const value_type > λ_lin, mut_view<> λ) const
Definition data.tpp:316
void update_tricyqle_params(const TricyqleParams< value_type > &new_params)
Update the Tricyqle solver parameters.
Definition cyqlone.hpp:725
matrix pack_dynamics(std::span< const value_type > λ_lin) const
Definition data.tpp:469
constexpr index_t ceil_p() const
The number of processors p rounded up to the next power of two.
Definition cyqlone.hpp:612
void foreach_stage_fwd(Context &ctx, auto &&func, auto &&...xs) const
Call a function for each stage in the horizon, passing the stage index, the data batch index,...
Definition cyqlone.hpp:636
void factor_riccati_solve(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
[Modified Riccati factorization and fused forward solve]
Definition riccati.tpp:23
SparseMatrix build_sparse_diag() const
Definition sparse.tpp:283
std::pair< matrix<>, matrix<> > initialize_bounds(const CyqloneStorage< value_type > &ocp) const
Initialize the lower and upper bounds for the general constraints of the OCP, using the custom Cyqlon...
Definition data.tpp:445
index_t num_general_constraints() const
Get the total number of general constraints in the OCP.
Definition cyqlone.hpp:585
matrix pack_variables(std::span< const value_type > ux_lin) const
Definition data.tpp:453
CyqloneParams< value_type > get_params() const
Get the current Cyqlone solver parameters.
Definition cyqlone.hpp:714
std::vector< value_type > build_rhs(view<> rq, view<> b, value_type scale_rq=-1, value_type scale_b=-1) const
Definition sparse.tpp:105
void factor_solve_impl(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
[Cyqlone factorization and fused forward solve]
Definition factor.tpp:13
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads, rounded up.
Definition cyqlone.hpp:610
void unpack_dynamics(view<> λ, std::span< value_type > λ_lin) const
Definition data.tpp:342
tricyqle_t::SharedContext SharedContext
Definition cyqlone.hpp:597
index_t sub_wrap_ceil_P(index_t a, index_t b) const
Definition indexing.tpp:92
Storage for a linear-quadratic OCP with the initial states x₀ eliminated.
A sparse matrix in COO format.
Definition sparse.hpp:26
Solver for block-tridiagonal systems using cyclic reduction (CR), parallel cyclic reduction (PCR),...
Definition cyqlone.hpp:66
void update_pcr(batch_view<> fwd, batch_view<> bwd, batch_view<> Σ)
[Cyqlone update CR helper]
Definition update.tpp:180
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads p, rounded up.
Definition cyqlone.hpp:105
void factor_solve_impl(Context &ctx, mut_view<> λ, index_t stride=1)
Implementation of factor_solve.
Definition factor.tpp:29
value_type mul_precond(batch_view<> r, mut_batch_view<> z, mut_batch_view<> w, batch_view< default_order > L, batch_view< default_order > K) const
Multiply a vector by the preconditioner for the final block tridiagonal system of size v.
Definition pcg.tpp:35
void solve_reverse_serial(mut_view<> λ, mut_view<> work, index_t stride) const
[Cyqlone solve CR]
Definition factor.tpp:228
mut_batch_view< column_major > work_Σ_extra()
Definition update.tpp:706
batmat::matrix::View< value_type, index_t, vl_t, vl_t, layer_stride, O > mut_batch_view
Non-owning mutable view type for a single batch of v matrices.
Definition cyqlone.hpp:165
mut_batch_view< column_major > work_Σ_fwd(index_t l, index_t i)
Definition update.tpp:636
void factor_L(index_t l, index_t i)
Update and factorize a block L in the Cholesky factor for CR level l+1 and column index i,...
Definition cr.tpp:71
void prefetch_L(batch_view< O > X) const
Definition cr.tpp:276
decltype(auto) get_solution(Context &ctx, mut_view<> λ, auto &&func) const
Get access to the solution computed by this thread using a user-provided function.
Definition cyqlone.hpp:206
mut_batch_view< column_major > work_Ups_fwd_last()
Definition update.tpp:657
mut_batch_view< column_major > work_Σ_bwd_last()
Definition update.tpp:689
decltype(auto) init_subdiag(Context &ctx, auto &&func)
Initialize the subdiagonal blocks K of the block tridiagonal system using a user-provided function.
Definition cyqlone.hpp:185
mut_batch_view< column_major > work_Σ_Q(index_t l, index_t i)
Definition update.tpp:650
mut_batch_view< column_major > work_Ups_extra()
Definition update.tpp:699
void factor_skip_first(Context &ctx)
Factorization-only variant of factor_solve_skip_first.
Definition cyqlone.hpp:380
void factor_solve_skip_first(Context &ctx, mut_view<> λ, index_t stride=1)
Fused factorization and forward solve.
Definition factor.tpp:48
void prefetch_L(index_t bi) const
Definition cr.tpp:291
typename Context::shared_context_type SharedContext
Definition cyqlone.hpp:70
index_t ν2(index_t i) const
2-adic valuation ν₂.
Definition indexing.tpp:30
void solve_pcr(mut_batch_view<> λ)
Solve a linear system with the final block tridiagonal system of size v using the PCR factorization.
Definition cyqlone.hpp:452
batmat::matrix::DefaultStride layer_stride
Definition cyqlone.hpp:159
mut_batch_view< column_major > work_Q_cr(index_t l, index_t i)
Definition update.tpp:628
batmat::datapar::deduced_simd< value_type, v > simd
Represents a SIMD vector of width v storing values of type value_type.
Definition cyqlone.hpp:114
void solve_y_forward(index_t l, index_t iY, mut_view<> λ, mut_view<> w, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iY of λ at le...
Definition cr.tpp:177
void solve_u_backward(index_t l, index_t iU, mut_view<> λ, mut_view<> w, index_t stride) const
Definition cr.tpp:210
batmat::matrix::View< const value_type, index_t, vl_t, index_t, index_t, O > view
Non-owning immutable view type for matrix.
Definition cyqlone.hpp:155
batmat::matrix::View< value_type, index_t, vl_t, index_t, index_t, O > mut_view
Non-owning mutable view type for matrix.
Definition cyqlone.hpp:158
decltype(auto) get_solution(Context &ctx, Λ &&λ, F &&func) const
Get access to the solution computed by this thread using a user-provided function.
Definition cyqlone.hpp:211
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
Definition indexing.tpp:36
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
Definition indexing.tpp:19
void solve_reverse(Context &ctx, mut_view<> λ, index_t stride=1)
Definition cyqlone.hpp:261
decltype(auto) init_diag(Context &ctx, auto &&func)
Initialize the diagonal blocks M of the block tridiagonal system using a user-provided function.
Definition cyqlone.hpp:177
void solve_forward(Context &ctx, mut_view<> λ, index_t stride=1)
Perform only the forward solve as described by factor_solve.
Definition factor.tpp:126
mut_batch_view< column_major > work_Σ_bwd(index_t l, index_t i)
Definition update.tpp:643
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
Definition indexing.tpp:8
index_t cr_thread_assignment(index_t l, index_t c) const
Adjust thread assignment for non-power-of-two p: The diagonal blocks M(⌊p/2⌋2) are usually mapped to ...
Definition factor.tpp:277
constexpr index_t ceil_P() const
The number of parallel execution units P = p * v rounded up to the next power of two.
Definition cyqlone.hpp:109
mut_batch_view< column_major > work_Ups_bwd(index_t l, index_t i)
Definition update.tpp:620
void solve_forward_skip_first(Context &ctx, mut_view<> λ, index_t stride=1)
Solution-only variant of factor_solve_skip_first.
Definition cyqlone.hpp:382
std::pair< index_t, index_t > cols_Ups_fwd(index_t l, index_t i) const
Definition update.tpp:558
void update_L(index_t l, index_t i)
[Cyqlone update CR helper]
Definition update.tpp:23
std::pair< index_t, index_t > cols_Q_cr(index_t l, index_t i) const
Definition update.tpp:588
void update_pcr_level(index_t m, mut_batch_view<> WYU, mut_batch_view<> WΣ)
Definition update.tpp:198
void solve_λ_backward(index_t biL, mut_view<> λ, view<> w, index_t stride) const
Definition cr.tpp:241
decltype(auto) get_solution(Context &ctx, view<> λ, auto &&func) const
Get access to the solution computed by this thread using a user-provided function.
Definition cyqlone.hpp:202
mut_batch_view< column_major > work_Ups_bwd_last()
Definition update.tpp:668
mut_batch_view< column_major > work_Σ_fwd_last()
Definition update.tpp:679
index_t work_Ups_bwd_w(index_t l, index_t i) const
Definition update.tpp:604
void set_update_rank_extra(index_t m)
Definition update.tpp:547
void solve_λ_forward(index_t l, index_t iL, mut_view<> λ, view<> w, index_t stride) const
Apply the updates to block iL of the right-hand side from solve_u_forward and solve_y_forward,...
Definition cr.tpp:190
mut_batch_view< column_major > work_Ups_fwd(index_t l, index_t i)
Definition update.tpp:612
void solve_y_backward(index_t l, index_t iY, mut_view<> λ, index_t stride) const
Definition cr.tpp:225
void update_solve_cr(Context &ctx, mut_view<> λ, index_t stride)
[Cyqlone update CR]
Definition update.tpp:297
void set_thread_update_rank(Context &ctx, index_t c, index_t m)
[Cyqlone update Riccati]
Definition update.tpp:539
std::integral_constant< index_t, v *alignof(value_type)> align_t
Integral constant type for the alignment of the batched matrix data structures.
Definition cyqlone.hpp:118
void factor_pcr()
Compute the parallel cyclic reduction factorization of the final block tridiagonal system of size v.
Definition pcr.tpp:28
Params get_params() const
Get the current solver parameters.
Definition cyqlone.hpp:90
constexpr index_t ceil_p() const
The number of processors p rounded up to the next power of two.
Definition cyqlone.hpp:107
std::unique_ptr< SharedContext > create_parallel_context() const
Create a new parallel execution context, storing synchronization primitives and shared data for the p...
Definition cyqlone.hpp:122
void factor_solve(Context &ctx, mut_view<> λ, index_t stride=1)
Fused factorization and forward solve.
Definition factor.tpp:117
void factor_U(index_t l, index_t iU)
Compute a block U in the Cholesky factor for the given CR level l and column index iU.
Definition cr.tpp:23
batmat::matrix::View< const value_type, index_t, vl_t, vl_t, layer_stride, O > batch_view
Non-owning immutable view type for a single batch of v matrices.
Definition cyqlone.hpp:162
void solve_pcg(mut_batch_view<> λ)
Solve a linear system with the final block tridiagonal system of size v using the preconditioned conj...
Definition cyqlone.hpp:484
void solve_pcg(mut_batch_view<> λ, mut_batch_view<> work_pcg) const
Solve a linear system with the final block tridiagonal system of size v using the preconditioned conj...
Definition pcg.tpp:54
std::integral_constant< index_t, v > vl_t
Integral constant type for the vector length.
Definition cyqlone.hpp:116
void factor(Context &ctx)
Perform only the factorization as described by factor_solve.
Definition factor.tpp:122
void solve_reverse_parallel(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride) const
[Cyqlone solve CR]
Definition factor.tpp:179
void prefetch_U(index_t l, index_t iU) const
Definition cr.tpp:297
void solve_reverse(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride=1) const
Perform the backward solve phase, after the forward solve phase has been performed by factor_solve.
Definition factor.tpp:164
void prefetch(batch_view< O > X) const
[Cyqlone solve CR helper]
Definition cr.tpp:260
void factor_pcr_level_parallel(Context &ctx)
Perform a single level of the PCR factorization.
Definition pcr.tpp:109
decltype(auto) init_rhs(Context &ctx, mut_view<> b, auto &&func) const
Initialize the right-hand side of the linear system using a user-provided function.
Definition cyqlone.hpp:194
void solve_u_forward(index_t l, index_t iU, mut_view<> λ, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iU of λ at le...
Definition cr.tpp:163
void solve_pcr(mut_batch_view<> λ, mut_batch_view<> work_pcr) const
Solve a linear system with the final block tridiagonal system of size v using the PCR factorization.
Definition pcr.tpp:181
void update_params(const Params &new_params)
Update the solver parameters.
Definition cyqlone.hpp:93
index_t work_Ups_fwd_w(index_t l, index_t i) const
Definition update.tpp:593
void update_K(index_t l, index_t i)
Compute a subdiagonal block K of the Schur complement for CR level l+1 and column index i,...
Definition cr.tpp:50
void solve_pcr_level(mut_batch_view<> λ, mut_batch_view<> work_pcr) const
Perform a single level of the PCR solve.
Definition pcr.tpp:194
void update_Y(index_t l, index_t i)
Definition update.tpp:157
std::pair< index_t, index_t > cols_Ups_bwd(index_t l, index_t i) const
Definition update.tpp:573
TricyqleParams< value_type > Params
Definition cyqlone.hpp:68
batmat::matrix::Matrix< value_type, index_t, vl_t, index_t, O, align_t > matrix
Owning type for a batch of matrices (with batch size v).
Definition cyqlone.hpp:152
void update_U(index_t l, index_t i)
Definition update.tpp:123
void factor_pcr_parallel(Context &ctx)
Compute the parallel cyclic reduction factorization of the final block tridiagonal system of size v.
Definition pcr.tpp:101
value_type mul_Mv(batch_view<> p, mut_batch_view<> Mp, batch_view< default_order > L, batch_view< default_order > K) const
Multiply a vector by the final block tridiagonal matrix of size v.
Definition pcg.tpp:23
void prefetch_Y(index_t l, index_t iY) const
Definition cr.tpp:306
void factor_pcr_level()
Perform a single level of the PCR factorization.
Definition pcr.tpp:38
void factor_Y(index_t l, index_t iY)
Compute a block Y in the Cholesky factor for the given CR level l and column index iY.
Definition cr.tpp:37
Thread context for parallel execution.
Definition parallel.hpp:64