API reference#
-
namespace cyqlone#
Typedefs
-
using DefaultTimings = batmat::DefaultTimings#
-
using MatVarPtr = std::unique_ptr<matvar_t, decltype(&Mat_VarFree)>#
Functions
-
void reference_to_gradient(LinearOCPStorage &ocp, std::span<const real_t> ref)#
Simply computes the gradient of the quadratic cost \( J(x, u) = \sum_{j=1}^{N-1} \ell_j(x^j, u^j) + \ell_N(x^N) \), with \( \ell_j(x, u) = \tfrac12 \left\| \begin{pmatrix} x - x^j_\text{ref} \\ u - u^j_\text{ref} \end{pmatrix} \right\|_{H_j}^2 \), with the Hessian \( H_j = \begin{pmatrix} Q_j & S_j^\top \\ S_j & R_j \end{pmatrix} \).
Stores \( \nabla J(0, 0) \) to
qr.
-
inline const char *enum_name(SolveMethod s)#
-
bool is_pow_2(index_t n)#
-
index_t ceil_log2(index_t n)#
-
index_t get_level(index_t i)#
-
index_t get_index_in_level(index_t i)#
-
template<class T, size_t N>
MatVarPtr create_tensor_var(const char *name, std::span<const T> buffer, std::array<index_t, N> dims)#
-
template<class T, size_t N>
void write_tensor(mat_t *matfp, const char *name, std::span<const T> buffer, std::array<index_t, N> dims)#
-
template<class T>
void add_to_mat_impl(mat_t *mat, const std::string &varname, guanaqo::MatrixView<const T, index_t> data)#
-
template<class T>
void add_to_mat_impl(mat_t *mat, const std::string &varname, batmat::matrix::View<const T, index_t> data)#
-
void validate_mat_var(const matvar_t *var, const std::string &name, int expected_rank)#
-
auto open_vector_var(mat_t *mat, const std::string &varname)#
-
index_t get_depth(index_t n)#
- template<class Mat> auto λ_max_power (const Mat &M, const Mat &K, int max_it, typename Mat::value_type tol)
-
struct EmptyCompletion#
- #include <cyqlone/barrier.hpp>
No-op completion function for the TreeBarrier.
Public Functions
-
inline void operator()() const noexcept#
Does nothing.
-
inline void operator()() const noexcept#
-
template<typename CompletionFn = EmptyCompletion, class PhaseType = uint32_t>
class TreeBarrier# - #include <cyqlone/barrier.hpp>
Fairly vanilla combining tree barrier.
It is inspired by GCC 15.2’s __tree_barrier, with some important API differences:
Every thread has a unique thread ID in [0, expected-1]. This eliminates the need for hashing the pthread thread IDs and for the inner search loop to find free slots in the tree.
Wait tries to spin for a given number of iterations before falling back to a futex-based atomic wait.
The barrier phase is exposed to the user.
Custom completion functions can be provided at arrival time.
Reductions and broadcasts on small values are supported.
Public Functions
-
inline TreeBarrier(uint32_t expected, CompletionFn completion)#
Create a barrier with
expectedparticipating threads and a completion function that is called by the last thread that arrives at each phase.
-
TreeBarrier(const TreeBarrier&) = delete#
-
TreeBarrier(TreeBarrier&&) = default#
-
TreeBarrier &operator=(const TreeBarrier&) = delete#
-
TreeBarrier &operator=(TreeBarrier&&) = default#
-
template<class C>
inline arrival_token arrive_with_completion(uint32_t thread_id, C &&custom_completion)# Arrive at the barrier with a custom completion function that is called by the last thread that arrives, before advancing the barrier phase and notifying all waiting threads.
The completion function of the barrier is not called in this case. Each thread should use a unique thread ID in [0, expected-1].
-
inline arrival_token arrive(uint32_t thread_id)#
Arrive at the barrier.
The barrier’s completion function is called by the last thread that arrives, before advancing the barrier phase and notifying all waiting threads. Each thread should use a unique thread ID in [0, expected-1].
-
inline arrival_token arrive(uint32_t thread_id, int line)#
Arrive at the barrier, recording the given line number for sanity checking to make sure that all threads arrive from the same line or statement in the source code.
This is useful for debugging purposes to detect mismatched barrier calls, but should not really be used otherwise. If CYQLONE_SANITY_CHECKS_BARRIER is disabled, the line number is ignored and this function is equivalent to arrive(uint32_t). Each thread should use a unique thread ID in [0, expected-1].
-
inline BarrierPhase current_phase() const#
Query the current barrier phase.
May wrap around on overflow, but all threads will see the same phase values in the same order.
-
inline bool wait_may_block(const arrival_token &token) const noexcept#
Check if wait() may block.
If it returns false, the caller can call wait() and it will return immediately without spinning or sleeping. This is useful if the caller has other non-critical work to do while waiting for other threads. Users should still call wait() before arriving again.
Note
This function does not impose any memory ordering, so even when it returns false, changes made before the arrival of other threads may not be visible yet. In contrast, wait() does ensure proper synchronization.
-
inline void wait(arrival_token &&token) const#
Wait for the barrier to complete after an arrival, using the given token.
Separating the arrival and wait phases allows for overlapping computation with waiting, hiding the synchronization latency. Waiting on the same token multiple times is not allowed.
-
inline void arrive_and_wait(uint32_t thread_id)#
Convenience function to arrive and wait in a single call.
-
inline void arrive_and_wait(uint32_t thread_id, int line)#
Convenience function to arrive and wait in a single call (with optional sanity check).
-
template<class C>
inline void arrive_and_wait_with_completion(uint32_t thread_id, C &&custom_completion)# Convenience function to arrive and wait in a single call (with custom completion).
-
template<class C>
inline auto arrive_and_wait_with_completion(uint32_t thread_id, C &&custom_completion)# Convenience function to arrive and wait in a single call (with custom completion).
Broadcasts the return value of the custom completion function to all threads.
-
template<class T, class F>
inline arrival_token_typed<T> arrive_reduce(uint32_t thread_id, T x, F reduce)# Combining tree reduction across all threads.
Deterministic application order for a given number of threads.
-
template<class T>
inline T wait_reduce(arrival_token_typed<T> &&token)# Wait for the result of an arrive_reduce call and obtain the reduced value.
Public Members
-
uint32_t spin_count = 1000#
Number of spin iterations before falling back to futex-based wait.
Public Static Functions
-
static inline uint32_t max()#
Maximum number of threads supported by this barrier implementation.
-
class arrival_token#
- #include <cyqlone/barrier.hpp>
Subclassed by cyqlone::TreeBarrier< CompletionFn, PhaseType >::arrival_token_typed< T >
Public Functions
-
inline explicit arrival_token(BarrierPhase phase)#
-
arrival_token(const arrival_token &phase) = delete#
-
arrival_token(arrival_token &&phase) = default#
-
arrival_token &operator=(const arrival_token &phase) = delete#
-
arrival_token &operator=(arrival_token &&phase) = default#
-
inline BarrierPhase get() const noexcept#
-
inline explicit arrival_token(BarrierPhase phase)#
-
template<class T>
class arrival_token_typed : public cyqlone::TreeBarrier<CompletionFn, PhaseType>::arrival_token# - #include <cyqlone/barrier.hpp>
Public Functions
-
inline BarrierPhase get() const noexcept#
-
inline BarrierPhase get() const noexcept#
-
struct LinearOCPSparseQP#
- #include <cyqlone/conversion.hpp>
Represents a sparse multiple shooting formulation of the standard optimal control problem.
Public Functions
- KKTMatrix build_kkt (real_t S, std::span< const real_t > Σ, std::span< const bool > J) const
Returns the lower part of the symmetric indefinite KKT matrix for the active set
Jand penalty factorsΣ.
Public Members
-
std::vector<index_t> Q_outer_ptr#
-
std::vector<index_t> Q_inner_idx#
-
std::vector<real_t> Q_values#
-
SparseCSC<index_t, index_t> Q_sparsity#
-
std::vector<index_t> A_outer_ptr#
-
std::vector<index_t> A_inner_idx#
-
std::vector<real_t> A_values#
-
SparseCSC<index_t, index_t> A_sparsity#
-
index_t n#
-
index_t m_eq#
-
index_t m_ineq#
Public Static Functions
-
static LinearOCPSparseQP build(const LinearOCPStorage &ocp)#
-
struct KKTMatrix#
- #include <cyqlone/conversion.hpp>
-
template<class T = real_t>
struct TricyqleParams# - #include <cyqlone/cyqlone-params.hpp>
Parameters and settings for the Tricyqle block-tridiagonal solver.
Public Members
-
bool enable_prefetching = true#
Use prefetching during the reverse CR solve phase.
-
index_t pcg_max_iter = 100#
Maximum number of preconditioned conjugate gradient iterations.
-
value_type pcg_tolerance = std::numeric_limits<value_type>::epsilon() / 10#
Tolerance for the preconditioned conjugate gradient solver.
-
bool pcg_print_resid = false#
Enable printing of the residuals during PCG.
-
SolveMethod solve_method = SolveMethod::StairPCG#
Algorithm to use for solving the final reduced block tridiagonal system.
-
double pcr_max_update_fraction = 0.6#
Tuning parameter for deciding when to update or re-factor the PCR factorization.
If the update rank exceeds this fraction of nx, the PCR factorization is recomputed
-
double cr_max_update_fraction_Y0 = 0.9#
Tuning parameter for deciding when to update or re-factor the last subdiagonal blocks in the CR factorization.
If the update rank exceeds this fraction of nx, the last subdiagonal blocks are recomputed.
-
index_t parallel_solve_cr_threshold = 10#
Threshold on nx for switching to a serial implementation of the reverse CR solve.
-
index_t parallel_factor_pcr_threshold = 20#
Threshold on nx for switching to a serial implementation of the PCR factorization.
-
bool enable_prefetching = true#
-
template<class T = real_t>
struct CyqloneParams# - #include <cyqlone/cyqlone-params.hpp>
Parameters and settings for the Cyqlone solver.
-
template<class T = real_t>
struct CyqloneStorage# - #include <cyqlone/cyqlone-storage.hpp>
Storage for a linear-quadratic OCP with the initial states x₀ eliminated.
The matrices are combined per stage, with inputs ordered first. The first and last stage are special because of the lack of x₀ and uₙ.ₙ₋₁ minimize ∑ [½ uᵢᵀ Rᵢ uᵢ + uᵢᵀ Sᵢ xᵢ + ½ xᵢᵀ Qᵢ xᵢ + rᵢᵀuᵢ + qᵢᵀxᵢ] ⁱ⁼¹ + ½ u₀ᵀ R₀ u₀ + (r₀ + S₀ xᵢₙᵢₜ)ᵀ u₀ + ½ xₙᵀ Qₙ xₙ + qₙᵀ xₙ s.t. xᵢ₊₁ = Aᵢ xᵢ + Bᵢ uᵢ + cᵢ lᵢ ≤ Cᵢ xᵢ + Dᵢ uᵢ ≤ uᵢ l₀ - C₀ xᵢₙᵢₜ ≤ D₀ U₀ ≤ u₀ - C₀ xᵢₙᵢₜ lₙ ≤ Cₙ xₙ ≤ uₙDue to the elimination of x₀, there may be fewer constraints for the first stage, which is tracked by the Ju0 mask. When reconstructing the solution, the multipliers for eliminated constraints are set to zero (we assume that the initial state is feasible w.r.t. the state constraints).Hᵢ = [ Rᵢ Sᵢ ], H₀ = [ R₀ 0 ], Fᵢ = [ Bᵢ Aᵢ ], Gᵢ = [ Dᵢ Cᵢ ], G₀ = [ D₀ Cₙ ] [ Sᵢᵀ Qᵢ ] [ 0 Qₙ ]Public Types
-
using matrix = batmat::matrix::Matrix<value_type, index_t>#
-
using Solution = LinearOCPStorage::Solution#
-
using KKTError = LinearOCPStorage::KKTError#
Public Functions
-
void update_impl(const LinearOCPStorage &ocp)#
-
void update(const LinearOCPStorage &ocp)#
-
void reconstruct_ineq_multipliers(std::span<const value_type> y_compressed, std::span<value_type> y) const#
-
std::vector<value_type> reconstruct_ineq_multipliers(std::span<const value_type> y_compressed) const#
- Solution reconstruct_solution (const LinearOCPStorage &ocp, std::span< const value_type > ux_compressed, std::span< const value_type > y_compressed, std::span< const value_type > λ_compressed) const
- KKTError compute_kkt_error (const LinearOCPStorage &ocp, std::span< const value_type > ux_compressed, std::span< const value_type > y_compressed, std::span< const value_type > λ_compressed) const
Public Members
-
index_t N_horiz#
-
index_t nx#
-
index_t nu#
-
index_t ny#
-
index_t ny_0#
-
index_t ny_N#
-
std::vector<bool> Ju0#
Public Static Functions
-
static CyqloneStorage build(const LinearOCPStorage &ocp, index_t ny_0 = -1)#
-
static index_t count_constr_0(const LinearOCPStorage &ocp, std::vector<bool> &Ju0)#
-
using matrix = batmat::matrix::Matrix<value_type, index_t>#
-
template<index_t VL = 4, class T = real_t, StorageOrder DefaultOrder = StorageOrder::ColMajor, class Ctx = parallel::Context<>>
struct TricyqleSolver# - #include <cyqlone/cyqlone.hpp>
Solver for block-tridiagonal systems using cyclic reduction (CR), parallel cyclic reduction (PCR), and preconditioned conjugate gradient (PCG) methods.
- Template Parameters:
VL – Vector length.
T – Scalar type.
DefaultOrder – Storage order for the matrix workspaces (row/column major).
Parallelization and vectorization
-
using simd = batmat::datapar::deduced_simd<value_type, v>#
Represents a SIMD vector of width v storing values of type value_type.
-
using align_t = std::integral_constant<index_t, v * alignof(value_type)>#
Integral constant type for the alignment of the batched matrix data structures.
-
const index_t p = 8#
Number of processors/threads.
-
inline index_t ceil_P() const#
The number of parallel execution units
P = p * vrounded up to the next power of two.
-
inline std::unique_ptr<SharedContext> create_parallel_context() const#
Create a new parallel execution context, storing synchronization primitives and shared data for the parallel algorithms.
Matrix data structures
-
template<StorageOrder O = column_major>
using matrix = batmat::matrix::Matrix<value_type, index_t, vl_t, index_t, O, align_t># Owning type for a batch of matrices (with batch size v).
-
template<StorageOrder O = column_major>
using view = batmat::matrix::View<const value_type, index_t, vl_t, index_t, index_t, O># Non-owning immutable view type for matrix.
-
template<StorageOrder O = column_major>
using mut_view = batmat::matrix::View<value_type, index_t, vl_t, index_t, index_t, O># Non-owning mutable view type for matrix.
-
using layer_stride = batmat::matrix::DefaultStride#
-
template<StorageOrder O = column_major>
using batch_view = batmat::matrix::View<const value_type, index_t, vl_t, vl_t, layer_stride, O># Non-owning immutable view type for a single batch of v matrices.
-
template<StorageOrder O = column_major>
using mut_batch_view = batmat::matrix::View<value_type, index_t, vl_t, vl_t, layer_stride, O># Non-owning mutable view type for a single batch of v matrices.
-
static auto default_order = DefaultOrder#
Default storage order for most matrices.
-
static auto column_major = StorageOrder::ColMajor#
Column-major storage order for column vectors and update matrices.
Problem dimensions
-
const index_t block_size#
Block size of the block-tridiagonal system.
-
const index_t max_rank = 0#
Maximum update rank.
-
bool circular = false#
Whether the block-tridiagonal system is circular (nonzero top-right & bottom-left corners).
Solver parameters
Cyclic reduction data structures
-
matrix<default_order> cr_L =
[this] {returnmatrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};}()# Diagonal blocks of the Cholesky factor of the Schur complement (used during CR).
Batch indices correspond to block column indices of the block tridiagonal system.
-
matrix<default_order> cr_U =
[this] {returnmatrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};}()# Subdiagonal blocks U of the Cholesky factor of the Schur complement (used during CR).
Batch indices correspond to block column indices of the block tridiagonal system.
-
matrix<default_order> cr_Y =
[this] {returnmatrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};}()# Subdiagonal blocks Y of the Cholesky factor of the Schur complement (used during CR).
Batch indices correspond to block column indices of the block tridiagonal system.
-
matrix<column_major> work_cr =
[this] {returnmatrix<column_major>{{.depth = p * v, .rows = block_size, .cols = 1}};}()# Temporary workspace for the CR solve phase.
Parallel cyclic reduction data structures
-
matrix<default_order> pcr_L =
[this] {returnmatrix<default_order>{{.depth =v * (lv() + 1), .rows = block_size, .cols = block_size}};}()# Diagonal blocks of the PCR Cholesky factorizations.
-
matrix<default_order> pcr_Y =
[this] {returnmatrix<default_order>{{.depth = v * lv(), .rows = block_size, .cols = block_size}};}()# Subdiagonal blocks Y of the PCR Cholesky factorizations.
-
matrix<default_order> pcr_U =
[this] {returnmatrix<default_order>{{.depth = v * lv(), .rows = block_size, .cols = block_size}};}()# Subdiagonal blocks U of the PCR Cholesky factorizations.
-
matrix<default_order> pcr_M =
[this] {returnmatrix<default_order>{{.depth = v, .rows = block_size, .cols = block_size}};}()# Workspace to store the diagonal blocks during the PCR factorization.
-
matrix<column_major> work_pcg =
[this] {returnmatrix<column_major>{{.depth = v, .rows = block_size, .cols = 4}};}()# Temporary workspace for CG vectors.
Factorization update data structures
-
std::vector<index_t> m_update = std::vector<index_t>(p)#
Update rank (number of changing constraints) per thread.
-
index_t m_update_u0 = -1#
Update rank from D(0). Negative if D(0) is not handled separately.
- matrix< column_major > work_update_Σ = [this] {returnmatrix<column_major>{{.depth = v, .rows = max_rank, .cols = 1}};}()
Compressed reprentation of the nonzero diagonal elements of the matrix Σ.
-
matrix<column_major> work_update =
[this] {returnmatrix<column_major>{{.depth = 4 * v, .rows = block_size, .cols = max_rank}};}()# Workspace to store the update matrices Ξ(Υ) for the factorization update.
-
matrix<column_major> work_hyh =
[this] {using namespacebatmat::linalg;const auto [r, c] =hyhound_size_W(tril(cr_L.batch(0)));returnmatrix<column_major>{{.depth = p * v, .rows = r, .cols = c}};}()# Storage for the hyperbolic Householder transformations.
- matrix< column_major > work_update_pcr_Σ = [this] {returnmatrix<column_major>{{.depth = v, .rows = 2 * max_rank, .cols = 1}};}()
Two copies of work_update_Σ for PCR updates.
-
matrix<column_major> work_update_pcr_L =
[this] {returnmatrix<column_major>{{.depth = v, .rows = block_size, .cols = max_rank}};}()# Update matrices to apply to the diagonal blocks L during PCR updates.
-
matrix<column_major> work_update_pcr_UY =
[this] {returnmatrix<column_major>{{.depth = v, .rows = block_size, .cols = 2 * max_rank}};}()# Update matrices to apply to the subdiagonal blocks U and Y during PCR updates.
Low-level PCR factorization and solve routines
-
static bool merge_last_level_pcr = true#
-
void factor_pcr()#
Compute the parallel cyclic reduction factorization of the final block tridiagonal system of size v.
[PCR factor serial]
The function assumes that the Cholesky factors of the diagonal blocks are stored in
cr_L.batch(0), and the subdiagonal blocks are stored incr_Y.batch(0). This variant computes both subdiagonal blocks on a single thread.
-
template<index_t Level>
void factor_pcr_level()# Perform a single level of the PCR factorization.
This variant computes both subdiagonal blocks on a single thread.
-
void factor_pcr_parallel(Context &ctx)#
Compute the parallel cyclic reduction factorization of the final block tridiagonal system of size v.
[PCR factor serial]
This variant computes the subdiagonal blocks in parallel on different threads.
[PCR factor]
-
template<index_t Level>
void factor_pcr_level_parallel(Context &ctx)# Perform a single level of the PCR factorization.
This variant computes the subdiagonal blocks in parallel on different threads.
- void solve_pcr (mut_batch_view<> λ, mut_batch_view<> work_pcr) const
Solve a linear system with the final block tridiagonal system of size v using the PCR factorization.
[PCR factor]
[Cyqlone solve PCR]
- inline void solve_pcr (mut_batch_view<> λ)
Solve a linear system with the final block tridiagonal system of size v using the PCR factorization.
[PCR factor]
[Cyqlone solve PCR]
- template<index_t Level> void solve_pcr_level (mut_batch_view<> λ, mut_batch_view<> work_pcr) const
Perform a single level of the PCR solve.
Indexing utilities
- index_t ν2 (index_t i) const
2-adic valuation ν₂.
- index_t ν2p (index_t i) const
2-adic valuation modulo p, i.e.
ν2p(0) = ν2p(p) = lp().
Factorization and solve routines
-
inline decltype(auto) init_diag(Context &ctx, auto &&func)#
Initialize the diagonal blocks M of the block tridiagonal system using a user-provided function.
The function is called with an index
iin[0, p)and a mutable view to the compact workspace of depthvwhere the diagonal blocksM(i+l*p)forlin[0, v)should be stored. Copying non-compact data to this workspace can be achieved using the cyqlone::linalg::pack function.
-
inline decltype(auto) init_subdiag(Context &ctx, auto &&func)#
Initialize the subdiagonal blocks K of the block tridiagonal system using a user-provided function.
The function is called with an index
iin[0, p)and a mutable view to the compact workspace of depthvwhere the subdiagonal blocksK(i+l*p)forlin[0, v)should be stored. Copying non-compact data to this workspace can be achieved using the cyqlone::linalg::pack function.
-
inline decltype(auto) init_rhs(Context &ctx, mut_view<> b, auto &&func) const#
Initialize the right-hand side of the linear system using a user-provided function.
The function is called with an index
iin[0, p)and a mutable view of the batch ofbof depthvwhere the right-hand side blocksb(i+l*p)forlin[0, v)should be stored. Copying non-compact data to this batch can be achieved using the cyqlone::linalg::pack function.
- inline decltype(auto) get_solution (Context &ctx, view<> λ, auto &&func) const
Get access to the solution computed by this thread using a user-provided function.
The function is called with an index
iin[0, p)and a view to the compact batch ofλof depthvwhere the solution blocksλ(i+l*p)forlin[0, v)are stored. Copying this batch to a non-compact layout can be achieved using the cyqlone::linalg::unpack function.
- inline decltype(auto) get_solution (Context &ctx, mut_view<> λ, auto &&func) const
Get access to the solution computed by this thread using a user-provided function.
The function is called with an index
iin[0, p)and a view to the compact batch ofλof depthvwhere the solution blocksλ(i+l*p)forlin[0, v)are stored. Copying this batch to a non-compact layout can be achieved using the cyqlone::linalg::unpack function.
- template<class Λ, class F> inline decltype(auto) get_solution (Context &ctx, Λ &&λ, F &&func) const
Get access to the solution computed by this thread using a user-provided function.
The function is called with an index
iin[0, p)and a view to the compact batch ofλof depthvwhere the solution blocksλ(i+l*p)forlin[0, v)are stored. Copying this batch to a non-compact layout can be achieved using the cyqlone::linalg::unpack function.
- void factor_solve (Context &ctx, mut_view<> λ, index_t stride=1)
Fused factorization and forward solve.
[Cyqlone factor Schur]
Performs CR to reduce the system down to a single block tridiagonal system of size v which is then solved using PCR or PCG.
Note that a backward solve of the final block
λ.batch(0)is performed during the forward solve phase, soλ.batch(0)contains (part of) the final solution. Performing the forward and backward solves separately for this block is not possible, because it is solved using PCR or PCG. Both methods solve the full block tridiagonal system at once, rather than using a single explicit CR Cholesky factorization with distinct forward and backward solves.- Parameters:
ctx – Parallel execution context for communication/synchronization between threads.
λ – On entry, the right-hand side of the linear system. On exit, the solution of the forward solve phase.
stride – Stride (in number of batches) between batches of
λ. In total,λcontainsstride * pbatches, but only everystride-th batch is accessed.
- Pre:
The workspaces
cr_L.batch(i)contain the diagonal blocksM(i:v:p)of the block tridiagonal system (see init_diag).- Pre:
If
p > 1, the workspacescr_Y.batch(i)with odd i contain the subdiagonal blocksK(i:v:p)of the block tridiagonal system (see init_subdiag).- Pre:
If
p > 1, the workspacescr_U.batch(i)with odd i contain the superdiagonal blocksK(i-1:v:p)ᵀof the block tridiagonal system (see init_subdiag).- Pre:
If
p = 1, the workspacecr_Y.batch(0)contains the subdiagonal blocksK(0:v:1)of the block tridiagonal system, andcr_U.batch(0)is not used.- Post:
The workspaces
cr_L.batch(i)withi > 0contain the diagonal blocks of the CR Cholesky factor.cr_Y.batch(i)andcr_U.batch(i)contain the subdiagonal blocks. The final batchcr_L.batch(0)contains the diagonal blocks of the Schur complement of all other blocks, which is the input to the PCR or PCG solver. The batchpcr_L.batch(0)contains its Cholesky factorizations. The subdiagonal blocks of this Schur complement are stored inpcr_Y.batch(0).- Post:
If the PCR solver was selected, the full PCR factorization is stored in pcr_L, pcr_U and pcr_Y.
-
void factor(Context &ctx)#
Perform only the factorization as described by factor_solve.
- void solve_forward (Context &ctx, mut_view<> λ, index_t stride=1)
Perform only the forward solve as described by factor_solve.
- void solve_reverse (Context &ctx, mut_view<> λ, mut_view<> work, index_t stride=1) const
Perform the backward solve phase, after the forward solve phase has been performed by factor_solve.
- Parameters:
ctx – Parallel execution context for communication/synchronization between threads.
λ – On entry, the solution of the forward solve phase. On exit, the solution of the full linear system.
work – Workspace of
p * vcolumn vectors of size block_size.stride – Stride (in number of batches) between batches of
λ. In total,λcontainsstride * pbatches, but only everystride-th batch is accessed.
- inline void solve_reverse (Context &ctx, mut_view<> λ, index_t stride=1)
Low-level factorization and solve routines
- template<bool Factor = true, bool Solve = true> void factor_solve_skip_first (Context &ctx, mut_view<> λ, index_t stride=1)
Fused factorization and forward solve.
[Cyqlone factor Schur]
Unlike factor_solve, this function assumes that the odd diagonal blocks in the first level of CR have already been factorized and solved. This allows the user to fuse the evaluation and factorization of these blocks, possibly enabling higher performance by avoiding an additional trip to memory.
- Pre:
If
p > 1, the workspacescr_L.batch(i)contain the diagonal blocks M(i) for even i, and the Cholesky factors L(i) of M(i) for odd i.- Pre:
If
p = 1, the workspacecr_L.batch(0)contains the diagonal block M(0), and the workspacepcr_L.batch(0)contains its Cholesky factor L(0).- Pre:
If
p > 1, the workspacescr_Y.batch(i)contain the subdiagonal blocks K(i) for odd i (uninitialized for even i).- Pre:
If
p = 1, the workspacecr_Y.batch(0)contains the subdiagonal block K(0).- Pre:
If
p > 1, the workspacecr_U.batch(i)contains the superdiagonal blocks K(i-1)ᵀ for odd i (uninitialized for even i).- Pre:
If
p = 1, the workspacecr_U.batch(0)is not used.- Pre:
If
p > 1, the right-hand sidesλ.batch(stride * i)contain the right-hand sides of the system for even i, and the right-hand sides multiplied by L(i)⁻¹ for odd i.- Pre:
If
p = 1, the right-hand sideλ.batch(0)contains the right-hand side of the system.
-
inline void factor_skip_first(Context &ctx)#
Factorization-only variant of factor_solve_skip_first.
- inline void solve_forward_skip_first (Context &ctx, mut_view<> λ, index_t stride=1)
Solution-only variant of factor_solve_skip_first.
- template<bool Factor = true, bool Solve = true> void factor_solve_impl (Context &ctx, mut_view<> λ, index_t stride=1)
Implementation of factor_solve.
[Cyqlone factorization and fused forward solve]
Low-level CR factorization and solve routines
-
index_t cr_thread_assignment(index_t l, index_t c) const#
Adjust thread assignment for non-power-of-two p: The diagonal blocks M(⌊p/2⌋2) are usually mapped to increasing thread indices c as the CR level l increases, as can be seen in the functions above, where iY = c + 1 - 2^l, and from the way the path of M nodes curves to the right in the thread assignment diagram in the paper.
However, these large thread indices are not actually present if p is not a power of two, so we need to remap them, undoing the offset 1 - 2^l. We always assign the last M evaluation to the even thread ⌊p/2⌋2, since this thread is present even if p is odd. The odd thread ⌊p/2⌋2+1 is assigned an inactive index, since it never has any work during CR, as there is no coupling between the last and first stages (at least not in the scalar case).
-
void factor_U(index_t l, index_t iU)#
Compute a block U in the Cholesky factor for the given CR level
land column indexiU.[Cyqlone factor CR helper]
-
void factor_Y(index_t l, index_t iY)#
Compute a block Y in the Cholesky factor for the given CR level
land column indexiY.
-
void factor_L(index_t l, index_t i)#
Update and factorize a block L in the Cholesky factor for CR level
l+1and column indexi, using the previously computed blocks U and Y in the same row at levell.
-
void update_K(index_t l, index_t i)#
Compute a subdiagonal block K of the Schur complement for CR level
l+1and column indexi, using the previously computed blocks U and Y in the same row at levell.
- void solve_u_forward (index_t l, index_t iU, mut_view<> λ, index_t stride) const
Update the right-hand side
λduring the forward solve phase of CR after computing blockiUofλat levell, subtracting the productU(iU) λ(iU)from the block ofλin the same row asU(iU).[Cyqlone factor CR helper]
The
strideparameter specifies the stride between consecutive blocks ofλ.[Cyqlone solve CR helper]
- void solve_y_forward (index_t l, index_t iY, mut_view<> λ, mut_view<> w, index_t stride) const
Update the right-hand side
λduring the forward solve phase of CR after computing blockiYofλat levell, subtracting the productY(iY) λ(iY)from the block ofλin the same row asY(iY).The product
Y(iY) λ(iY)is stored in the workspacewto allow it to be computed concurrently with solve_u_forward, which updates the same block ofλ. Thestrideparameter specifies the stride between consecutive blocks ofλ.
- void solve_λ_forward (index_t l, index_t iL, mut_view<> λ, view<> w, index_t stride) const
Apply the updates to block
iLof the right-hand side from solve_u_forward and solve_y_forward, and then solve with the diagonal block L for the next levell+1.The
strideparameter specifies the stride between consecutive blocks ofλ.
Low-level PCG routines
-
value_type mul_Mv(batch_view<> p, mut_batch_view<> Mp, batch_view<default_order> L, batch_view<default_order> K) const#
Multiply a vector by the final block tridiagonal matrix of size v.
The matrix is represented by the Cholesky factors of the diagonal blocks L and the subdiagonal blocks K.
M = [ M(0) Kᵀ(0) ] where M(i) = L(i) L(i)ᵀ [ K(0) M(1) Kᵀ(1) ] [ K(1) M(2) Kᵀ(2) ] [ K(2) M(3) ]
-
value_type mul_precond(batch_view<> r, mut_batch_view<> z, mut_batch_view<> w, batch_view<default_order> L, batch_view<default_order> K) const#
Multiply a vector by the preconditioner for the final block tridiagonal system of size v.
See also
- void solve_pcg (mut_batch_view<> λ, mut_batch_view<> work_pcg) const
Solve a linear system with the final block tridiagonal system of size v using the preconditioned conjugate gradient method.
The function assumes that the Cholesky factors of the diagonal blocks are stored in
cr_L.batch(0), and the subdiagonal blocks are stored incr_Y.batch(0).
- inline void solve_pcg (mut_batch_view<> λ)
Solve a linear system with the final block tridiagonal system of size v using the preconditioned conjugate gradient method.
The function assumes that the Cholesky factors of the diagonal blocks are stored in
cr_L.batch(0), and the subdiagonal blocks are stored incr_Y.batch(0).
Low-level reverse solve routines
- void solve_reverse_parallel (Context &ctx, mut_view<> λ, mut_view<> work, index_t stride) const
[Cyqlone solve CR]
- void solve_reverse_serial (mut_view<> λ, mut_view<> work, index_t stride) const
[Cyqlone solve CR]
[Cyqlone solve CR serial]
- void solve_u_backward (index_t l, index_t iU, mut_view<> λ, mut_view<> w, index_t stride) const
- void solve_y_backward (index_t l, index_t iY, mut_view<> λ, index_t stride) const
- void solve_λ_backward (index_t biL, mut_view<> λ, view<> w, index_t stride) const
Low-level factorization update routines
-
void set_update_rank_extra(index_t m)#
-
void clear_update_rank_extra()#
-
std::pair<index_t, index_t> cols_Ups_fwd(index_t l, index_t i) const#
-
std::pair<index_t, index_t> cols_Ups_bwd(index_t l, index_t i) const#
-
std::pair<index_t, index_t> cols_Q_cr(index_t l, index_t i) const#
-
index_t work_Ups_fwd_w(index_t l, index_t i) const#
-
index_t work_Ups_bwd_w(index_t l, index_t i) const#
-
mut_batch_view<column_major> work_Ups_fwd(index_t l, index_t i)#
-
mut_batch_view<column_major> work_Ups_bwd(index_t l, index_t i)#
-
mut_batch_view<column_major> work_Q_cr(index_t l, index_t i)#
- mut_batch_view< column_major > work_Σ_fwd (index_t l, index_t i)
- mut_batch_view< column_major > work_Σ_bwd (index_t l, index_t i)
- mut_batch_view< column_major > work_Σ_Q (index_t l, index_t i)
-
mut_batch_view<column_major> work_Ups_fwd_last()#
-
mut_batch_view<column_major> work_Ups_bwd_last()#
- mut_batch_view< column_major > work_Σ_fwd_last ()
- mut_batch_view< column_major > work_Σ_bwd_last ()
-
mut_batch_view<column_major> work_Ups_extra()#
- mut_batch_view< column_major > work_Σ_extra ()
- template<bool Solve = true> void update_solve_cr (Context &ctx, mut_view<> λ, index_t stride)
[Cyqlone update CR]
-
void update_L(index_t l, index_t i)#
[Cyqlone update CR helper]
-
void update_U(index_t l, index_t i)#
-
void update_Y(index_t l, index_t i)#
- template<index_t Level> void update_pcr_level (index_t m, mut_batch_view<> WYU, mut_batch_view<> WΣ)
- void update_pcr (batch_view<> fwd, batch_view<> bwd, batch_view<> Σ)
[Cyqlone update CR helper]
[PCR update]
Low-level prefetching
-
template<StorageOrder O>
void prefetch(batch_view<O> X) const# [Cyqlone solve CR helper]
-
template<StorageOrder O>
void prefetch_L(batch_view<O> X) const#
-
void prefetch_L(index_t bi) const#
-
void prefetch_U(index_t l, index_t iU) const#
-
void prefetch_Y(index_t l, index_t iY) const#
-
template<index_t VL = 4, class T = real_t, StorageOrder DefaultOrder = StorageOrder::ColMajor, class Ctx = parallel::Context<>>
struct CyqloneSolver# - #include <cyqlone/cyqlone.hpp>
Linear solver for systems with optimal control structure.
- Template Parameters:
VL – Vector length.
T – Scalar type.
DefaultOrder – Storage order for the matrix workspaces (row/column major).
Ctx – Parallel execution context type, see parallel::Context.
Parallelization and vectorization
-
using tricyqle_t = TricyqleSolver<VL, T, DefaultOrder, Ctx>#
Tricyqle solver type for solving block-tridiagonal systems in parallel.
-
using Context = tricyqle_t::Context#
-
using simd = tricyqle_t::simd#
-
const index_t p#
Number of processors/threads.
-
const index_t n = (N_horiz + p * v - 1) / (p * v)#
Number of stages per thread per vector lane (rounded up).
-
inline index_t lv() const#
log₂(v), logarithm of the vector length.
-
inline index_t lp() const#
log₂(p), logarithm of the number of processors/threads, rounded up.
-
inline index_t ceil_P() const#
The number of parallel execution units P rounded up to the next power of two.
-
inline std::unique_ptr<SharedContext> create_parallel_context() const#
Matrix data structures
-
template<StorageOrder O = column_major>
using matrix = typename tricyqle_t::template matrix<O># Owning type for a batch of matrices (with batch size v).
-
template<StorageOrder O = column_major>
using view = typename tricyqle_t::template view<O># Non-owning immutable view type for matrix.
-
template<StorageOrder O = column_major>
using mut_view = typename tricyqle_t::template mut_view<O># Non-owning mutable view type for matrix.
-
using layer_stride = typename tricyqle_t::layer_stride#
-
template<StorageOrder O = column_major>
using batch_view = typename tricyqle_t::template batch_view<O># Non-owning immutable view type for a single batch of v matrices.
-
template<StorageOrder O = column_major>
using mut_batch_view = typename tricyqle_t::template mut_batch_view<O># Non-owning mutable view type for a single batch of v matrices.
-
static auto default_order = tricyqle_t::default_order#
Default storage order for most matrices.
-
static auto column_major = tricyqle_t::column_major#
Column-major storage order for column vectors and update matrices.
Problem dimensions
-
const index_t N_horiz#
Horizon length of the optimal control problem.
-
const index_t nx#
Number of states of the OCP.
-
const index_t nu#
Number of controls of the OCP.
-
const index_t ny#
Number of general constraints of the OCP per stage.
-
const index_t ny_0#
Number of general constraints at stage 0, D(0) u(0).
-
const index_t ny_N#
Number of general constraints at the final stage, C(N) x(N).
-
inline index_t num_variables() const#
Get the total number of primal variables in the OCP.
Note
The actual number of variable stored in Cyqlone’s internal data structures may be larger.
-
inline index_t num_dynamics_constraints() const#
Get the total number of dynamics constraints in the OCP.
Note
The actual number of constraints stored in Cyqlone’s internal data structures may be larger.
-
inline index_t num_general_constraints() const#
Get the total number of general constraints in the OCP.
Note
The actual number of constraints stored in Cyqlone’s internal data structures may be larger.
Solver parameters
-
CyqloneParams<value_type> params = {}#
Solver parameters and settings.
-
inline CyqloneParams<value_type> get_params() const#
Get the current Cyqlone solver parameters.
-
inline void update_params(const CyqloneParams<value_type> &new_params)#
Update the Cyqlone solver parameters.
-
inline TricyqleParams<value_type> get_tricyqle_params() const#
Get the current Tricyqle solver parameters.
-
inline void update_tricyqle_params(const TricyqleParams<value_type> &new_params)#
Update the Tricyqle solver parameters.
-
inline std::string get_params_string() const#
Get a string representation of the main solver parameters. Used mainly for file names.
Tricyqle solver for block-tridiagonal systems
- tricyqle_t tricyqle = {.block_size =nx,.max_rank = std::max(ny, ny_0 + ny_N) * N_horiz,.p = p,}
Block-tridiagonal solver (CR/PCR/PCG).
OCP data (reordered for use during the Cyqlone algorithm)
-
matrix<default_order> data_H =
[this] {returnmatrix<default_order>{{.depth = ceil_N(), .rows = nu + nx, .cols = nu + nx}};}()# Stage-wise Hessian blocks H(j) = [ R(j) S(j); S(j)ᵀ Q(j) ] of the OCP cost function.
-
matrix<default_order> data_F =
[this] {returnmatrix<default_order>{{.depth = ceil_N(), .rows = nx, .cols = nu + nx}};}()# Stage-wise dynamics matrices F(j) = [ B(j) A(j) ] of the OCP.
- matrix< default_order > data_Gᵀ = [this] {const auto nyM = std::max(ny, ny_0 + ny_N);returnmatrix<default_order>{{.depth = ceil_N(), .rows = nu + nx, .cols = nyM}};}()
Stage-wise constraint Jacobians G(j)ᵀ = [ D(j) C(j) ]ᵀ of the OCP.
Modified Riccati data structures
-
matrix<default_order> riccati_LH =
[this] {returnmatrix<default_order>{{.depth = p * v, .rows = nu + nx, .cols = n * (nu + nx)}};}()# Cholesky factors of the Hessian blocks for the Riccati recursion.
LH(j) = [ LR(j) 0; LS(j) LQ(j) ]
-
matrix<default_order> riccati_LAB =
[this] {returnmatrix<default_order>{{.depth = p * v, .rows = nx, .cols = n * nx + n * nu}};}()# Storage for the matrices LB(j), Acl(j) and LA(j₁) for the Riccati recursion.
Grouped per thread, with layout [ Acl(jₙ) … Acl(j₂) LA(j₁) | LB(jₙ) … LB(j₁) ], so that LA(j₁) and LB(j) are contiguous (useful when evaluating the Schur complement).
-
matrix<default_order> riccati_V =
[this] {const auto nyM = std::max(ny, ny_0 + ny_N);returnmatrix<default_order>{{.depth = p * v, .rows = nu + nx, .cols = nx+ nyM}};}()# Temporary storage for the V(j) = [ B(j)ᵀ LQ(j); A(j)ᵀ LQ(j) ] matrices during the Riccati recursion.
The workspace is wider than just V to also accommodate the active constraint Jacobians, since both are used to update the Hessian blocks during the Riccati recursion.
-
matrix<column_major> riccati_work =
[this] {returnmatrix<column_major>{{.depth = p * v, .rows = nx, .cols = 1}};}()# Temporary workspace for the Riccati solve phase.
Riccati factorization update data structures
- matrix< column_major > work_Σ = [this] {const auto nyM = std::max(ny, ny_0 + ny_N);returnmatrix<column_major>{{.depth = p * v, .rows = n* nyM, .cols = 1}};}()
Compressed representation of the nonzero diagonal elements of the matrix Σ, populated for each thread separately during the factorization update of the Riccati recursion.
- matrix< column_major > riccati_Υ1 = [this] {const auto nyM = std::max(ny, ny_0 + ny_N);returnmatrix<column_major>{{.depth = p * v, .rows = nu + nx + nx, .cols = n* nyM}};}()
Workspace to store the update matrices Υu, Υx, Υλ, Φu, Φx and Φλ during the factorization update of the Riccati recursion.
Both riccati_Υ1 and riccati_Υ2 are used alternately.
- matrix< column_major > riccati_Υ2 = [this] {const auto nyM = std::max(ny, ny_0 + ny_N);returnmatrix<column_major>{{.depth = p * v, .rows = nu + nx + nx, .cols = n* nyM}};}()
Alternate workspace to riccati_Υ1.
Indexing utilities
-
inline index_t ceil_N() const#
Horizon length, rounded up to a multiple of the number of parallel execution units.
- index_t ν2 (index_t i) const
2-adic valuation ν₂.
- index_t ν2p (index_t i) const
2-adic valuation modulo p, i.e.
ν2p(0) = ν2p(p) = lp().
-
index_t sub_wrap_ceil_P(index_t a, index_t b) const#
-
index_t add_wrap_ceil_P(index_t a, index_t b) const#
-
index_t get_linear_batch_offset(index_t biA) const#
Packing and unpacking of OCP data to Cyqlone storage format
-
void update_data(const CyqloneStorage<value_type> &ocp)#
Update the internal data structures to reflect changes in the OCP data (without changing the problem size).
-
void initialize_rhs(const CyqloneStorage<value_type> &ocp, mut_view<> rhs) const#
Initialize the right-hand side vector for the dynamics constraints of the OCP, using the custom Cyqlone storage format.
-
matrix initialize_rhs(const CyqloneStorage<value_type> &ocp) const#
Initialize the right-hand side vector for the dynamics constraints of the OCP, using the custom Cyqlone storage format.
-
void initialize_gradient(const CyqloneStorage<value_type> &ocp, mut_view<> grad) const#
Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage format.
-
matrix initialize_gradient(const CyqloneStorage<value_type> &ocp) const#
Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage format.
-
void initialize_bounds(const CyqloneStorage<value_type> &ocp, mut_view<> b_min, mut_view<> b_max) const#
Initialize the lower and upper bounds for the general constraints of the OCP, using the custom Cyqlone storage format.
-
std::pair<matrix<>, matrix<>> initialize_bounds(const CyqloneStorage<value_type> &ocp) const#
Initialize the lower and upper bounds for the general constraints of the OCP, using the custom Cyqlone storage format.
-
void pack_variables(std::span<const value_type> ux_lin, mut_view<> ux) const#
-
matrix pack_variables(std::span<const value_type> ux_lin) const#
-
void unpack_variables(view<> ux, std::span<value_type> ux_lin) const#
-
std::vector<value_type> unpack_variables(view<> ux) const#
- void pack_dynamics (std::span< const value_type > λ_lin, mut_view<> λ) const
- matrix pack_dynamics (std::span< const value_type > λ_lin) const
- void unpack_dynamics (view<> λ, std::span< value_type > λ_lin) const
- std::vector< value_type > unpack_dynamics (view<> λ) const
-
void pack_constraints(std::span<const value_type> y_lin, mut_view<> y, value_type fill = 0) const#
-
matrix pack_constraints(std::span<const value_type> y_lin, value_type fill = 0) const#
-
void unpack_constraints(view<> y, std::span<value_type> y_lin) const#
-
std::vector<value_type> unpack_constraints(view<> y) const#
-
matrix initialize_variables() const#
Get a zero-initialized matrix for the primal variables u and x.
-
matrix initialize_dynamics_constraints() const#
Get a zero-initialized matrix for the dynamics constraints (or their multipliers).
-
matrix initialize_general_constraints() const#
Get a zero-initialized matrix for the general constraints (or their multipliers).
-
static CyqloneSolver build(const CyqloneStorage<value_type> &ocp, index_t p)#
Initialize a Cyqlone solver for the given OCP.
Note: constraints on u(0) and x(N) should be independent.
nx nu ocp.CD(0) = [ 0 | D ] ny₀ [ 0 | 0 ] ny - ny₀Since ocp.D(0) and ocp.C(N) will be merged, the top ny₀ rows of ocp.C(N) should be zero.
OCP cost gradient and constraints evaluation
-
void residual_dynamics_constr(Context &ctx, view<> x, view<> b, mut_view<> Mxb) const#
Compute Mx + b, where M is the dynamics constraint Jacobian matrix of the OCP.
In other words, evaluate the residuals for all stages, i.e., \( A_j x^j + B_j u^j - x^{j+1} + b^j \).
- void transposed_dynamics_constr (Context &ctx, view<> λ, mut_view<> Mᵀλ, bool accum=false) const
Compute Mᵀλ, where M is the dynamics constraint Jacobian matrix of the OCP.
Optionally add the result to the existing contents of Mᵀλ by setting
accumto true.
-
void general_constr(Context &ctx, view<> ux, mut_view<> DCux) const#
Compute the general constraints Gx, where G is the general constraint Jacobian matrix of the OCP.
In other words, evaluate the constraints for all stages, i.e., \( C_j x^j + D_j u^j \).
- void transposed_general_constr (Context &ctx, view<> y, mut_view<> DCᵀy) const
Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.
- void transposed_general_constr (view<> y, mut_view<> DCᵀy) const
Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.
- void cost_gradient (Context &ctx, view<> ux, value_type α, view<> q, value_type β, mut_view<> grad_f) const
Compute the cost gradient, with optional scaling factors.
grad_f ← Q ux + α q + β grad_f
- void cost_gradient_regularized (Context &ctx, value_type γ, view<> ux, view<> ux0, view<> q, mut_view<> grad_f) const
Compute the regularized cost gradient, with regularization parameter γ⁻¹, with respect to the point
ux0.
- void cost_gradient_remove_regularization (Context &ctx, value_type γ, view<> x, view<> x0, mut_view<> grad_f) const
Subtract the regularization term from the cost gradient.
Factorization and solve routines
- void factor_solve (Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
Compute the Cyqlone factorization of the KKT matrix of the OCP and perform a forward solve (fused for improved locality).
See also
- Parameters:
ctx – Parallel context.
γ – Reciprocal primal regularization.
Σ – ALM penalty factors.
ux – [inout] Negative augmented Lagrangian gradient on entry; solution of forward solve on exit.
λ – [inout] Constant term of the dynamics constraints on entry; solution of forward solve on exit. To obtain the solution of the KKT system, a reverse solve with the same factorization must be performed afterwards.
- void factor (Context &ctx, value_type γ, view<> Σ)
Compute the Cyqlone factorization of the KKT matrix of the OCP.
See also
- void solve_forward (Context &ctx, mut_view<> ux, mut_view<> λ)
Perform a forward solve with the Cyqlone factorization.
See also
- void solve_reverse (Context &ctx, mut_view<> ux, mut_view<> λ)
Perform a reverse solve with the Cyqlone factorization.
[Cyqlone solve CR serial]
- Parameters:
ctx – Parallel context.
ux – On entry, the result of the forward solve; on exit the solution of the primal variables of the KKT system.
λ – On entry, the result of the forward solve; on exit the solution of the dual variables corresponding to the dynamics constraints of the KKT system.
- void solve_reverse_mul (Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> Mᵀλ)
Fused variant of solve_reverse and transposed_dynamics_constr (for improved locality of the dynamics Jacobians).
- void update (Context &ctx, view<> ΔΣ)
Perform factorization updates of the Cyqlone factorization as described by Algorithm 4 in the paper.
[Cyqlone update]
- Parameters:
ctx – Parallel context.
ΔΣ – Changes to the ALM penalty factors Σ.
- void update_solve (Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Fused variant of update and solve_forward.
Low-level factorization and forward solve routines
- template<bool Factor = true, bool Solve = true> void factor_riccati_solve (Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
[Modified Riccati factorization and fused forward solve]
- template<bool Factor = true, bool Solve = true> void compute_schur (Context &ctx, mut_view<> ux, mut_view<> λ)
[Cyqlone compute Schur]
Low-level factorization and forward solve routines for parallel cyclic reduction
- template<bool Factor = true, bool Solve = true> void factor_solve_impl (Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
[Cyqlone factorization and fused forward solve]
Low-level reverse solve routines
- void solve_riccati_reverse (Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work, std::optional< mut_view<> > Mᵀλ) const
[Modified Riccati factorization and fused forward solve]
- void solve_reverse (Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work, std::optional< mut_view<> > Mᵀλ=std::nullopt) const
Low-level factorization update routines
- template<bool Solve = true> void update_riccati_solve (Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Update the modified Riccati factorization of a single block column as described by Algorithm 3 in the paper.
[Cyqlone update CR]
[Cyqlone update Riccati]
- inline void update_riccati (Context &ctx, view<> ΔΣ)
- template<bool Solve = true> void update_solve_impl (Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
[PCR update]
[Cyqlone update]
Build sparse representations for debugging and testing
- SparseMatrix build_sparse (const CyqloneStorage< value_type > &ocp, std::span< const value_type > Σ) const
-
std::vector<value_type> build_rhs(view<> rq, view<> b, value_type scale_rq = -1, value_type scale_b = -1) const#
- inline std::vector< value_type > build_sol (view<> ux, view<> λ) const
-
SparseMatrix build_sparse_factor() const#
-
SparseMatrix build_sparse_diag() const#
-
template<class T>
class NeumaierSum# - #include <cyqlone/neumaier.hpp>
Kahan-Babuška-Neumaier compensated summation.
Public Functions
-
inline NeumaierSum operator-() const#
-
inline NeumaierSum &operator+=(T v)#
-
inline NeumaierSum &operator+=(const NeumaierSum &other)#
-
inline NeumaierSum &operator-=(T v)#
-
inline NeumaierSum &operator-=(const NeumaierSum &other)#
Friends
-
inline friend NeumaierSum operator+(NeumaierSum lhs, T rhs)#
-
inline friend NeumaierSum operator+(T lhs, NeumaierSum rhs)#
-
inline friend NeumaierSum operator+(NeumaierSum lhs, NeumaierSum rhs)#
-
inline friend NeumaierSum operator-(NeumaierSum lhs, T rhs)#
-
inline friend NeumaierSum operator-(T lhs, NeumaierSum rhs)#
-
inline friend NeumaierSum operator-(NeumaierSum lhs, const NeumaierSum &rhs)#
-
inline friend NeumaierSum operator*(NeumaierSum lhs, NeumaierSum rhs)#
-
inline friend NeumaierSum operator*(NeumaierSum lhs, T rhs)#
-
inline friend NeumaierSum operator*(T lhs, NeumaierSum rhs)#
-
inline NeumaierSum operator-() const#
-
struct OCPDim#
- #include <cyqlone/ocp.hpp>
Dimensions of an optimal control problem.
-
struct LinearOCPStorage#
- #include <cyqlone/ocp.hpp>
Storage for a linear-quadratic OCP of the form.
ₙ₋₁ minimize ∑ [½ uᵢᵀ Rᵢ uᵢ + uᵢᵀ S xᵢ + ½ xᵢᵀ Qᵢ xᵢ + rᵢᵀuᵢ + qᵢᵀxᵢ] + ½ xₙᵀ Qₙ xₙ + qₙᵀ xₙ ⁱ⁼⁰ s.t. x₀ = b₀ xᵢ₊₁ = Aᵢ xᵢ + Bᵢ uᵢ + bᵢ₊₁ lᵢ ≤ Cᵢ xᵢ + Dᵢ uᵢ ≤ uᵢ lₙ ≤ Cₙ xₙ ≤ uₙPublic Functions
-
inline guanaqo::MatrixView<real_t, index_t> H(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> Q(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> R(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> S(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> S_trans(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> CD(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> C(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> D(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> AB(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> A(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> B(index_t i)#
-
inline guanaqo::MatrixView<const real_t, index_t> H(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> Q(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> R(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> S(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> S_trans(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> CD(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> C(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> D(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> AB(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> A(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> B(index_t i) const#
-
inline index_t num_variables() const#
-
inline index_t num_constraints() const#
-
inline index_t num_dynamics_constraints() const#
-
inline guanaqo::MatrixView<real_t, index_t> qr()#
-
inline guanaqo::MatrixView<real_t, index_t> qr(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> q(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> r(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> b()#
-
inline guanaqo::MatrixView<real_t, index_t> b(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> b_min()#
-
inline guanaqo::MatrixView<real_t, index_t> b_min(index_t i)#
-
inline guanaqo::MatrixView<real_t, index_t> b_max()#
-
inline guanaqo::MatrixView<real_t, index_t> b_max(index_t i)#
-
inline guanaqo::MatrixView<const real_t, index_t> qr() const#
-
inline guanaqo::MatrixView<const real_t, index_t> qr(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> q(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> r(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> b() const#
-
inline guanaqo::MatrixView<const real_t, index_t> b(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> b_min() const#
-
inline guanaqo::MatrixView<const real_t, index_t> b_min(index_t i) const#
-
inline guanaqo::MatrixView<const real_t, index_t> b_max() const#
-
inline guanaqo::MatrixView<const real_t, index_t> b_max(index_t i) const#
Public Members
-
std::vector<real_t> storage = create_storage(dim)#
Create a single contiguous storage for all problem data.
Storage layout: size offset N × [ Q Sᵀ] = H (nx+nu)² 0 [ S R ] 1 × [ Q ] nx² N (nx+nu)² N × [ C D ] ny(nx+nu) N (nx+nu)² + nx² 1 × [ C ] ny_N nx N (nx+nu+ny)(nx+nu) + nx² N × [ A B ] nx(nx+nu) N (nx+nu+ny)(nx+nu) + (nx+ny_N)nx N × [ q r ] nx+nu N (2nx+nu+ny)(nx+nu) + (nx+ny_N)nx 1 × [ q ] nx N (2nx+nu+ny+1)(nx+nu) + (nx+ny_N)nx N+1 × [ b ] nx N (2nx+nu+ny+1)(nx+nu) + (nx+ny_N+1)nx N × [ l ] ny N (2nx+nu+ny+1)(nx+nu) + (nx+ny_N+2+N)nx 1 × [ l ] ny_N N (2nx+nu+ny+1)(nx+nu) + (nx+ny_N+2+N)nx + N ny N × [ u ] ny N (2nx+nu+ny+1)(nx+nu) + (nx+ny_N+2+N)nx + N ny + ny_N 1 × [ u ] ny_N N (2nx+nu+ny+1)(nx+nu) + (nx+ny_N+2+N)nx + N (2ny) + ny_N N (2nx+nu+ny+1)(nx+nu) + (nx+ny_N+2+N)nx + N (2ny) + 2ny_N
-
struct Solution#
- #include <cyqlone/ocp.hpp>
Public Functions
- inline guanaqo::MatrixView< real_t, index_t > λ (OCPDim dim)
- inline guanaqo::MatrixView< const real_t, index_t > λ (OCPDim dim) const
- inline guanaqo::MatrixView< real_t, index_t > λ (OCPDim dim, index_t i)
- inline guanaqo::MatrixView< const real_t, index_t > λ (OCPDim dim, index_t i) const
-
struct KKTError#
- #include <cyqlone/ocp.hpp>
-
inline guanaqo::MatrixView<real_t, index_t> H(index_t i)#
-
template<class T, class simd>
struct norms# - #include <cyqlone/reduce.hpp>
Utilities for computing vector norms.
- Template Parameters:
T – Scalar type.
simd – SIMD type. Void for scalar-only.
Public Functions
-
inline result_simd operator()(result_simd accum, simd t) const#
Update the accumulator with a new value.
-
inline result operator()(result_simd accum) const#
Reduce the SIMD accumulator to a scalar result.
Public Static Functions
-
static inline result_simd zero_simd()#
-
struct result_simd#
- #include <cyqlone/reduce.hpp>
Lane-wise accumulators.
-
template<class T>
struct norms<T, void># - #include <cyqlone/reduce.hpp>
Public Functions
Public Static Functions
-
static inline result_simd zero_simd()#
-
static inline result_simd zero_simd()#
-
struct SparseMatrix#
- #include <cyqlone/sparse.hpp>
A sparse matrix in COO format.
Public Functions
-
inline auto iter_coo() const#
-
inline auto iter_coo() const#
-
struct SparseMatrixBuilder#
- #include <cyqlone/sparse.hpp>
A builder for constructing a SparseMatrix incrementally.
Public Functions
-
inline void add(index_t row, index_t col, real_t value)#
-
template<std::convertible_to<real_t> T, class I, class S, guanaqo::StorageOrder O>
inline void add(index_t row, index_t col, guanaqo::MatrixView<T, I, S, O> dense, std::remove_cvref_t<T> scale = 1, batmat::linalg::MatrixStructure structure = batmat::linalg::MatrixStructure::General)#
-
inline void add_diag(index_t row, index_t col, real_t value, index_t n)#
-
inline SparseMatrix build() const &#
-
inline SparseMatrix build() &&#
-
inline void add(index_t row, index_t col, real_t value)#
-
template<class T>
struct matio_traits#
-
template<>
struct matio_traits<float>#
-
template<>
struct matio_traits<double>#
-
template<index_t VL, class T, StorageOrder DefaultOrder>
struct PCRFactorTest# Public Types
-
template<StorageOrder O = DefaultOrder>
using bmatrix = batmat::matrix::Matrix<value_type, index_t, vl_t, index_t, O, align_t>#
-
template<StorageOrder O = DefaultOrder>
using matrix = batmat::matrix::Matrix<value_type, index_t, vl_t, vl_t, O, align_t>#
-
template<StorageOrder O = DefaultOrder>
using mut_view = matrix<O>::view_type#
-
template<StorageOrder O = DefaultOrder>
using view = matrix<O>::const_view_type#
Public Functions
- inline void solve_pcr (mut_view<> λ)
- inline void solve_pcr (mut_view<> λ, mut_view<> work_pcr) const
- template<index_t l> inline void solve_pcr_level (mut_view<> λ, mut_view<> work_pcr) const
Public Members
-
index_t n#
-
template<StorageOrder O = DefaultOrder>
- template<class class Index, class class StorageIndex> SparseCSC
-
Public Functions
-
length_t nnz() const#
-
length_t nnz() const#
- template<class class Index> SparseCOO
Public Types
-
enum Order
Values:
-
typedef Index index_t
Public Functions
-
length_t nnz() const
-
enum Order
-
namespace detail#
Functions
-
template<class T1, class I1, class S1, guanaqo::StorageOrder O1, class T2, class I2, class S2, guanaqo::StorageOrder O2>
void copy(guanaqo::MatrixView<T1, I1, S1, O1> src, guanaqo::MatrixView<T2, I2, S2, O2> dst)# Simple (inefficient) matrix copy that supports slices with non-unit strides.
-
template<class T0, class T1, class I1, class S1, guanaqo::StorageOrder O1, class T2, class I2, class S2, guanaqo::StorageOrder O2>
void scale(T0 scalar, guanaqo::MatrixView<T1, I1, S1, O1> src, guanaqo::MatrixView<T2, I2, S2, O2> dst)# Simple (inefficient) scaled matrix copy that supports slices with non-unit strides.
-
template<class T1, class I1, class S1, guanaqo::StorageOrder O1, class T2, class I2, class S2, guanaqo::StorageOrder O2>
-
namespace parallel#
-
template<class SC>
struct Context# - #include <cyqlone/parallel.hpp>
Thread context for parallel execution.
Each thread has a unique thread index, and can synchronize and communicate with other threads in the same shared context.
See also
Public Functions
-
inline bool is_master() const#
Check if this thread is the master thread (thread index 0).
Useful for determining which thread should perform operations like printing to the console, which should be done by a single thread and does not require synchronization.
-
inline arrival_token arrive()#
Arrive at the barrier and obtain a token that can be used to wait for completion of the current barrier phase.
Note
Token must be awaited before any other call to arrive.
-
inline void wait(arrival_token &&token)#
Await a token returned by arrive(), waiting for the barrier phase to complete.
-
inline void arrive_and_wait()#
Arrive at the barrier and wait for the barrier phase to complete.
This is a convenience wrapper around arrive() and wait() for the common case where the thread does not have other work to do while waiting.
-
inline void arrive_and_wait(int line)#
Debug version of arrive_and_wait() that performs a sanity check to ensure that all threads are arriving at the same line of code.
The
lineparameter should be the same for all threads arriving at the same barrier. It is only verified in debug builds, and is equivalent to arrive_and_wait() in release builds.
-
template<class T>
inline T broadcast(T x, index_t src = 0)# Broadcast a value
xfrom the thread with indexsrcto all threads.
-
template<class F, class ...Args>
inline std::invoke_result_t<F, Args...> call_broadcast(F &&f, Args&&... args)# Call a function
fwith the givenargson a single thread and broadcast the return value to all threads.
-
template<class T, class F>
inline auto arrive_reduce(T x, F func)# Perform a reduction of
xacross all threads using the given binary functionfunc.Returns a token that can be used to wait for the reduction to complete and obtain the reduced value.
Wait for the reduction initiated by arrive_reduce() to complete and obtain the reduced value.
-
template<class T, class F>
inline T reduce(T x, F func)# Perform a reduction of
xacross all threads using the given binary functionfunc, and wait for the result.
-
template<class T>
inline T reduce(T x)# Reduction with
std::plus, i.e., summation across all threads.See also
-
template<class F>
inline void run_single_sync(F &&f)# Wait for all threads to reach this point, then run the given function on a single thread before releasing all threads again.
Changes by all threads are visible during the call to
fand changes made byfare visible to all threads after this function returns.
-
inline bool is_master() const#
- #include <cyqlone/parallel.hpp>
Abstraction for a parallel execution context: a set of threads that can synchronize and communicate with each other using barriers.
See also
Public Types
Public Functions
Execute the given function in parallel on all threads, blocking until completion.
The function will be called with a Context that contains the thread index, and that can be used to synchronize and communicate between threads.
Configure the barrier spin count used in parallel synchronization before falling back to a futex wait.
Public Members
-
template<class SC>
-
namespace qpalm#
Typedefs
-
using ABSum_t = real_t#
Enums
-
enum class StorageOrder#
Values:
Functions
-
template<index_t VL, StorageOrder DefaultOrder>
unique_CyQPALMBackend<VL, DefaultOrder> make_cyqpalm_backend(const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)#
-
template<std::ranges::bidirectional_range R, class F, class C>
static std::ranges::subrange<std::ranges::iterator_t<R>> partition_min(R &&range, F pred, C cmp)# A variant of std::ranges::partition where the first element of the return value is the smallest element of the “false” partition.
-
template<std::permutable I, std::sentinel_for<I> S, class F>
static decltype(auto) partition(I first, S last, F key)#
-
template<class I, class T, class BinOp, class UnOp>
T transform_reduce(I first, I last, T init, BinOp binary_op, UnOp unary_op)#
- ABSums partial_sum_negative (PartitionedBreakpoints breakpoints, real_t η=0, real_t β=0)
- template<class Vec> std::span< Breakpoint > compute_breakpoints_default (std::vector< Breakpoint > &breakpoints, const Vec &Σ, const Vec &y, const Vec &Ad, const Vec &Ax, const Vec &b_min, const Vec &b_max)
Compute the break points t[i] using formula (3.6) in the QPALM paper.
- Returns:
t, α, δ
-
PartitionedBreakpoints partition_breakpoints_default(std::span<Breakpoint> breakpoints)#
Moves any non-finite elements in t to the end of the range, and all negative elements to the front.
Returns the negative and positive partitions.
-
inline const char *enum_name(SolverStatus s)#
-
std::ostream &operator<<(std::ostream &os, SolverStatus s)#
- template unique_CyQPALMBackend< 1, StorageOrder::ColMajor > make_cyqpalm_backend< 1, StorageOrder::ColMajor > (const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)
- template void update_cyqpalm_backend< 1, StorageOrder::ColMajor > (CyQPALMBackend< 1, StorageOrder::ColMajor > &, const CyqloneStorage<> &ocp)
- template void update_cyqpalm_backend< 1, StorageOrder::ColMajor > (CyQPALMBackend< 1, StorageOrder::ColMajor > &, const LinearOCPStorage &ocp)
- template unique_CyQPALMBackend< 4, StorageOrder::ColMajor > make_cyqpalm_backend< 4, StorageOrder::ColMajor > (const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)
- template void update_cyqpalm_backend< 4, StorageOrder::ColMajor > (CyQPALMBackend< 4, StorageOrder::ColMajor > &, const CyqloneStorage<> &ocp)
- template void update_cyqpalm_backend< 4, StorageOrder::ColMajor > (CyQPALMBackend< 4, StorageOrder::ColMajor > &, const LinearOCPStorage &ocp)
- template unique_CyQPALMBackend< 8, StorageOrder::ColMajor > make_cyqpalm_backend< 8, StorageOrder::ColMajor > (const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)
- template void update_cyqpalm_backend< 8, StorageOrder::ColMajor > (CyQPALMBackend< 8, StorageOrder::ColMajor > &, const CyqloneStorage<> &ocp)
- template void update_cyqpalm_backend< 8, StorageOrder::ColMajor > (CyQPALMBackend< 8, StorageOrder::ColMajor > &, const LinearOCPStorage &ocp)
- template unique_CyQPALMBackend< 1, StorageOrder::RowMajor > make_cyqpalm_backend< 1, StorageOrder::RowMajor > (const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)
- template void update_cyqpalm_backend< 1, StorageOrder::RowMajor > (CyQPALMBackend< 1, StorageOrder::RowMajor > &, const CyqloneStorage<> &ocp)
- template void update_cyqpalm_backend< 1, StorageOrder::RowMajor > (CyQPALMBackend< 1, StorageOrder::RowMajor > &, const LinearOCPStorage &ocp)
- template unique_CyQPALMBackend< 4, StorageOrder::RowMajor > make_cyqpalm_backend< 4, StorageOrder::RowMajor > (const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)
- template void update_cyqpalm_backend< 4, StorageOrder::RowMajor > (CyQPALMBackend< 4, StorageOrder::RowMajor > &, const CyqloneStorage<> &ocp)
- template void update_cyqpalm_backend< 4, StorageOrder::RowMajor > (CyQPALMBackend< 4, StorageOrder::RowMajor > &, const LinearOCPStorage &ocp)
- template unique_CyQPALMBackend< 8, StorageOrder::RowMajor > make_cyqpalm_backend< 8, StorageOrder::RowMajor > (const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)
- template void update_cyqpalm_backend< 8, StorageOrder::RowMajor > (CyQPALMBackend< 8, StorageOrder::RowMajor > &, const CyqloneStorage<> &ocp)
- template void update_cyqpalm_backend< 8, StorageOrder::RowMajor > (CyQPALMBackend< 8, StorageOrder::RowMajor > &, const LinearOCPStorage &ocp)
-
std::ostream &operator<<(std::ostream&, const SolverTimings&)#
Variables
-
struct cyqlone::qpalm::compute_breakpoints_fn compute_breakpoints#
-
struct cyqlone::qpalm::get_partitioned_breakpoints_fn get_partitioned_breakpoints#
-
struct cyqlone::qpalm::get_breakpoints_fn get_breakpoints#
-
struct CyqloneData#
- #include <cyqlone/qpalm/backends/ocp-backend-cyqlone.hpp>
-
struct CyQPALMBackendSettings#
- #include <cyqlone/qpalm/backends/ocp-backend-cyqlone.hpp>
Public Members
-
index_t processors = 8#
-
bool print_residuals = false#
-
int print_precision = 3#
-
double changing_constr_factor = 0.05#
-
index_t max_update_count = 5#
-
bool detailed_timings = false#
-
cyqlone::TricyqleParams<real_t> tricyqle_params = {}#
-
uint32_t spin_count = std::numeric_limits<uint32_t>::max()#
-
WarmStartingStrategy strategy = WarmStartingStrategy::Copy#
-
index_t processors = 8#
-
struct CyQPALMBackendStats#
- #include <cyqlone/qpalm/backends/ocp-backend-cyqlone.hpp>
-
template<index_t VL, StorageOrder DefaultOrder>
struct CyQPALMBackend# Line search
- BreakpointsResult compute_partition_breakpoints (Context &ctx, std::vector< Breakpoint > &breakpoints, const ineq_constr_vec_t &Σ, const ineq_constr_vec_t &y, const ineq_constr_vec_t &Ad, const ineq_constr_vec_t &Ax, const ineq_constr_vec_t &b_min, const ineq_constr_vec_t &b_max)
-
template<class T, size_t N>
static void merge_chunk(std::span<const T> chunk, size_t chunk_index, std::span<const std::array<size_t, N>> separators, std::span<T> out)#
- inline friend BreakpointsResult guanaqo_tag_invoke (guanaqo::tag_t< get_breakpoints >, CyQPALMBackend &backend, Context &ctx, std::vector< Breakpoint > &breakpoints, const ineq_constr_vec_t &Σ, const ineq_constr_vec_t &y, const ineq_constr_vec_t &Ad, const ineq_constr_vec_t &Ax, const ineq_constr_vec_t &b_min, const ineq_constr_vec_t &b_max)
Linear algebra operations (level 1 BLAS-like)
-
template<class T, class U>
void xaxpy(Context &ctx, real_t a, const T &x, U &y)# Compute y = a x + y.
-
template<class T, class U>
void set_constant(Context &ctx, T &x, const U &y) const# Set each element of x to the constant value y.
-
template<class ...Args>
void local_dots(std::span<real_t, 1 + sizeof...(Args) / 2> out, const auto &a, const auto &b, const Args&... others) const# Compute multiple partial dot products, without reducing across threads.
-
template<class ...Args>
std::array<real_t, sizeof...(Args) / 2> dots(Context &ctx, const Args&... args) const# Compute multiple dot products at once.
This is more efficient than computing them separately because only a single reduction across threads is needed.
Public Types
-
using OCP_t = cyqlone::CyqloneSolver<VL, real_t, DefaultOrder>#
-
using Stats = CyQPALMBackendStats#
Public Functions
-
inline CyQPALMBackend(const CyqloneStorage<> &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)#
-
inline void update_data(const CyqloneStorage<> &ocp)#
-
inline void set_b_eq(std::span<const real_t> b_eq)#
-
inline void set_b_lb(std::span<const real_t> b_lb)#
-
inline void set_b_ub(std::span<const real_t> b_ub)#
- inline void warm_start (const var_vec_t &x, const ineq_constr_vec_t &y, const eq_constr_vec_t &λ)
-
inline void reset()#
-
inline index_t num_var() const#
-
inline index_t num_eq_constr() const#
-
inline index_t num_ineq_constr() const#
-
inline eq_constr_vec_t eq_constr_vec() const#
-
inline ineq_constr_vec_t ineq_constr_vec() const#
-
inline active_set_t active_set() const#
- template<class... Λs> inline void initialize_eq_constr_vec (Λs &...λs) const
- inline void initial_multipliers_eq (Context &ctx, eq_constr_vec_t &λ) const
-
inline void initial_multipliers_ineq(Context &ctx, ineq_constr_vec_t &y) const#
-
void ineq_constr_resid(Context &ctx, const ineq_constr_vec_t &Ax, ineq_constr_vec_t &e) const#
-
void project_multipliers_ineq(Context &ctx, ineq_constr_vec_t &y) const#
-
real_t ineq_constr_viol(Context &ctx, const ineq_constr_vec_t &Ax) const#
- real_t ineq_constr_resid_al (Context &ctx, const ineq_constr_vec_t &y, const ineq_constr_vec_t &ŷ, const ineq_constr_vec_t &Σ, ineq_constr_vec_t &e)
-
inline void eq_constr_resid(Context &ctx, const var_vec_t &x, eq_constr_vec_t &Mxb)#
- inline void mat_vec_MT (Context &ctx, const eq_constr_vec_t &λ, var_vec_t &Mᵀλ)
-
inline real_t unscaled_eq_constr_viol(Context &ctx, const eq_constr_vec_t &Mxb) const#
- inline void mat_vec_AT (Context &ctx, const ineq_constr_vec_t &y, var_vec_t &Aᵀy)
- inline void mat_vec_AT (const ineq_constr_vec_t &y, var_vec_t &Aᵀy)
-
inline void mat_vec_A(Context &ctx, const var_vec_t &x, ineq_constr_vec_t &Ax)#
-
inline ineq_constr_vec_t mat_vec_A(Context &ctx, const var_vec_t &x)#
-
inline void grad_f_regularized(Context &ctx, real_t S, const var_vec_t &x, const var_vec_t &x_reg, var_vec_t &grad_f)#
-
inline void grad_f_remove_regularization(Context &ctx, real_t S, const var_vec_t &x, const var_vec_t &x_reg, var_vec_t &grad_f)#
- index_t update_penalty_y (Context &ctx, ineq_constr_vec_t &Σ, const ineq_constr_vec_t &e, const ineq_constr_vec_t &e_old, const PenaltySettings &settings)
Increase the penalty parameters Σ for which the violation e has not decreased sufficiently compared to e_old.
-
inline void update_regularization_changed(Context &ctx, real_t S_new, real_t S_old)#
Called when the primal regularization S has changed: causes the factorization to be reset.
-
inline real_t boost_regularization(Context &ctx, real_t S, real_t S_boost)#
Decrease the primal regularization S to S_boost, to boost convergence when close to the solution.
- inline void update_penalty_changed (Context &ctx, const ineq_constr_vec_t &Σ, index_t num_Σ_changed)
Called when the penalty parameters Σ have been updated: causes the factorization to be reset if any of the penalty parameters have changed.
-
inline const ineq_constr_vec_t &Ax_min() const#
-
inline const ineq_constr_vec_t &Ax_max() const#
- index_t calc_ŷ_Aᵀŷ (Context &ctx, const ineq_constr_vec_t &Ax, const ineq_constr_vec_t &Σ, const ineq_constr_vec_t &y, ineq_constr_vec_t &ŷ, var_vec_t &Aᵀŷ, active_set_t &J)
- inline real_t unscaled_aug_lagr_norm (Context &ctx, const var_vec_t &grad_f, const var_vec_t &Mᵀλ, const var_vec_t &Aᵀŷ) const
-
inline void scale_ineq_constr(std::span<const real_t> in, ineq_constr_vec_t &out) const#
-
inline void scale_eq_constr(std::span<const real_t> in, eq_constr_vec_t &out) const#
-
inline void unscale_ineq_constr(const ineq_constr_vec_t &in, std::span<real_t> out) const#
-
inline void unscale_ineq_constr(const active_set_t &in, std::span<real_t> out) const#
-
inline void unscale_eq_constr(const eq_constr_vec_t &in, std::span<real_t> out) const#
- inline index_t active_set_change (Context &ctx, real_t, const ineq_constr_vec_t &Σ, const active_set_t &J, const active_set_t &J_old)
- inline void recompute_inner (Context &ctx, real_t S, const var_vec_t &x_outer, const var_vec_t &x, const eq_constr_vec_t &λ, var_vec_t &grad, ineq_constr_vec_t &Ax, var_vec_t &Mᵀλ)
- inline real_t recompute_outer (Context &ctx, const var_vec_t &x, const var_vec_t &Aᵀŷ, const eq_constr_vec_t &λ, var_vec_t &grad, ineq_constr_vec_t &Ax, var_vec_t &Mᵀλ)
- void print_solve_rhs_norms (Context &ctx, const var_vec_t &d, const eq_constr_vec_t &Δλ, const var_vec_t &grad, const var_vec_t &Mᵀλ, const var_vec_t &Aᵀŷ) const
- void print_solve_resid_norms (Context &ctx, const var_vec_t &x, const var_vec_t &d, const var_vec_t &grad, const var_vec_t &ξ, const var_vec_t &Mᵀλ, const var_vec_t &Aᵀŷ, const var_vec_t &MᵀΔλ, const ineq_constr_vec_t &Ad, const active_set_t &J)
- void solve (Context &ctx, const var_vec_t &x, const var_vec_t &grad, const var_vec_t &Mᵀλ, const var_vec_t &Aᵀŷ, const eq_constr_vec_t &Mxb, real_t S, const ineq_constr_vec_t &Σ, const active_set_t &J, var_vec_t &d, var_vec_t &ξ, ineq_constr_vec_t &Ad, eq_constr_vec_t &Δλ, var_vec_t &MᵀΔλ)
Public Members
-
CyQPALMBackendSettings settings#
-
ineq_constr_vec_t b_min_strided#
-
ineq_constr_vec_t b_max_strided#
-
eq_constr_vec_t b_eq_strided#
- ineq_constr_vec_t ΔΣ
-
std::optional<ineq_constr_vec_t> y0#
- std::optional< eq_constr_vec_t > λ0
-
std::vector<std::array<size_t, 4>> thread_indices#
-
std::vector<Breakpoint> breakpoints_temp#
-
bool reset_factorization = true#
-
bool update_pending = false#
-
index_t num_updates = 0#
-
eq_constr_vec_t temp_eq#
-
ineq_constr_vec_t temp_ineq#
-
struct var_vec_t : public storage_t#
Public Functions
-
var_vec_t() = default#
Public Members
- friend CyQPALMBackend
-
var_vec_t() = default#
-
struct eq_constr_vec_t : public storage_t#
Public Functions
-
eq_constr_vec_t() = default#
Public Members
- friend CyQPALMBackend
-
eq_constr_vec_t() = default#
-
struct ineq_constr_vec_t : public storage_t#
Public Functions
-
ineq_constr_vec_t() = default#
Public Members
- friend CyQPALMBackend
-
ineq_constr_vec_t() = default#
-
struct active_set_t : public storage_t#
Public Functions
-
active_set_t() = default#
Public Members
- friend CyQPALMBackend
-
active_set_t() = default#
-
struct Timings#
-
Public Members
-
struct PenaltySettings#
-
template<index_t VL, StorageOrder DefaultOrder = StorageOrder::ColMajor>
struct unique_CyQPALMBackend : public std::unique_ptr<CyQPALMBackend<VL, StorageOrder::ColMajor>># - #include <cyqlone/qpalm/backends/ocp-backend-cyqlone.hpp>
Public Functions
-
unique_CyQPALMBackend() = default#
-
unique_CyQPALMBackend(unique_CyQPALMBackend&&) noexcept = default#
-
unique_CyQPALMBackend &operator=(unique_CyQPALMBackend&&) noexcept = default#
-
~unique_CyQPALMBackend()#
-
inline unique_CyQPALMBackend(std::unique_ptr<CyQPALMBackend<VL, DefaultOrder>> &&o) noexcept#
-
T *operator->()#
STL member.
Public Members
-
T ptr#
STL member.
-
unique_CyQPALMBackend() = default#
-
struct DetailedStats#
- #include <cyqlone/qpalm/detailed-stats.hpp>
Public Types
-
struct Entry#
- #include <cyqlone/qpalm/detailed-stats.hpp>
-
struct Entry#
-
struct Breakpoint#
- #include <cyqlone/qpalm/implementation/breakpoint.hpp>
Public Functions
- inline real_t α () const
-
struct ABSums#
- #include <cyqlone/qpalm/implementation/breakpoint.hpp>
-
struct PartitionedBreakpoints#
- #include <cyqlone/qpalm/implementation/breakpoint.hpp>
-
struct BreakpointsResult#
- #include <cyqlone/qpalm/implementation/breakpoint.hpp>
-
struct compute_breakpoints_fn#
- #include <cyqlone/qpalm/implementation/breakpoint.hpp>
Public Types
Public Functions
- template<class Backend> inline auto operator() (Backend &backend, typename Backend::Context &ctx, std::vector< Breakpoint > &breakpoints, const vec_t< Backend > &Σ, const vec_t< Backend > &y, const vec_t< Backend > &Ad, const vec_t< Backend > &Ax, const vec_t< Backend > &b_min, const vec_t< Backend > &b_max) const
- template<class Backend> inline auto operator() (Backend &, typename Backend::Context &ctx, std::vector< Breakpoint > &breakpoints, const vec_t< Backend > &Σ, const vec_t< Backend > &y, const vec_t< Backend > &Ad, const vec_t< Backend > &Ax, const vec_t< Backend > &b_min, const vec_t< Backend > &b_max) const
-
struct get_partitioned_breakpoints_fn#
- #include <cyqlone/qpalm/implementation/breakpoint.hpp>
Public Types
Public Functions
- template<class Backend> inline PartitionedBreakpoints operator() (Backend &backend, typename Backend::Context &ctx, std::vector< Breakpoint > &breakpoints, const vec_t< Backend > &Σ, const vec_t< Backend > &y, const vec_t< Backend > &Ad, const vec_t< Backend > &Ax, const vec_t< Backend > &b_min, const vec_t< Backend > &b_max) const
- template<class Backend> inline PartitionedBreakpoints operator() (Backend &backend, typename Backend::Context &ctx, std::vector< Breakpoint > &breakpoints, const vec_t< Backend > &Σ, const vec_t< Backend > &y, const vec_t< Backend > &Ad, const vec_t< Backend > &Ax, const vec_t< Backend > &b_min, const vec_t< Backend > &b_max) const
-
struct get_breakpoints_fn#
- #include <cyqlone/qpalm/implementation/breakpoint.hpp>
Public Types
Public Functions
- template<class Backend> inline auto operator() (Backend &backend, typename Backend::Context &ctx, std::vector< Breakpoint > &breakpoints, const vec_t< Backend > &Σ, const vec_t< Backend > &y, const vec_t< Backend > &Ad, const vec_t< Backend > &Ax, const vec_t< Backend > &b_min, const vec_t< Backend > &b_max) const
- template<class Backend> inline BreakpointsResult operator() (Backend &backend, typename Backend::Context &ctx, std::vector< Breakpoint > &breakpoints, const vec_t< Backend > &Σ, const vec_t< Backend > &y, const vec_t< Backend > &Ad, const vec_t< Backend > &Ax, const vec_t< Backend > &b_min, const vec_t< Backend > &b_max) const
-
template<class Vec>
struct LineSearch# -
Public Functions
- Result operator() (auto &ctx, auto &backend, real_t η, real_t β, const vec_t &Σ, const vec_t &y, const vec_t &Ad, const vec_t &Ax, const vec_t &b_min, const vec_t &b_max)
Perform an exact line search on the augmented Lagrangian.
Implements Algorithm 2 in the QPALM paper.
- Parameters:
η – \( \eta = \inprod{d}{\xi} \)
β – \( \beta = \inprod{d}{\grad\tilde f_k(x^{k,\nu})} \)
Σ – Penalty factor \( \Sigma_k \) (diagonal)
y – Lagrange multipliers \( y^k \)
Ad – Matrix-vector product \( A d \)
Ax – Matrix-vector product \( A x^{k,\nu} \)
b_min – Constraint lower bound \( b_\mathrm{min} \)
b_max – Constraint upper bound \( b_\mathrm{max} \)
- Returns:
τ Optimal step size \( \tau_\star \)
Public Static Functions
-
static Result find_stepsize_base(ABSum_t a, ABSum_t b, size_t i0, std::span<Breakpoint> pos_bp)#
-
static Result find_stepsize(ABSum_t a, ABSum_t b, size_t i0, std::span<Breakpoint> pos_bp, bool partition_1 = true)#
-
struct Result#
-
template<class Backend>
struct SolverImplementation# Public Types
-
using active_set_t = typename backend_type::active_set_t#
-
using ineq_vec_t = typename backend_type::ineq_constr_vec_t#
-
using eq_vec_t = typename backend_type::eq_constr_vec_t#
-
using var_vec_t = typename backend_type::var_vec_t#
Public Functions
-
inline void ensure_storage(backend_type &backend)#
-
SolverStatus do_main_loop(Backend::Context &ctx, backend_type &backend, const Settings &settings, guanaqo::AtomicStopSignal &stop_signal, SolverStats &stats)#
Public Members
-
LineSearch<ineq_vec_t> linesearch#
-
active_set_t active_set#
-
active_set_t active_set_old#
- ineq_vec_t Σ
- ineq_vec_t ŷ
-
ineq_vec_t e_old#
-
ineq_vec_t Ax#
-
ineq_vec_t Ad#
- eq_vec_t Δλ
- eq_vec_t λ
- var_vec_t Mᵀλ
- var_vec_t Aᵀŷ
- var_vec_t MᵀΔλ
- var_vec_t ξ
Public Static Functions
- static inline void initialize_penalty_y (Backend::Context &ctx, backend_type &backend, real_t f0, const ineq_vec_t &e0, ineq_vec_t &Σ, const Settings &settings)
- static inline index_t update_penalty_y (Backend::Context &ctx, backend_type &backend, ineq_vec_t &Σ, const ineq_vec_t &e, const ineq_vec_t &e_old, const Settings &settings)
-
using active_set_t = typename backend_type::active_set_t#
-
struct Settings#
- #include <cyqlone/qpalm/settings.hpp>
Public Functions
Public Members
-
unsigned max_outer_iter = 100#
Maximum number of (total) iterations.
-
unsigned max_inner_iter = 100#
-
unsigned max_total_inner_iter = 10000#
-
std::chrono::microseconds max_time = 5min#
-
real_t tolerance = real_t(1e-8)#
Primal tolerance.
-
real_t dual_tolerance = real_t(1e-8)#
-
real_t eq_constr_tolerance = real_t(1e-10)#
-
real_t initial_inner_tolerance = real_t(1)#
- real_t ρ = real_t(1e-1)
- real_t θ = 0.25
- real_t Δy = 100
- real_t Δy_always = 1
-
real_t max_penalty_y = 1e9#
-
real_t initial_penalty_y = 20#
-
bool scale_initial_penalty_y = false#
- real_t Δx = 10
-
real_t max_penalty_x = 1e7#
-
real_t boost_penalty_x = 1e12#
-
real_t initial_penalty_x = 1e7#
-
bool proximal = true#
-
bool recompute_eq_res = true#
-
bool recompute_inner = false#
-
bool recompute = true#
-
bool verbose = true#
-
int print_precision = 3#
-
unsigned max_no_changes_active_set = 5#
-
bool linesearch_include_multipliers = true#
-
bool force_linesearch_if_no_set_change = true#
-
bool force_linesearch_if_dir_deriv_pos = false#
-
bool detailed_stats = false#
-
bool scale_newton_step = false#
-
bool print_directional_deriv = false#
-
bool print_linesearch_inputs = false#
-
unsigned max_outer_iter = 100#
-
struct SolverTimings#
- #include <cyqlone/qpalm/solver.hpp>
Public Members
-
DefaultTimings total#
-
DefaultTimings scaling#
-
DefaultTimings line_search#
-
DefaultTimings recompute_inner#
-
DefaultTimings recompute_outer#
-
DefaultTimings mat_vec_M#
-
DefaultTimings mat_vec_MT#
-
DefaultTimings mat_vec_A#
-
DefaultTimings mat_vec_AT#
-
DefaultTimings mat_vec_Q#
-
DefaultTimings active_set_change#
-
DefaultTimings update_penalty#
-
DefaultTimings update_regularization#
-
DefaultTimings boost_regularization#
-
DefaultTimings solve#
-
std::map<std::string, DefaultTimings> backend#
-
std::ostream &operator<<(std::ostream&, const SolverTimings&)#
-
DefaultTimings total#
-
struct SolverStats#
- #include <cyqlone/qpalm/solver.hpp>
Public Members
-
unsigned inner_iter = 0#
-
unsigned outer_iter = 0#
-
real_t stationarity = std::numeric_limits<real_t>::quiet_NaN()#
-
real_t primal_residual_norm = std::numeric_limits<real_t>::quiet_NaN()#
-
real_t max_penalty = std::numeric_limits<real_t>::quiet_NaN()#
-
SolverTimings timings = {}#
-
std::optional<DetailedStats> detail = std::nullopt#
-
unsigned inner_iter = 0#
-
template<class Backend>
class Solver# - #include <cyqlone/qpalm/solver.hpp>
Public Types
-
using BackendStats = typename detail::backend_stats_type<backend_type>::type#
Public Functions
-
inline SolverStatus operator()()#
-
index_t get_num_variables() const#
-
index_t get_num_equality_constraints() const#
-
index_t get_num_inequality_constraints() const#
-
bool has_result() const#
-
void get_solution(std::span<real_t>) const#
-
std::vector<real_t> get_solution() const#
-
void get_equality_multipliers(std::span<real_t>) const#
-
std::vector<real_t> get_equality_multipliers() const#
-
void get_equality_constraints(std::span<real_t>) const#
-
std::vector<real_t> get_equality_constraints() const#
-
void get_inequality_multipliers(std::span<real_t>) const#
-
std::vector<real_t> get_inequality_multipliers() const#
-
void get_inequality_constraints(std::span<real_t>) const#
-
std::vector<real_t> get_inequality_constraints() const#
-
void get_penalty_factors(std::span<real_t>) const#
-
std::vector<real_t> get_penalty_factors() const#
-
void warm_start_solution()#
- void set_initial_guess (std::span< const real_t > x, std::span< const real_t > y, std::span< const real_t > λ)
- bool get_initial_guess (std::span< real_t > x, std::span< real_t > y, std::span< real_t > λ)
-
void set_b_eq(std::span<const real_t> b_eq)#
-
void set_b_lb(std::span<const real_t> b_lb)#
-
void set_b_ub(std::span<const real_t> b_ub)#
-
inline void stop()#
-
~Solver()#
Public Members
-
std::unique_ptr<SolverImplementation<backend_type>> impl = {}#
-
std::optional<SolverStats> stats = std::nullopt#
-
std::optional<BackendStats> stats_backend = std::nullopt#
-
guanaqo::AtomicStopSignal stop_signal = {}#
-
using BackendStats = typename detail::backend_stats_type<backend_type>::type#
-
namespace problems#
Typedefs
-
typedef Eigen::MatrixX<real_t> eigen_mat#
Functions
-
LinearOCPStorage load_from_csv(const fs::path &folder, const std::string &name)#
-
PlatooningProblem platooning(PlatooningParams p)#
-
SpringMassProblem spring_mass(SpringMassParams p)#
-
struct PlatooningParams#
- #include <cyqlone/qpalm/example-problems/platooning.hpp>
-
struct PlatooningProblem#
- #include <cyqlone/qpalm/example-problems/platooning.hpp>
-
struct SpringMassParams#
- #include <cyqlone/qpalm/example-problems/spring-mass.hpp>
Public Types
Public Members
-
real_t friction = 0#
friction coefficient
-
real_t k_spring = 1#
spring constant between masses
-
real_t F_max = 0.5#
maximum actuator force
-
real_t p_max = 4#
maximum displacement of each mass
-
real_t v_max = 0#
maximum velocity of each mass (zero = no limit)
-
real_t width = 1#
width of the setup (distance between the walls)
-
index_t N_horiz = 32#
number of discretization steps
-
real_t T_horiz = 15#
time horizon
-
real_t q_vel = 1#
scaling factor for cost terms for the velocity
-
real_t q_pos = 1#
scaling factor for cost terms for the position
-
real_t r_act = 1#
scaling factor for cost terms for the actuators
-
std::vector<real_t> masses = {1, 1, 1, 1, 1, 1}#
-
enum cyqlone::qpalm::problems::SpringMassParams::ActuatorPlacement actuator_placement = IndividualActuators#
-
uint64_t seed = 0#
random seed for actuator placement
Public Static Functions
-
static inline SpringMassParams wang_boyd_2008(index_t n_masses, index_t N_horiz = 30, uint64_t seed = 0)#
-
static inline SpringMassParams wang_boyd_2008_width(index_t n_masses, index_t N_horiz = 30, uint64_t seed = 0, double steady_state_spring_length = 0.1)#
-
static inline SpringMassParams domahidi_2012(index_t n_masses, index_t N_horiz, uint64_t seed = 0)#
-
static inline SpringMassParams active_state_constr(index_t n_masses = 18, index_t N_horiz = 256, uint64_t seed = 0)#
-
real_t friction = 0#
-
struct SpringMassProblem#
- #include <cyqlone/qpalm/example-problems/spring-mass.hpp>
-
typedef Eigen::MatrixX<real_t> eigen_mat#
-
namespace detail#
Typedefs
-
template<class T>
using backend_type_t = typename backend_type<T>::type#
-
template<index_t VL, StorageOrder DefaultOrder>
struct backend_stats_type<CyQPALMBackend<VL, DefaultOrder>># - #include <cyqlone/qpalm/backends/ocp-backend-cyqlone.hpp>
Public Types
-
using type = CyQPALMBackendStats#
-
using type = CyQPALMBackendStats#
-
template<index_t VL, StorageOrder DefaultOrder>
struct backend_type<unique_CyQPALMBackend<VL, DefaultOrder>># - #include <cyqlone/qpalm/backends/ocp-backend-cyqlone.hpp>
Public Types
-
using type = CyQPALMBackend<VL, DefaultOrder>#
-
using type = CyQPALMBackend<VL, DefaultOrder>#
-
template<class T>
struct backend_type#
-
template<class T, class D>
struct backend_type<std::unique_ptr<T, D>># - #include <cyqlone/qpalm/solver.hpp>
-
template<class T>
struct backend_stats_type#
-
template<class T>
-
using ABSum_t = real_t#
-
using DefaultTimings = batmat::DefaultTimings#