develop/Doxygen/cyqlone_8hpp_source.html

#pragma once


/// @file

/// The main header for the Cyqlone and Tricyqle linear solvers.

/// @ingroup topic-lin-solvers


#include <cyqlone/config.hpp>

#include <cyqlone/cyqlone-params.hpp>

#include <cyqlone/cyqlone-storage.hpp>

#include <cyqlone/parallel.hpp>

#include <cyqlone/sparse.hpp>

#include <cyqlone/timing.hpp>

#include <batmat/assume.hpp>

#include <batmat/config.hpp>

#include <batmat/linalg/hyhound.hpp> // TODO: isolate size functions

#include <batmat/matrix/layout.hpp>

#include <batmat/matrix/matrix.hpp>

#include <batmat/openmp.h>

#include <batmat/simd.hpp>

#include <batmat/unroll.h>

#include <guanaqo/trace.hpp>


#include <algorithm>
#include <bit>
#include <cassert>
#include <memory>
#include <type_traits>
#include <utility>
#include <vector>


namespace CYQLONE_NS(cyqlone) {


using batmat::matrix::StorageOrder;


/// Check whether @p n is an exact power of two.
/// @pre `n > 0` (communicated to the optimizer through `BATMAT_ASSUME`).
[[nodiscard]] constexpr bool is_pow_2(index_t n) {
    BATMAT_ASSUME(n > 0);
    // The <bit> functions only accept unsigned types.
    using unsigned_index_t = std::make_unsigned_t<index_t>;
    return std::has_single_bit(static_cast<unsigned_index_t>(n));
}


/// Base-2 logarithm of @p n, rounded up: the smallest `k` such that `2^k >= n`.
/// @pre `n > 0` (communicated to the optimizer through `BATMAT_ASSUME`).
[[nodiscard]] constexpr index_t ceil_log2(index_t n) {
    BATMAT_ASSUME(n > 0);
    // The <bit> functions only accept unsigned types.
    using unsigned_index_t = std::make_unsigned_t<index_t>;
    // The number of bits needed to represent n - 1 equals ⌈log₂(n)⌉ (zero for n = 1).
    const auto n_minus_one = static_cast<unsigned_index_t>(n) - 1;
    return static_cast<index_t>(std::bit_width(n_minus_one));
}


// TODO: replace by ν2


/// Number of trailing zero bits of @p i, i.e. the largest `k` such that `2^k` divides @p i.
/// @pre `i > 0` (communicated to the optimizer through `BATMAT_ASSUME`).
[[nodiscard]] constexpr index_t get_level(index_t i) {
    BATMAT_ASSUME(i > 0);
    // The <bit> functions only accept unsigned types.
    using unsigned_index_t = std::make_unsigned_t<index_t>;
    const auto trailing_zeros = std::countr_zero(static_cast<unsigned_index_t>(i));
    return static_cast<index_t>(trailing_zeros);
}


// TODO: move to indexing.tpp or data.tpp?


/// Strip the trailing zero bits of @p i together with the lowest set bit, i.e. return
/// `i >> (get_level(i) + 1)`. Returns 0 for `i == 0`.
[[nodiscard]] constexpr index_t get_index_in_level(index_t i) {
    // get_level requires a strictly positive argument, so zero is handled separately.
    return i == 0 ? index_t{0} : i >> (get_level(i) + 1);
}


/// Solver for block-tridiagonal systems using cyclic reduction (CR), parallel cyclic reduction

/// (PCR), and preconditioned conjugate gradient (PCG) methods.

/// @tparam VL              Vector length.

/// @tparam T               Scalar type.

/// @tparam DefaultOrder    Storage order for the matrix workspaces (row/column major).
/// @tparam Ctx             Parallel execution context type, see @ref parallel::Context.

/// @ingroup topic-block-tridiag-solvers

template <index_t VL = 4, class T = real_t, StorageOrder DefaultOrder = StorageOrder::ColMajor,

          class Ctx = parallel::Context<>>


struct TricyqleSolver {

    using value_type    = T;

    using Params        = TricyqleParams<value_type>;

    using Context       = Ctx;

    using SharedContext = typename Context::shared_context_type;


    /// @name Problem dimensions

    /// @{


    const index_t block_size;   ///< Block size of the block-tridiagonal system.

    const index_t max_rank = 0; ///< Maximum update rank.


    /// Whether the block-tridiagonal system is circular (nonzero top-right & bottom-left corners).

    bool circular = false;


    /// @}


    /// @name Solver parameters

    /// @{


    /// Solver parameters for Tricyqle-specific settings.

    Params params{};


    /// Return a copy of the solver parameters currently in use.
    [[nodiscard]] Params get_params() const {
        // Returned by value: the caller's copy is independent of later update_params calls.
        return this->params;
    }


    /// Update the solver parameters.

    void update_params(const Params &new_params) { params = new_params; }


    /// @}


    /// @name Parallelization and vectorization

    /// @{


    /// Number of processors/threads.

    const index_t p = 8;

    /// Vector length.

    static constexpr index_t v = VL;

    /// Base-2 logarithm of the number of processors/threads @ref p, rounded up.
    [[nodiscard]] constexpr index_t lp() const {
        const index_t num_threads = p;
        return ceil_log2(num_threads);
    }

    /// The number of processors @ref p rounded up to the next power of two, i.e. `2^lp()`.
    [[nodiscard]] constexpr index_t ceil_p() const {
        // Shift a one of type index_t instead of an `int` literal, so that the shift is
        // evaluated in the width of the return type rather than in `int`.
        return index_t{1} << lp();
    }

    /// The number of parallel execution units, computed as `2^(lp() + lv())`: the product of
    /// @ref p and @ref v after rounding each of them up to a power of two. When @ref v is a power
    /// of two, this equals `P = p * v` rounded up to the next power of two.
    [[nodiscard]] constexpr index_t ceil_P() const {
        // Shift a one of type index_t instead of an `int` literal, so that the shift is
        // evaluated in the width of the return type rather than in `int`.
        return index_t{1} << (lp() + lv());
    }

    /// Base-2 logarithm of the vector length @ref v (rounded up if @ref v is not a power of two).
    [[nodiscard]] static constexpr index_t lv() {
        constexpr index_t vector_length = v;
        return ceil_log2(vector_length);
    }


    /// Represents a SIMD vector of width @ref v storing values of type @ref value_type.

    using simd = batmat::datapar::deduced_simd<value_type, v>;

    /// Integral constant type for the vector length.

    using vl_t = std::integral_constant<index_t, v>;

    /// Integral constant type for the alignment of the batched matrix data structures.

    using align_t = std::integral_constant<index_t, v * alignof(value_type)>;


    /// Create a new parallel execution context, storing synchronization primitives and shared data
    /// for the parallel algorithms.
    /// @return Owning pointer to a newly constructed @ref SharedContext, which is constructed from
    ///         the number of threads @ref p. Discarding the result destroys the context
    ///         immediately, hence the `[[nodiscard]]`.
    [[nodiscard]] std::unique_ptr<SharedContext> create_parallel_context() const {
        return std::make_unique<SharedContext>(p);
    }


    /// @}


    /// @name Indexing utilities

    /// @{


    /// 2-adic valuation ν₂.

    [[nodiscard]] index_t ν2(index_t i) const;

    /// 2-adic valuation modulo p, i.e. `ν2p(0) = ν2p(p) = lp()`.

    [[nodiscard]] index_t ν2p(index_t i) const;

    /// Add @p b to @p a modulo @ref ceil_p().

    [[nodiscard]] index_t add_wrap_ceil_p(index_t a, index_t b) const;

    /// Subtract @p b from @p a modulo @ref ceil_p().

    [[nodiscard]] index_t sub_wrap_ceil_p(index_t a, index_t b) const;


    /// @}


    /// @name Matrix data structures

    /// @{


    /// Default storage order for most matrices.

    static constexpr auto default_order = DefaultOrder;

    /// Column-major storage order for column vectors and update matrices.

    static constexpr auto column_major = StorageOrder::ColMajor;


    /// Owning type for a batch of matrices (with batch size v).

    template <StorageOrder O = column_major>

    using matrix = batmat::matrix::Matrix<value_type, index_t, vl_t, index_t, O, align_t>;

    /// Non-owning immutable view type for @ref matrix.

    template <StorageOrder O = column_major>

    using view = batmat::matrix::View<const value_type, index_t, vl_t, index_t, index_t, O>;

    /// Non-owning mutable view type for @ref matrix.

    template <StorageOrder O = column_major>

    using mut_view     = batmat::matrix::View<value_type, index_t, vl_t, index_t, index_t, O>;

    using layer_stride = batmat::matrix::DefaultStride;

    /// Non-owning immutable view type for a single batch of v matrices.

    template <StorageOrder O = column_major>

    using batch_view = batmat::matrix::View<const value_type, index_t, vl_t, vl_t, layer_stride, O>;

    /// Non-owning mutable view type for a single batch of v matrices.

    template <StorageOrder O = column_major>

    using mut_batch_view = batmat::matrix::View<value_type, index_t, vl_t, vl_t, layer_stride, O>;


    /// @}


    /// @name Factorization and solve routines

    /// @{


    /// Initialize the diagonal blocks M of the block tridiagonal system using a user-provided
    /// function. The function is called with an index `i` in `[0, p)` and a mutable view to the
    /// compact workspace of depth `v` where the diagonal blocks `M(i+l*p)` for `l` in `[0, v)`
    /// should be stored. Copying non-compact data to this workspace can be achieved using the
    /// @ref cyqlone::linalg::pack function.
    /// @param ctx   Execution context of the calling thread; `ctx.index` selects the batch.
    /// @param func  Callable invoked as `func(ctx.index, batch)`.
    /// @return Whatever @p func returns (passed through via `decltype(auto)`).
    decltype(auto) init_diag(Context &ctx, auto &&func) {
        // The callback writes directly into this thread's batch of the cr_L workspace.
        return func(ctx.index, cr_L.batch(ctx.index));
    }


    /// Initialize the subdiagonal blocks K of the block tridiagonal system using a user-provided
    /// function. The function is called with an index `i` in `[0, p)` and a mutable view to the
    /// compact workspace of depth `v` where the subdiagonal blocks `K(i+l*p)` for `l` in `[0, v)`
    /// should be stored. Copying non-compact data to this workspace can be achieved using the
    /// @ref cyqlone::linalg::pack function.
    /// @param ctx   Execution context of the calling thread; `ctx.index` selects the batch.
    /// @param func  Callable invoked as `func(ctx.index, batch)`. It must be callable with both
    ///              view types used below, and both calls must yield compatible result types,
    ///              since they are the two branches of a single conditional expression.
    /// @return Whatever @p func returns (passed through via `decltype(auto)`).
    decltype(auto) init_subdiag(Context &ctx, auto &&func) {
        // Single-threaded case or odd thread index: K is stored as-is in this thread's batch of
        // cr_Y. Even thread index (with p > 1): K is stored through a transposed view of the cr_U
        // batch of the next index, so that cr_U.batch(i + 1) ends up holding Kᵀ.
        // NOTE(review): for an even `ctx.index == p - 1` (odd p > 1), this addresses
        // `cr_U.batch(p)`, while cr_U is allocated with depth `p * v` — confirm that this case
        // cannot occur or is handled by the batch indexing.
        return (p == 1 || ctx.index & 1) ? func(ctx.index, cr_Y.batch(ctx.index))
                                         : func(ctx.index, cr_U.batch(ctx.index + 1).transposed());
    }


    /// Initialize the right-hand side of the linear system using a user-provided function. The
    /// function is called with an index `i` in `[0, p)` and a mutable view of the batch of @p b
    /// of depth `v` where the right-hand side blocks `b(i+l*p)` for `l` in `[0, v)` should be
    /// stored. Copying non-compact data to this batch can be achieved using the
    /// @ref cyqlone::linalg::pack function.
    /// @param ctx   Execution context of the calling thread; `ctx.index` selects the batch.
    /// @param b     Caller-provided storage for the right-hand side (no solver workspace is used).
    /// @param func  Callable invoked as `func(ctx.index, batch)`.
    /// @return Whatever @p func returns (passed through via `decltype(auto)`).
    decltype(auto) init_rhs(Context &ctx, mut_view<> b, auto &&func) const {
        // Only the batch of b that belongs to the calling thread is handed to the callback.
        return func(ctx.index, b.batch(ctx.index));
    }


    /// Get access to the solution computed by this thread using a user-provided function. The
    /// function is called with an index `i` in `[0, p)` and a view to the compact batch of @p λ
    /// of depth `v` where the solution blocks `λ(i+l*p)` for `l` in `[0, v)` are stored. Copying
    /// this batch to a non-compact layout can be achieved using the @ref cyqlone::linalg::unpack
    /// function.
    /// @param ctx   Execution context of the calling thread; `ctx.index` selects the batch.
    /// @param λ     Immutable view of the solution storage.
    /// @param func  Callable invoked as `func(ctx.index, batch)`.
    /// @return Whatever @p func returns (passed through via `decltype(auto)`).
    decltype(auto) get_solution(Context &ctx, view<> λ, auto &&func) const {
        // Only the batch of λ that belongs to the calling thread is handed to the callback.
        return func(ctx.index, λ.batch(ctx.index));
    }


    /// @copydoc get_solution
    /// This overload takes a mutable view of @p λ; the batch passed to @p func is obtained from
    /// that mutable view.
    decltype(auto) get_solution(Context &ctx, mut_view<> λ, auto &&func) const {
        // Only the batch of λ that belongs to the calling thread is handed to the callback.
        return func(ctx.index, λ.batch(ctx.index));
    }


    /// @copydoc get_solution
    /// Generic overload: accepts any @p λ that provides a `batch(index)` member function.
    /// @tparam Λ  Type of the solution storage.
    /// @tparam F  Type of the callable.
    template <class Λ, class F>
    decltype(auto) get_solution(Context &ctx, Λ &&λ, F &&func) const {
        // Both arguments are used as lvalues here; they are not forwarded.
        return func(ctx.index, λ.batch(ctx.index));
    }


    /// Fused factorization and forward solve. Performs CR to reduce the system down to a single

    /// block tridiagonal system of size @ref v which is then solved using PCR or PCG.

    ///

    /// @pre The workspaces `cr_L.batch(i)` contain the diagonal blocks `M(i:v:p)` of the block

    ///      tridiagonal system (see @ref init_diag).

    /// @pre If `p > 1`, the workspaces `cr_Y.batch(i)` with odd i contain the subdiagonal blocks

    ///      `K(i:v:p)` of the block tridiagonal system (see @ref init_subdiag).

    /// @pre If `p > 1`, the workspaces `cr_U.batch(i)` with odd i contain the superdiagonal blocks

    ///      `K(i-1:v:p)ᵀ` of the block tridiagonal system (see @ref init_subdiag).

    /// @pre If `p = 1`, the workspace `cr_Y.batch(0)` contains the subdiagonal blocks `K(0:v:1)`

    ///      of the block tridiagonal system, and `cr_U.batch(0)` is not used.

    /// @post The workspaces `cr_L.batch(i)` with `i > 0` contain the diagonal blocks of the CR

    ///       Cholesky factor. `cr_Y.batch(i)` and `cr_U.batch(i)` contain the subdiagonal blocks.

    ///       The final batch `cr_L.batch(0)` contains the diagonal blocks of the Schur complement

    ///       of all other blocks, which is the input to the PCR or PCG solver. The batch

    ///       `pcr_L.batch(0)` contains its Cholesky factorizations. The subdiagonal blocks of this

    ///       Schur complement are stored in `pcr_Y.batch(0)`.

    /// @post If the PCR solver was selected, the full PCR factorization is stored in @ref pcr_L,

    ///       @ref pcr_U and @ref pcr_Y.

    ///

    /// @param ctx  Parallel execution context for communication/synchronization between threads.

    /// @param λ    On entry, the right-hand side of the linear system. On exit, the solution of the

    ///             forward solve phase.

    /// @param stride   Stride (in number of batches) between batches of @p λ. In total, @p λ

    ///                 contains `stride * p` batches, but only every `stride`-th batch is accessed.

    ///

    /// Note that a backward solve of the final block `λ.batch(0)` is performed during the forward

    /// solve phase, so `λ.batch(0)` contains (part of) the final solution. Performing the forward

    /// and backward solves separately for this block is not possible, because it is solved using

    /// PCR or PCG. Both methods solve the full block tridiagonal system at once, rather than using

    /// a single explicit CR Cholesky factorization with distinct forward and backward solves.

    void factor_solve(Context &ctx, mut_view<> λ, index_t stride = 1);

    /// Perform only the factorization as described by @ref factor_solve.

    void factor(Context &ctx);

    /// Perform only the forward solve as described by @ref factor_solve.

    void solve_forward(Context &ctx, mut_view<> λ, index_t stride = 1);


    /// Perform the backward solve phase, after the forward solve phase has been performed by

    /// @ref factor_solve.

    /// @param ctx  Parallel execution context for communication/synchronization between threads.

    /// @param λ    On entry, the solution of the forward solve phase. On exit, the solution of the

    ///             full linear system.

    /// @param work  Workspace of `p * v` column vectors of size @ref block_size.

    /// @param stride   Stride (in number of batches) between batches of @p λ. In total, @p λ

    ///                 contains `stride * p` batches, but only every `stride`-th batch is accessed.

    void solve_reverse(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride = 1) const;


    /// Convenience overload of the backward solve that uses the solver's own @ref work_cr
    /// workspace instead of a caller-provided one.
    void solve_reverse(Context &ctx, mut_view<> λ, index_t stride = 1) {
        mut_view<> internal_workspace = work_cr;
        solve_reverse(ctx, λ, internal_workspace, stride);
    }


    /// @}


    /// @name Cyclic reduction data structures

    /// @{


    // The initializers below read `p` and `block_size`. Both are declared earlier in this struct
    // and are therefore already initialized when these default member initializers run.

    /// Diagonal blocks of the Cholesky factor of the Schur complement (used during CR).
    /// Batch indices correspond to block column indices of the block tridiagonal system.
    /// Allocated with depth `p * v`; each layer is `block_size × block_size`.
    matrix<default_order> cr_L = [this] {
        return matrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};
    }();

    /// Subdiagonal blocks U of the Cholesky factor of the Schur complement (used during CR).
    /// Batch indices correspond to block column indices of the block tridiagonal system.
    /// Allocated with depth `p * v`; each layer is `block_size × block_size`.
    matrix<default_order> cr_U = [this] {
        return matrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};
    }();

    /// Subdiagonal blocks Y of the Cholesky factor of the Schur complement (used during CR).
    /// Batch indices correspond to block column indices of the block tridiagonal system.
    /// Allocated with depth `p * v`; each layer is `block_size × block_size`.
    matrix<default_order> cr_Y = [this] {
        return matrix<default_order>{{.depth = p * v, .rows = block_size, .cols = block_size}};
    }();

    /// Temporary workspace for the CR solve phase.
    /// Allocated as `p * v` column vectors of length `block_size`.
    matrix<column_major> work_cr = [this] {
        return matrix<column_major>{{.depth = p * v, .rows = block_size, .cols = 1}};
    }();


    /// @}


    /// @name Parallel cyclic reduction data structures

    /// @{


    /// Diagonal blocks of the PCR Cholesky factorizations.
    /// Allocated with depth `v * (lv() + 1)`; each layer is `block_size × block_size`.
    matrix<default_order> pcr_L = [this] {
        return matrix<default_order>{
            {.depth = v * (lv() + 1), .rows = block_size, .cols = block_size}};
    }();

    /// Subdiagonal blocks Y of the PCR Cholesky factorizations.
    /// Allocated with depth `v * lv()`; each layer is `block_size × block_size`.
    matrix<default_order> pcr_Y = [this] {
        return matrix<default_order>{{.depth = v * lv(), .rows = block_size, .cols = block_size}};
    }();

    /// Subdiagonal blocks U of the PCR Cholesky factorizations.
    /// Allocated with depth `v * lv()`; each layer is `block_size × block_size`.
    matrix<default_order> pcr_U = [this] {
        return matrix<default_order>{{.depth = v * lv(), .rows = block_size, .cols = block_size}};
    }();

    /// Workspace to store the diagonal blocks during the PCR factorization.
    /// Allocated with depth `v`; each layer is `block_size × block_size`.
    matrix<default_order> pcr_M = [this] {
        return matrix<default_order>{{.depth = v, .rows = block_size, .cols = block_size}};
    }();

    /// Temporary workspace for CG vectors.
    /// Allocated with depth `v`; each layer holds 4 columns of length `block_size`.
    matrix<column_major> work_pcg = [this] {
        return matrix<column_major>{{.depth = v, .rows = block_size, .cols = 4}};
    }();


    /// @}


    /// @name Factorization update data structures

    /// @{


    /// Update rank (number of changing constraints) per thread.
    /// Holds `p` entries, value-initialized to zero.
    std::vector<index_t> m_update = std::vector<index_t>(p);
    /// Update rank from D(0). Negative if D(0) is not handled separately.
    index_t m_update_u0 = -1;

    /// Compressed representation of the nonzero diagonal elements of the matrix Σ.
    /// Allocated as `v` column vectors of length `max_rank`.
    matrix<column_major> work_update_Σ = [this] {
        return matrix<column_major>{{.depth = v, .rows = max_rank, .cols = 1}};
    }();

    /// Workspace to store the update matrices Ξ(Υ) for the factorization update.
    /// Allocated with depth `4 * v`; each layer is `block_size × max_rank`.
    matrix<column_major> work_update = [this] {
        return matrix<column_major>{{.depth = 4 * v, .rows = block_size, .cols = max_rank}};
    }();

    /// Storage for the hyperbolic Householder transformations.
    /// Allocated with depth `p * v`. The layer dimensions are queried from `hyhound_size_W` for
    /// the lower-triangular view of `cr_L.batch(0)`, so this member relies on @ref cr_L being
    /// declared (and hence initialized) earlier in the struct.
    matrix<column_major> work_hyh = [this] {
        using namespace batmat::linalg;
        const auto [r, c] = hyhound_size_W(tril(cr_L.batch(0)));
        return matrix<column_major>{{.depth = p * v, .rows = r, .cols = c}};
    }();

    /// Two copies of @ref work_update_Σ for PCR updates.
    /// Allocated as `v` column vectors of length `2 * max_rank`.
    matrix<column_major> work_update_pcr_Σ = [this] {
        return matrix<column_major>{{.depth = v, .rows = 2 * max_rank, .cols = 1}};
    }();

    /// Update matrices to apply to the diagonal blocks L during PCR updates.
    /// Allocated with depth `v`; each layer is `block_size × max_rank`.
    matrix<column_major> work_update_pcr_L = [this] {
        return matrix<column_major>{{.depth = v, .rows = block_size, .cols = max_rank}};
    }();

    /// Update matrices to apply to the subdiagonal blocks U and Y during PCR updates.
    /// Allocated with depth `v`; each layer is `block_size × (2 * max_rank)`.
    matrix<column_major> work_update_pcr_UY = [this] {
        return matrix<column_major>{{.depth = v, .rows = block_size, .cols = 2 * max_rank}};
    }();


    /// @}


    /// @name Low-level factorization and solve routines

    /// @{


    /// Fused factorization and forward solve. Unlike @ref factor_solve, this function assumes that

    /// the odd diagonal blocks in the first level of CR have already been factorized and solved.

    /// This allows the user to fuse the evaluation and factorization of these blocks, possibly

    /// enabling higher performance by avoiding an additional trip to memory.

    /// @pre If `p > 1`, the workspaces `cr_L.batch(i)` contain the diagonal blocks M(i) for even i,

    ///      and the Cholesky factors L(i) of M(i) for odd i.

    /// @pre If `p = 1`, the workspace `cr_L.batch(0)` contains the diagonal block M(0), and the

    ///      workspace `pcr_L.batch(0)` contains its Cholesky factor L(0).

    /// @pre If `p > 1`, the workspaces `cr_Y.batch(i)` contain the subdiagonal blocks K(i) for

    ///      odd i (uninitialized for even i).

    /// @pre If `p = 1`, the workspace `cr_Y.batch(0)` contains the subdiagonal block K(0).

    /// @pre If `p > 1`, the workspace `cr_U.batch(i)` contains the superdiagonal blocks

    ///      K(i-1)ᵀ for odd i (uninitialized for even i).

    /// @pre If `p = 1`, the workspace `cr_U.batch(0)` is not used.

    /// @pre If `p > 1`, the right-hand sides `λ.batch(stride * i)` contain the right-hand sides of

    ///      the system for even i, and the right-hand sides multiplied by L(i)⁻¹ for odd i.

    /// @pre If `p = 1`, the right-hand side `λ.batch(0)` contains the right-hand side of the system.

    template <bool Factor = true, bool Solve = true>

    void factor_solve_skip_first(Context &ctx, mut_view<> λ, index_t stride = 1);

    /// Factorization-only variant of @ref factor_solve_skip_first (`Factor = true`,
    /// `Solve = false`).
    void factor_skip_first(Context &ctx) {
        constexpr bool do_factor = true;
        constexpr bool do_solve  = false;
        // With the solve disabled, an empty (default-constructed) view is passed for λ.
        factor_solve_skip_first<do_factor, do_solve>(ctx, {});
    }

    /// Solution-only variant of @ref factor_solve_skip_first (`Factor = false`, `Solve = true`).
    /// @p λ and @p stride are passed on unchanged.
    void solve_forward_skip_first(Context &ctx, mut_view<> λ, index_t stride = 1) {
        constexpr bool do_factor = false;
        constexpr bool do_solve  = true;
        factor_solve_skip_first<do_factor, do_solve>(ctx, λ, stride);
    }


    /// Implementation of @ref factor_solve.

    template <bool Factor = true, bool Solve = true>

    void factor_solve_impl(Context &ctx, mut_view<> λ, index_t stride = 1);


    /// @}


    /// @name Low-level CR factorization and solve routines

    /// @{


    [[nodiscard]] index_t cr_thread_assignment(index_t l, index_t c) const;

    /// Compute a block U in the Cholesky factor for the given CR level @p l and column index @p iU.

    void factor_U(index_t l, index_t iU);

    /// Compute a block Y in the Cholesky factor for the given CR level @p l and column index @p iY.

    void factor_Y(index_t l, index_t iY);

    /// Update and factorize a block L in the Cholesky factor for CR level @p l+1 and column index

    /// @p i, using the previously computed blocks U and Y in the same row at level @p l.

    void factor_L(index_t l, index_t i);

    /// Compute a subdiagonal block K of the Schur complement for CR level @p l+1 and column index

    /// @p i, using the previously computed blocks U and Y in the same row at level @p l.

    void update_K(index_t l, index_t i);


    /// Update the right-hand side @p λ during the forward solve phase of CR after computing

    /// block @p iU of @p λ at level @p l, subtracting the product `U(iU) λ(iU)` from the block

    /// of @p λ in the same row as `U(iU)`.

    /// The @p stride parameter specifies the stride between consecutive blocks of @p λ.

    void solve_u_forward(index_t l, index_t iU, mut_view<> λ, index_t stride) const;

    /// Update the right-hand side @p λ during the forward solve phase of CR after computing

    /// block @p iY of @p λ at level @p l, subtracting the product `Y(iY) λ(iY)` from the block

    /// of @p λ in the same row as `Y(iY)`.

    /// The product `Y(iY) λ(iY)` is stored in the workspace @p w to allow it to be computed

    /// concurrently with @ref solve_u_forward, which updates the same block of @p λ.

    /// The @p stride parameter specifies the stride between consecutive blocks of @p λ.

    void solve_y_forward(index_t l, index_t iY, mut_view<> λ, mut_view<> w, index_t stride) const;

    /// Apply the updates to block @p iL of the right-hand side from @ref solve_u_forward and

    /// @ref solve_y_forward, and then solve with the diagonal block L for the next level @p l+1.

    /// The @p stride parameter specifies the stride between consecutive blocks of @p λ.

    void solve_λ_forward(index_t l, index_t iL, mut_view<> λ, view<> w, index_t stride) const;


    /// @}


    /// @name Low-level PCR factorization and solve routines

    /// @{


    static constexpr bool merge_last_level_pcr = true;

    /// Compute the parallel cyclic reduction factorization of the final block tridiagonal system

    /// of size @ref v.

    /// The function assumes that the Cholesky factors of the diagonal blocks are stored in

    /// `cr_L.batch(0)`, and the subdiagonal blocks are stored in `cr_Y.batch(0)`.

    /// This variant computes both subdiagonal blocks on a single thread.

    void factor_pcr();

    /// Perform a single level of the PCR factorization.

    /// This variant computes both subdiagonal blocks on a single thread.

    template <index_t Level>

    void factor_pcr_level();

    /// Compute the parallel cyclic reduction factorization of the final block tridiagonal system

    /// of size @ref v.

    /// This variant computes the subdiagonal blocks in parallel on different threads.

    void factor_pcr_parallel(Context &ctx);

    /// Perform a single level of the PCR factorization.

    /// This variant computes the subdiagonal blocks in parallel on different threads.

    template <index_t Level>

    void factor_pcr_level_parallel(Context &ctx);


    /// Solve a linear system with the final block tridiagonal system of size @ref v using the PCR

    /// factorization.

    void solve_pcr(mut_batch_view<> λ, mut_batch_view<> work_pcr) const;

    /// @copydoc solve_pcr
    /// Convenience overload that uses the first column of the first batch of @ref work_pcg as the
    /// PCR workspace.
    void solve_pcr(mut_batch_view<> λ) {
        mut_batch_view<> pcr_workspace = work_pcg.batch(0).left_cols(1);
        solve_pcr(λ, pcr_workspace);
    }

    /// Perform a single level of the PCR solve.

    template <index_t Level>

    void solve_pcr_level(mut_batch_view<> λ, mut_batch_view<> work_pcr) const;


    /// @}


    /// @name Low-level PCG routines

    /// @{


    /// Multiply a vector by the final block tridiagonal matrix of size @ref v.

    /// The matrix is represented by the Cholesky factors of the diagonal blocks L and the

    /// subdiagonal blocks K.

    /// ~~~

    /// M = [ M(0)  Kᵀ(0)             ]  where M(i) = L(i) L(i)ᵀ

    ///     [ K(0)  M(1)  Kᵀ(1)       ]

    ///     [       K(1)  M(2)  Kᵀ(2) ]

    ///     [             K(2)  M(3)  ]

    /// ~~~

    value_type mul_Mv(batch_view<> p, mut_batch_view<> Mp, batch_view<default_order> L,

                      batch_view<default_order> K) const;

    /// Multiply a vector by the preconditioner for the final block tridiagonal system of size

    /// @ref v.

    /// @see mul_Mv

    value_type mul_precond(batch_view<> r, mut_batch_view<> z, mut_batch_view<> w,

                           batch_view<default_order> L, batch_view<default_order> K) const;

    /// Solve a linear system with the final block tridiagonal system of size @ref v using the

    /// preconditioned conjugate gradient method.

    /// The function assumes that the Cholesky factors of the diagonal blocks are stored in

    /// `cr_L.batch(0)`, and the subdiagonal blocks are stored in `cr_Y.batch(0)`.

    void solve_pcg(mut_batch_view<> λ, mut_batch_view<> work_pcg) const;

    /// @copydoc solve_pcg
    /// Convenience overload that uses the first batch of the member @ref work_pcg as the PCG
    /// workspace.
    void solve_pcg(mut_batch_view<> λ) {
        mut_batch_view<> pcg_workspace = work_pcg.batch(0);
        solve_pcg(λ, pcg_workspace);
    }


    /// @}


    /// @name Low-level reverse solve routines

    /// @{


    void solve_reverse_parallel(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride) const;

    void solve_reverse_serial(mut_view<> λ, mut_view<> work, index_t stride) const;

    void solve_u_backward(index_t l, index_t iU, mut_view<> λ, mut_view<> w, index_t stride) const;

    void solve_y_backward(index_t l, index_t iY, mut_view<> λ, index_t stride) const;

    void solve_λ_backward(index_t biL, mut_view<> λ, view<> w, index_t stride) const;


    /// @}


    /// @name Low-level factorization update routines

    /// @{


    /// @todo properly define semantics and indices.

    void set_thread_update_rank(Context &ctx, index_t c, index_t m);

    /// @todo properly define semantics and indices.

    void set_update_rank_extra(index_t m);

    /// @todo properly define semantics and indices.

    void clear_update_rank_extra();


    [[nodiscard]] std::pair<index_t, index_t> cols_Ups_fwd(index_t l, index_t i) const;

    [[nodiscard]] std::pair<index_t, index_t> cols_Ups_bwd(index_t l, index_t i) const;

    [[nodiscard]] std::pair<index_t, index_t> cols_Q_cr(index_t l, index_t i) const;

    [[nodiscard]] index_t work_Ups_fwd_w(index_t l, index_t i) const;

    [[nodiscard]] index_t work_Ups_bwd_w(index_t l, index_t i) const;

    [[nodiscard]] mut_batch_view<column_major> work_Ups_fwd(index_t l, index_t i);

    [[nodiscard]] mut_batch_view<column_major> work_Ups_bwd(index_t l, index_t i);

    [[nodiscard]] mut_batch_view<column_major> work_Q_cr(index_t l, index_t i);

    [[nodiscard]] mut_batch_view<column_major> work_Σ_fwd(index_t l, index_t i);

    [[nodiscard]] mut_batch_view<column_major> work_Σ_bwd(index_t l, index_t i);

    [[nodiscard]] mut_batch_view<column_major> work_Σ_Q(index_t l, index_t i);

    [[nodiscard]] mut_batch_view<column_major> work_Ups_fwd_last();

    [[nodiscard]] mut_batch_view<column_major> work_Ups_bwd_last();

    [[nodiscard]] mut_batch_view<column_major> work_Σ_fwd_last();

    [[nodiscard]] mut_batch_view<column_major> work_Σ_bwd_last();

    [[nodiscard]] mut_batch_view<column_major> work_Ups_extra();

    [[nodiscard]] mut_batch_view<column_major> work_Σ_extra();


    template <bool Solve = true>

    void update_solve_cr(Context &ctx, mut_view<> λ, index_t stride);

    void update_L(index_t l, index_t i);

    void update_U(index_t l, index_t i);

    void update_Y(index_t l, index_t i);


    template <index_t Level>

    void update_pcr_level(index_t m, mut_batch_view<> WYU, mut_batch_view<> WΣ);

    void update_pcr(batch_view<> fwd, batch_view<> bwd, batch_view<> Σ);


    /// @}


    /// @name Low-level prefetching

    /// @{


    template <StorageOrder O>

    void prefetch(batch_view<O> X) const;

    template <StorageOrder O>

    void prefetch_L(batch_view<O> X) const;

    void prefetch_L(index_t bi) const;

    void prefetch_U(index_t l, index_t iU) const;

    void prefetch_Y(index_t l, index_t iY) const;


    /// @}

};


/// Linear solver for systems with optimal control structure.

/// @tparam VL              Vector length.

/// @tparam T               Scalar type.

/// @tparam DefaultOrder    Storage order for the matrix workspaces (row/column major).

/// @tparam Ctx             Parallel execution context type, see @ref parallel::Context.

/// @ingroup topic-ocp-solvers

template <index_t VL = 4, class T = real_t, StorageOrder DefaultOrder = StorageOrder::ColMajor,

          class Ctx = parallel::Context<>>


struct CyqloneSolver {

    using value_type = T;


    /// @name Problem dimensions

    /// @{


    const index_t N_horiz; ///< Horizon length of the optimal control problem.

    const index_t nx;      ///< Number of states of the OCP.

    const index_t nu;      ///< Number of controls of the OCP.

    const index_t ny;      ///< Number of general constraints of the OCP per stage.

    const index_t ny_0;    ///< Number of general constraints at stage 0, D(0) u(0).

    const index_t ny_N;    ///< Number of general constraints at the final stage, C(N) x(N).


    /// Total number of primal variables of the OCP: nu controls plus nx states for each of the
    /// N_horiz stages.
    /// @note The actual number of variables stored in Cyqlone's internal data structures may be
    ///       larger.
    [[nodiscard]] index_t num_variables() const {
        const index_t vars_per_stage = nu + nx;
        return N_horiz * vars_per_stage;
    }

    /// Total number of dynamics constraints of the OCP: nx equations for each of the N_horiz
    /// stages.
    /// @note The actual number of constraints stored in Cyqlone's internal data structures may be
    ///       larger.
    [[nodiscard]] index_t num_dynamics_constraints() const {
        const index_t stages = N_horiz;
        return stages * nx;
    }

    /// Total number of general constraints of the OCP: ny for each of `N_horiz - 1` stages, plus
    /// ny_0 for stage 0 and ny_N for the final stage.
    /// @note The actual number of constraints stored in Cyqlone's internal data structures may be
    ///       larger.
    [[nodiscard]] index_t num_general_constraints() const {
        const index_t regular_stages = (N_horiz - 1) * ny;
        return regular_stages + ny_0 + ny_N;
    }


    /// @}


    /// @name Parallelization and vectorization

    /// @{


    /// Tricyqle solver type for solving block-tridiagonal systems in parallel.

    using tricyqle_t    = TricyqleSolver<VL, T, DefaultOrder, Ctx>;

    using Context       = tricyqle_t::Context;

    using SharedContext = tricyqle_t::SharedContext;

    using simd          = tricyqle_t::simd;


    /// Number of processors/threads.

    const index_t p;

    /// Vector length.

    static constexpr index_t v = VL;

    /// Number of stages per thread per vector lane (rounded up).

    const index_t n = (N_horiz + p * v - 1) / (p * v);


    /// log₂(v), logarithm of the vector length.

    [[nodiscard]] constexpr index_t lv() const { return tricyqle.lv(); }

    /// log₂(p), logarithm of the number of processors/threads, rounded up.

    [[nodiscard]] constexpr index_t lp() const { return tricyqle.lp(); }

    /// The number of processors @ref p rounded up to the next power of two.

    [[nodiscard]] constexpr index_t ceil_p() const { return tricyqle.ceil_p(); }

    /// The number of parallel execution units P rounded up to the next power of two.

    [[nodiscard]] constexpr index_t ceil_P() const { return tricyqle.ceil_P(); }


    /// Create a shared parallel execution context by forwarding to
    /// `tricyqle.create_parallel_context()`.
    /// @return Owning pointer to the @ref SharedContext returned by the Tricyqle solver.
    ///         Discarding it is always a mistake, hence `[[nodiscard]]`.
    [[nodiscard]] std::unique_ptr<SharedContext> create_parallel_context() const {
        return tricyqle.create_parallel_context();
    }


    /// Visit the block of n stages selected by @ref riccati_thread_assignment for the given
    /// context, backwards in time (decreasing stage index j).
    /// For every stage, @p func is invoked as `func(j, di, xs.batch(di)...)`, where `j` is the
    /// stage index and `di` the data batch index. Each of the optional arrays @p xs must have
    /// batch size v and depth ceil_N().
    void foreach_stage(Context &ctx, auto &&func, auto &&...xs) const {
        BATMAT_ASSERT(((xs.batch_size() == v) && ...));
        BATMAT_ASSERT(((xs.depth() == ceil_N()) && ...));
        // Index of the first data batch of this context's block.
        const index_t block_start = riccati_thread_assignment(ctx) * n;
        for (index_t offset = 0; offset < n; ++offset) {
            const index_t batch_index = block_start + offset;
            // The stage index moves in the opposite direction of the batch index.
            const index_t stage = sub_wrap_ceil_N(block_start, offset);
            func(stage, batch_index, xs.batch(batch_index)...);
        }
    }


    /// Visit the block of n stages selected by @ref riccati_thread_assignment for the given
    /// context, forward in time (increasing stage index j).
    /// For every stage, @p func is invoked as `func(j, di, xs.batch(di)...)`, where `j` is the
    /// stage index and `di` the data batch index. Each of the optional arrays @p xs must have
    /// batch size v and depth ceil_N().
    void foreach_stage_fwd(Context &ctx, auto &&func, auto &&...xs) const {
        BATMAT_ASSERT(((xs.batch_size() == v) && ...));
        BATMAT_ASSERT(((xs.depth() == ceil_N()) && ...));
        // Index of the first data batch of this context's block.
        const index_t block_start = riccati_thread_assignment(ctx) * n;
        for (index_t visited = 0; visited < n; ++visited) {
            // Walk the block from its last batch (offset n - 1) down to its first (offset 0).
            const index_t offset      = n - 1 - visited;
            const index_t batch_index = block_start + offset;
            const index_t stage       = sub_wrap_ceil_N(block_start, offset);
            func(stage, batch_index, xs.batch(batch_index)...);
        }
    }


    /// @}


    /// @name Indexing utilities

    /// @{


    /// Horizon length, rounded up to a multiple of the number of parallel execution units.

    [[nodiscard]] index_t ceil_N() const { return n * p * v; }

    /// 2-adic valuation ν₂.

    [[nodiscard]] index_t ν2(index_t i) const;

    /// 2-adic valuation modulo p, i.e. `ν2p(0) = ν2p(p) = lp()`.

    [[nodiscard]] index_t ν2p(index_t i) const;

    /// Add @p b to @p a modulo @ref ceil_N().

    [[nodiscard]] index_t add_wrap_ceil_N(index_t a, index_t b) const;

    /// Subtract @p b from @p a modulo @ref ceil_N().

    [[nodiscard]] index_t sub_wrap_ceil_N(index_t a, index_t b) const;

    /// Add @p b to @p a modulo @ref p.

    [[nodiscard]] index_t add_wrap_p(index_t a, index_t b) const;

    /// Subtract @p b from @p a modulo @ref p.

    [[nodiscard]] index_t sub_wrap_p(index_t a, index_t b) const;

    /// Add @p b to @p a modulo @ref ceil_p().

    [[nodiscard]] index_t add_wrap_ceil_p(index_t a, index_t b) const;

    /// Subtract @p b from @p a modulo @ref ceil_p().

    [[nodiscard]] index_t sub_wrap_ceil_p(index_t a, index_t b) const;


    /// @todo refactor sparse.tpp

    [[nodiscard]] index_t sub_wrap_ceil_P(index_t a, index_t b) const;

    /// @todo refactor sparse.tpp

    [[nodiscard]] index_t add_wrap_ceil_P(index_t a, index_t b) const;

    /// @todo refactor sparse.tpp

    [[nodiscard]] index_t get_linear_batch_offset(index_t biA) const;


    /// @}


    /// @name Matrix data structures

    /// @{


    /// Default storage order for most matrices.

    static constexpr auto default_order = tricyqle_t::default_order;

    /// Column-major storage order for column vectors and update matrices.

    static constexpr auto column_major = tricyqle_t::column_major;


    /// Owning type for a batch of matrices (with batch size v).

    template <StorageOrder O = column_major>

    using matrix = typename tricyqle_t::template matrix<O>;

    /// Non-owning immutable view type for @ref matrix.

    template <StorageOrder O = column_major>

    using view = typename tricyqle_t::template view<O>;

    /// Non-owning mutable view type for @ref matrix.

    template <StorageOrder O = column_major>

    using mut_view     = typename tricyqle_t::template mut_view<O>;

    using layer_stride = typename tricyqle_t::layer_stride;

    /// Non-owning immutable view type for a single batch of v matrices.

    template <StorageOrder O = column_major>

    using batch_view = typename tricyqle_t::template batch_view<O>;

    /// Non-owning mutable view type for a single batch of v matrices.

    template <StorageOrder O = column_major>

    using mut_batch_view = typename tricyqle_t::template mut_batch_view<O>;


    /// @}


    /// @name Solver parameters

    /// @{


    /// Solver parameters and settings.

    CyqloneParams<value_type> params{};


    /// Get the current Cyqlone solver parameters.

    [[nodiscard]] CyqloneParams<value_type> get_params() const { return params; }


    /// Update the Cyqlone solver parameters.

    void update_params(const CyqloneParams<value_type> &new_params) { params = new_params; }


    /// Get the current Tricyqle solver parameters.


    [[nodiscard]] TricyqleParams<value_type> get_tricyqle_params() const {

        return tricyqle.get_params();

    }


    /// Update the Tricyqle solver parameters.


    void update_tricyqle_params(const TricyqleParams<value_type> &new_params) {

        tricyqle.update_params(new_params);

    }


    /// Build a compact, dash-separated description of the main solver parameters: the sizes
    /// nx, nu, ny, N_horiz, p and v, followed by the Tricyqle solve method and the default
    /// storage order. Used mainly for file names.
    [[nodiscard]] std::string get_params_string() const {
        const auto method = get_tricyqle_params().solve_method;
        // Every method other than PCR and StairPCG is reported as Jacobi-preconditioned PCG.
        std::string_view solve_tag = "pcg=jacobi";
        if (method == SolveMethod::PCR)
            solve_tag = "pcr";
        else if (method == SolveMethod::StairPCG)
            solve_tag = "pcg=stair";
        const bool row_major             = default_order == StorageOrder::RowMajor;
        const std::string_view order_tag = row_major ? "rm" : "cm";
        return std::format("nx={}-nu={}-ny={}-N={}-p={}-v={}-{}-{}", nx, nu, ny, N_horiz, p, v,
                           solve_tag, order_tag);
    }


    /// @}


    /// @name Tricyqle solver for block-tridiagonal systems

    /// @{


    /// Block-tridiagonal solver (CR/PCR/PCG).


    tricyqle_t tricyqle = {

        .block_size = nx,

        .max_rank   = std::max(ny, ny_0 + ny_N) * N_horiz,

        .p          = p,

    };


    /// @}


    // Note: the cumbersome IILE initialization syntax is to work around a GCC bug

    //       https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116015


    /// @name OCP data (reordered for use during the Cyqlone algorithm)

    /// @{


    /// Stage-wise Hessian blocks H(j) = [ R(j)  S(j);  S(j)ᵀ  Q(j) ] of the OCP cost function.


    matrix<default_order> data_H = [this] {

        return matrix<default_order>{{.depth = ceil_N(), .rows = nu + nx, .cols = nu + nx}};

    }();


    /// Stage-wise dynamics matrices F(j) = [ B(j)  A(j) ] of the OCP.


    matrix<default_order> data_F = [this] {

        return matrix<default_order>{{.depth = ceil_N(), .rows = nx, .cols = nu + nx}};

    }();


    /// Stage-wise constraint Jacobians G(j)ᵀ = [ D(j)  C(j) ]ᵀ of the OCP.


    matrix<default_order> data_Gᵀ = [this] {

        const auto nyM = std::max(ny, ny_0 + ny_N);

        return matrix<default_order>{{.depth = ceil_N(), .rows = nu + nx, .cols = nyM}};

    }();


    /// @}


    /// @name Modified Riccati data structures

    /// @{


    /// Cholesky factors of the Hessian blocks for the Riccati recursion.

    /// LH(j) = [ LR(j)  0;  LS(j)  LQ(j) ]


    matrix<default_order> riccati_LH = [this] {

        return matrix<default_order>{{.depth = p * v, .rows = nu + nx, .cols = n * (nu + nx)}};

    }();


    /// Storage for the matrices LB(j), Acl(j) and LA(j₁) for the Riccati recursion.

    /// Grouped per thread, with layout [ Acl(jₙ) ... Acl(j₂) LA(j₁) | LB(jₙ) ... LB(j₁) ], so that

    /// LA(j₁) and LB(j) are contiguous (useful when evaluating the Schur complement).


    matrix<default_order> riccati_LAB = [this] {

        return matrix<default_order>{{.depth = p * v, .rows = nx, .cols = n * nx + n * nu}};

    }();


    /// Temporary storage for the V(j) = [ B(j)ᵀ LQ(j);  A(j)ᵀ LQ(j) ] matrices during the Riccati

    /// recursion. The workspace is wider than just V to also accommodate the active constraint

    /// Jacobians, since both are used to update the Hessian blocks during the Riccati recursion.


    matrix<default_order> riccati_V = [this] {

        const auto nyM = std::max(ny, ny_0 + ny_N);

        return matrix<default_order>{{.depth = p * v, .rows = nu + nx, .cols = nx + nyM}};

    }();


    /// Temporary workspace for the Riccati solve phase.


    matrix<column_major> riccati_work = [this] {

        return matrix<column_major>{{.depth = p * v, .rows = nx, .cols = 1}};

    }();


    /// @}


    /// @name Riccati factorization update data structures

    /// @{


    /// Compressed representation of the nonzero diagonal elements of the matrix Σ, populated

    /// for each thread separately during the factorization update of the Riccati recursion.


    matrix<column_major> work_Σ = [this] {

        const auto nyM = std::max(ny, ny_0 + ny_N);

        return matrix<column_major>{{.depth = p * v, .rows = n * nyM, .cols = 1}};

    }();


    /// Workspace to store the update matrices Υu, Υx, Υλ, Φu, Φx and Φλ during the factorization

    /// update of the Riccati recursion.

    /// Both @ref riccati_Υ1 and @ref riccati_Υ2 are used alternately.


    matrix<column_major> riccati_Υ1 = [this] {

        const auto nyM = std::max(ny, ny_0 + ny_N);

        return matrix<column_major>{{.depth = p * v, .rows = nu + nx + nx, .cols = n * nyM}};

    }();


    /// Alternate workspace to @ref riccati_Υ1.


    matrix<column_major> riccati_Υ2 = [this] {

        const auto nyM = std::max(ny, ny_0 + ny_N);

        return matrix<column_major>{{.depth = p * v, .rows = nu + nx + nx, .cols = n * nyM}};

    }();


    /// @}


    /// @name Packing and unpacking of OCP data to Cyqlone storage format

    /// @{


    /// @}


    /// @name Packing and unpacking of OCP data to Cyqlone storage format

    /// @{


    /// Initialize a Cyqlone solver for the given OCP.

    ///

    /// Note: constraints on u(0) and x(N) should be independent.

    /// ~~~

    ///               nx  nu

    /// ocp.CD(0) = [ 0 | D ] ny₀

    ///             [ 0 | 0 ] ny - ny₀

    /// ~~~

    ///

    /// Since ocp.D(0) and ocp.C(N) will be merged, the top ny₀ rows of ocp.C(N)

    /// should be zero.

    ///

    /// @todo Create documentation page about the different OCP representations and storage formats.

    static CyqloneSolver build(const CyqloneStorage<value_type> &ocp, index_t p);

    /// Update the internal data structures to reflect changes in the OCP data (without changing

    /// the problem size).

    void update_data(const CyqloneStorage<value_type> &ocp);

    /// Initialize the right-hand side vector for the dynamics constraints of the OCP, using the

    /// custom Cyqlone storage format.

    void initialize_rhs(const CyqloneStorage<value_type> &ocp, mut_view<> rhs) const;

    /// @copydoc initialize_rhs

    matrix<> initialize_rhs(const CyqloneStorage<value_type> &ocp) const;

    /// Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage

    /// format.

    void initialize_gradient(const CyqloneStorage<value_type> &ocp, mut_view<> grad) const;

    /// @copydoc initialize_gradient

    matrix<> initialize_gradient(const CyqloneStorage<value_type> &ocp) const;

    /// Initialize the lower and upper bounds for the general constraints of the OCP, using the

    /// custom Cyqlone storage format.

    void initialize_bounds(const CyqloneStorage<value_type> &ocp, mut_view<> b_min,

                           mut_view<> b_max) const;

    /// @copydoc initialize_bounds

    std::pair<matrix<>, matrix<>> initialize_bounds(const CyqloneStorage<value_type> &ocp) const;


    /// Pack the primal variables from the linear vector @p ux_lin into the caller-provided
    /// batched view @p ux.
    /// NOTE(review): the element ordering of both representations is defined by the
    /// implementation in data.tpp — confirm there before relying on it.
    /// @todo check and document behavior when `N_horiz != ceil_N()`.

    void pack_variables(std::span<const value_type> ux_lin, mut_view<> ux) const;

    /// Overload of @ref pack_variables that returns an owning matrix instead of writing into a
    /// caller-provided view.

    matrix<> pack_variables(std::span<const value_type> ux_lin) const;

    /// Counterpart of @ref pack_variables: write the contents of the batched view @p ux to the
    /// linear vector @p ux_lin (presumably the exact inverse mapping — TODO confirm).

    void unpack_variables(view<> ux, std::span<value_type> ux_lin) const;

    /// Overload of @ref unpack_variables that returns the linear vector by value.

    std::vector<value_type> unpack_variables(view<> ux) const;

    /// Pack dynamics-constraint data (e.g. multipliers λ) from the linear vector @p λ_lin into
    /// the caller-provided batched view @p λ.

    void pack_dynamics(std::span<const value_type> λ_lin, mut_view<> λ) const;

    /// Overload of @ref pack_dynamics that returns an owning matrix.

    matrix<> pack_dynamics(std::span<const value_type> λ_lin) const;

    /// Counterpart of @ref pack_dynamics: write the batched view @p λ to the linear vector
    /// @p λ_lin.

    void unpack_dynamics(view<> λ, std::span<value_type> λ_lin) const;

    /// Overload of @ref unpack_dynamics that returns the linear vector by value.

    std::vector<value_type> unpack_dynamics(view<> λ) const;

    /// Pack general-constraint data from the linear vector @p y_lin into the caller-provided
    /// batched view @p y.
    /// @param fill Defaults to 0. Presumably the value written to entries of @p y that have no
    ///             counterpart in @p y_lin (padding) — TODO confirm against data.tpp.

    void pack_constraints(std::span<const value_type> y_lin, mut_view<> y,

                          value_type fill = 0) const;

    /// Overload of @ref pack_constraints that returns an owning matrix.

    matrix<> pack_constraints(std::span<const value_type> y_lin, value_type fill = 0) const;

    /// Counterpart of @ref pack_constraints: write the batched view @p y to the linear vector
    /// @p y_lin.

    void unpack_constraints(view<> y, std::span<value_type> y_lin) const;

    /// Overload of @ref unpack_constraints that returns the linear vector by value.

    std::vector<value_type> unpack_constraints(view<> y) const;


    /// Get a zero-initialized matrix for the primal variables u and x.

    matrix<> initialize_variables() const;

    /// Get a zero-initialized matrix for the dynamics constraints (or their multipliers).

    matrix<> initialize_dynamics_constraints() const;

    /// Get a zero-initialized matrix for the general constraints (or their multipliers).

    matrix<> initialize_general_constraints() const;


    /// @}


    /// @name OCP cost gradient and constraints evaluation

    /// @{


    /// Compute Mx + b, where M is the dynamics constraint Jacobian matrix of the OCP.

    /// In other words, evaluate the residuals for all stages, i.e.,

    /// @f$ A_j x^j + B_j u^j - x^{j+1} + b^j @f$.

    void residual_dynamics_constr(Context &ctx, view<> x, view<> b, mut_view<> Mxb) const;

    /// Compute Mᵀλ, where M is the dynamics constraint Jacobian matrix of the OCP.

    /// Optionally add the result to the existing contents of Mᵀλ by setting @p accum to true.

    void transposed_dynamics_constr(Context &ctx, view<> λ, mut_view<> Mᵀλ,

                                    bool accum = false) const;

    /// Compute the general constraints Gx, where G is the general constraint Jacobian matrix of the

    /// OCP. In other words, evaluate the constraints for all stages, i.e.,

    /// @f$ C_j x^j + D_j u^j @f$.

    void general_constr(Context &ctx, view<> ux, mut_view<> DCux) const;

    /// Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.

    void transposed_general_constr(Context &ctx, view<> y, mut_view<> DCᵀy) const;

    /// @copydoc transposed_general_constr

    /// @todo Remove.

    void transposed_general_constr(view<> y, mut_view<> DCᵀy) const;

    /// Compute the cost gradient, with optional scaling factors.

    /// grad_f ← Q ux + α q + β grad_f

    void cost_gradient(Context &ctx, view<> ux, value_type α, view<> q, value_type β,

                       mut_view<> grad_f) const;

    /// Compute the regularized cost gradient, with regularization parameter γ⁻¹, with respect to

    /// the point @p ux0.

    void cost_gradient_regularized(Context &ctx, value_type γ, view<> ux, view<> ux0, view<> q,

                                   mut_view<> grad_f) const;

    /// Subtract the regularization term from the cost gradient.

    void cost_gradient_remove_regularization(Context &ctx, value_type γ, view<> x, view<> x0,

                                             mut_view<> grad_f) const;


    /// @}


    /// @name Factorization and solve routines

    /// @{


    /// Compute the Cyqlone factorization of the KKT matrix of the OCP and perform a forward solve

    /// (fused for improved locality).

    /// @param ctx Parallel context.

    /// @param γ Reciprocal primal regularization.

    /// @param Σ ALM penalty factors.

    /// @param[in,out] ux Negative augmented Lagrangian gradient on entry; solution of forward solve

    ///                   on exit.

    /// @param[in,out] λ Constant term of the dynamics constraints on entry; solution of forward

    ///                  solve on exit.

    /// To obtain the solution of the KKT system, a reverse solve with the same factorization must

    /// be performed afterwards.

    /// @see @ref solve_reverse

    void factor_solve(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ);

    /// Compute the Cyqlone factorization of the KKT matrix of the OCP.

    /// @see @ref factor_solve

    void factor(Context &ctx, value_type γ, view<> Σ);

    /// Perform a forward solve with the Cyqlone factorization.

    /// @see @ref factor_solve

    void solve_forward(Context &ctx, mut_view<> ux, mut_view<> λ);

    /// Perform a reverse solve with the Cyqlone factorization.

    /// @param ctx Parallel context.

    /// @param ux On entry, the result of the forward solve; on exit the solution of the primal

    ///           variables of the KKT system.

    /// @param λ On entry, the result of the forward solve; on exit the solution of the dual

    ///          variables corresponding to the dynamics constraints of the KKT system.

    void solve_reverse(Context &ctx, mut_view<> ux, mut_view<> λ);

    /// Fused variant of @ref solve_reverse and @ref transposed_dynamics_constr (for improved

    /// locality of the dynamics Jacobians).

    void solve_reverse_mul(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> Mᵀλ);

    /// Perform factorization updates of the Cyqlone factorization as described by

    /// Algorithm 4 in the paper.

    /// @param ctx Parallel context.

    /// @param ΔΣ Changes to the ALM penalty factors Σ.

    void update(Context &ctx, view<> ΔΣ);

    /// Fused variant of @ref update and @ref solve_forward.

    void update_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ);


    /// @}


    /// @name Low-level factorization and forward solve routines

    /// @{


    /// Index of the block of stages associated with the given context: `ctx.index` advanced by
    /// one with wrap-around, computed by @ref add_wrap_p. @ref foreach_stage and
    /// @ref foreach_stage_fwd multiply this index by n to locate the first data batch of the block.
    index_t riccati_thread_assignment(Context &ctx) const { return add_wrap_p(ctx.index, 1); }

    template <bool Factor = true, bool Solve = true>

    void factor_riccati_solve(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ);

    template <bool Factor = true, bool Solve = true>

    void compute_schur(Context &ctx, mut_view<> ux, mut_view<> λ);


    /// @}


    /// @name Low-level factorization and forward solve routines for parallel cyclic reduction

    /// @{


    template <bool Factor = true, bool Solve = true>

    void factor_solve_impl(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ);


    /// @}


    /// @name Low-level reverse solve routines

    /// @{


    void solve_riccati_reverse(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work,

                               std::optional<mut_view<>> Mᵀλ) const;

    void solve_reverse(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work,

                       std::optional<mut_view<>> Mᵀλ = std::nullopt) const;


    /// @}


    /// @name Low-level factorization update routines

    /// @{


    /// Update the modified Riccati factorization of a single block column as described by

    /// Algorithm 3 in the paper.

    template <bool Solve = true>

    void update_riccati_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ);

    /// Variant of @ref update_riccati_solve without the solve step: instantiates it with
    /// `Solve = false` and passes empty (value-initialized) views for ux and λ.
    void update_riccati(Context &ctx, view<> ΔΣ) { update_riccati_solve<false>(ctx, ΔΣ, {}, {}); }

    template <bool Solve = true>

    void update_solve_impl(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ);


    /// @}


    /// @name Build sparse representations for debugging and testing

    /// @{


    [[nodiscard]] SparseMatrix build_sparse(const CyqloneStorage<value_type> &ocp,

                                            std::span<const value_type> Σ) const;

    [[nodiscard]] std::vector<value_type> build_rhs(view<> rq, view<> b, value_type scale_rq = -1,

                                                    value_type scale_b = -1) const;


    /// Assemble @p ux and @p λ into a single vector by reusing @ref build_rhs with both scale
    /// factors set to 1 (instead of its default of -1), i.e. without changing the sign.
    [[nodiscard]] std::vector<value_type> build_sol(view<> ux, view<> λ) const {
        const value_type keep_sign = 1;
        return build_rhs(ux, λ, keep_sign, keep_sign);
    }


    [[nodiscard]] SparseMatrix build_sparse_factor() const;

    [[nodiscard]] SparseMatrix build_sparse_diag() const;


    /// @}

};


} // namespace CYQLONE_NS(cyqlone)

BATMAT_ASSUME
#define BATMAT_ASSUME(x)

BATMAT_ASSERT
#define BATMAT_ASSERT(x)

order
std::string_view order(qp::StorageOrder o)
Definition spring-mass.cpp:340

config.hpp

cyqlone-params.hpp

cyqlone-storage.hpp
Data structure for optimal control problems where the initial states are eliminated.

cyqlone::SolveMethod::StairPCG
@ StairPCG
Preconditioned Conjugate Gradient with staircase preconditioner (iterative).
Definition cyqlone-params.hpp:12

cyqlone::SolveMethod::PCR
@ PCR
Parallel Cyclic Reduction (direct).
Definition cyqlone-params.hpp:14

cyqlone::TricyqleParams
Parameters and settings for the Tricyqle block-tridiagonal solver.
Definition cyqlone-params.hpp:30

batmat::linalg::hyhound_size_W
auto hyhound_size_W(Structured< VL, SL > L)

batmat::linalg::fill
void fill(simdified_value_t< VB > a, VB &&B)

batmat::linalg::tril
constexpr auto tril(M &&m)

cyqlone::CyqloneParams
Parameters and settings for the Cyqlone solver.
Definition cyqlone-params.hpp:61

batmat::datapar::deduced_simd
simd< Tp, deduced_abi< Tp, Np > > deduced_simd

batmat::linalg

batmat::linalg::StorageOrder
StorageOrder

batmat::matrix::StorageOrder
StorageOrder

cyqlone::get_level
constexpr index_t get_level(index_t i)
Definition cyqlone.hpp:45

cyqlone::get_index_in_level
constexpr index_t get_index_in_level(index_t i)
Definition cyqlone.hpp:51

cyqlone::ceil_log2
constexpr index_t ceil_log2(index_t n)
Definition cyqlone.hpp:38

cyqlone::is_pow_2
constexpr bool is_pow_2(index_t n)
Definition cyqlone.hpp:32

parallel.hpp
Parallel execution context and synchronization primitives.

sparse.hpp
Sparse matrix utilities.

batmat::matrix::DefaultStride

batmat::matrix::Matrix

batmat::matrix::View

batmat::matrix::View::batch
batch_view_type batch(index_type b) const

cyqlone::CyqloneSolver
Linear solver for systems with optimal control structure.
Definition cyqlone.hpp:561

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::n
const index_t n
Definition cyqlone.hpp:605

cyqlone::CyqloneSolver::num_variables
index_t num_variables() const
Get the total number of primal variables in the OCP.
Definition cyqlone.hpp:577

cyqlone::CyqloneSolver::build_sol
std::vector< value_type > build_sol(view<> ux, view<> λ) const
Definition cyqlone.hpp:1018

cyqlone::CyqloneSolver::simd
tricyqle_t::simd simd
Definition cyqlone.hpp:598

cyqlone::CyqloneSolver::pack_constraints
matrix pack_constraints(std::span< const value_type > y_lin, value_type fill=0) const
Definition data.tpp:485

cyqlone::CyqloneSolver::build_sparse_factor
SparseMatrix build_sparse_factor() const
Definition sparse.tpp:159

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::data_H
matrix< default_order > data_H
Definition cyqlone.hpp:762

cyqlone::CyqloneSolver::update
void update(Context &ctx, view<> ΔΣ)
Perform factorization updates of the Cyqlone factorization as described by Algorithm 4 in the paper.
Definition update.tpp:284

cyqlone::CyqloneSolver::transposed_general_constr
void transposed_general_constr(view<> y, mut_view<> DCᵀy) const
Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.

cyqlone::CyqloneSolver::initialize_variables
matrix initialize_variables() const
Get a zero-initialized matrix for the primal variables u and x.
Definition data.tpp:501

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::ceil_N
index_t ceil_N() const
Definition cyqlone.hpp:653

cyqlone::CyqloneSolver::unpack_constraints
std::vector< value_type > unpack_constraints(view<> y) const
Definition data.tpp:493

cyqlone::CyqloneSolver::ν2p
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
Definition indexing.tpp:125

cyqlone::CyqloneSolver::ceil_P
constexpr index_t ceil_P() const
The number of parallel execution units P rounded up to the next power of two.
Definition cyqlone.hpp:614

cyqlone::CyqloneSolver::add_wrap_ceil_N
index_t add_wrap_ceil_N(index_t a, index_t b) const
Add b to a modulo N_horiz.
Definition indexing.tpp:42

cyqlone::CyqloneSolver::view
typename tricyqle_t::template view< O > view
Non-owning immutable view type for matrix.
Definition cyqlone.hpp:693

cyqlone::CyqloneSolver::update_data
void update_data(const CyqloneStorage< value_type > &ocp)
Update the internal data structures to reflect changes in the OCP data (without changing the problem ...
Definition data.tpp:94

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::data_F
matrix< default_order > data_F
Definition cyqlone.hpp:766

cyqlone::CyqloneSolver::update_params
void update_params(const CyqloneParams< value_type > &new_params)
Update the Cyqlone solver parameters.
Definition cyqlone.hpp:717

cyqlone::CyqloneSolver::solve_reverse_mul
void solve_reverse_mul(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> Mᵀλ)
Fused variant of solve_reverse and transposed_dynamics_constr (for improved locality of the dynamics ...
Definition factor.tpp:261

cyqlone::CyqloneSolver::factor
void factor(Context &ctx, value_type γ, view<> Σ)
Compute the Cyqlone factorization of the KKT matrix of the OCP.
Definition factor.tpp:137

cyqlone::CyqloneSolver::solve_reverse
void solve_reverse(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work, std::optional< mut_view<> > Mᵀλ=std::nullopt) const
Definition factor.tpp:155

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::data_Gᵀ
matrix< default_order > data_Gᵀ
Definition cyqlone.hpp:770

cyqlone::CyqloneSolver::batch_view
typename tricyqle_t::template batch_view< O > batch_view
Non-owning immutable view type for a single batch of v matrices.
Definition cyqlone.hpp:700

cyqlone::CyqloneSolver::update_solve
void update_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Fused variant of update and solve_forward.
Definition update.tpp:289

cyqlone::CyqloneSolver::pack_variables
void pack_variables(std::span< const value_type > ux_lin, mut_view<> ux) const
Definition data.tpp:242

cyqlone::CyqloneSolver::factor_solve
void factor_solve(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
Compute the Cyqlone factorization of the KKT matrix of the OCP and perform a forward solve (fused for...
Definition factor.tpp:132

cyqlone::CyqloneSolver::pack_constraints
void pack_constraints(std::span< const value_type > y_lin, mut_view<> y, value_type fill=0) const
Definition data.tpp:366

cyqlone::CyqloneSolver::layer_stride
typename tricyqle_t::layer_stride layer_stride
Definition cyqlone.hpp:697

cyqlone::CyqloneSolver::cost_gradient_remove_regularization
void cost_gradient_remove_regularization(Context &ctx, value_type γ, view<> x, view<> x0, mut_view<> grad_f) const
Subtract the regularization term from the cost gradient.
Definition mat-vec.tpp:147

cyqlone::CyqloneSolver::initialize_gradient
void initialize_gradient(const CyqloneStorage< value_type > &ocp, mut_view<> grad) const
Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage format.
Definition data.tpp:170

cyqlone::CyqloneSolver::unpack_variables
void unpack_variables(view<> ux, std::span< value_type > ux_lin) const
Definition data.tpp:281

cyqlone::CyqloneSolver::initialize_rhs
matrix initialize_rhs(const CyqloneStorage< value_type > &ocp) const
Initialize the right-hand side vector for the dynamics constraints of the OCP, using the custom Cyqlo...
Definition data.tpp:429

cyqlone::CyqloneSolver::transposed_dynamics_constr
void transposed_dynamics_constr(Context &ctx, view<> λ, mut_view<> Mᵀλ, bool accum=false) const
Compute Mᵀλ, where M is the dynamics constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:52

cyqlone::CyqloneSolver::residual_dynamics_constr
void residual_dynamics_constr(Context &ctx, view<> x, view<> b, mut_view<> Mxb) const
Compute Mx + b, where M is the dynamics constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:17

cyqlone::CyqloneSolver::ν2
index_t ν2(index_t i) const
2-adic valuation ν₂.
Definition indexing.tpp:121

cyqlone::CyqloneSolver::update_solve_impl
void update_solve_impl(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
[PCR update]
Definition update.tpp:263

cyqlone::CyqloneSolver::update_riccati
void update_riccati(Context &ctx, view<> ΔΣ)
Definition cyqlone.hpp:1005

cyqlone::CyqloneSolver::get_tricyqle_params
TricyqleParams< value_type > get_tricyqle_params() const
Get the current Tricyqle solver parameters.
Definition cyqlone.hpp:720

cyqlone::CyqloneSolver::build_sparse
SparseMatrix build_sparse(const CyqloneStorage< value_type > &ocp, std::span< const value_type > Σ) const
Definition sparse.tpp:15

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::sub_wrap_ceil_N
index_t sub_wrap_ceil_N(index_t a, index_t b) const

cyqlone::CyqloneSolver::lv
constexpr index_t lv() const
log₂(v), logarithm of the vector length.
Definition cyqlone.hpp:608

cyqlone::CyqloneSolver::create_parallel_context
std::unique_ptr< SharedContext > create_parallel_context() const
Definition cyqlone.hpp:616

cyqlone::CyqloneSolver::get_params_string
std::string get_params_string() const
Get a string representation of the main solver parameters. Used mainly for file names.
Definition cyqlone.hpp:730

cyqlone::CyqloneSolver::unpack_constraints
void unpack_constraints(view<> y, std::span< value_type > y_lin) const
Definition data.tpp:401

cyqlone::CyqloneSolver::get_linear_batch_offset
index_t get_linear_batch_offset(index_t biA) const
Definition indexing.tpp:112

cyqlone::CyqloneSolver::add_wrap_p
index_t add_wrap_p(index_t a, index_t b) const
Add b to a modulo p.
Definition indexing.tpp:73

cyqlone::CyqloneSolver::cost_gradient_regularized
void cost_gradient_regularized(Context &ctx, value_type γ, view<> ux, view<> ux0, view<> q, mut_view<> grad_f) const
Compute the regularized cost gradient, with regularization parameter γ⁻¹, with respect to the point u...
Definition mat-vec.tpp:129

cyqlone::CyqloneSolver::matrix
typename tricyqle_t::template matrix< O > matrix
Owning type for a batch of matrices (with batch size v).
Definition cyqlone.hpp:690

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::N_horiz
const index_t N_horiz
Definition cyqlone.hpp:567

cyqlone::CyqloneSolver::Context
tricyqle_t::Context Context
Definition cyqlone.hpp:596

cyqlone::CyqloneSolver::solve_forward
void solve_forward(Context &ctx, mut_view<> ux, mut_view<> λ)
Perform a forward solve with the Cyqlone factorization.
Definition factor.tpp:141

cyqlone::CyqloneSolver::num_dynamics_constraints
index_t num_dynamics_constraints() const
Get the total number of dynamics constraints in the OCP.
Definition cyqlone.hpp:581

cyqlone::CyqloneSolver::build
static CyqloneSolver build(const CyqloneStorage< value_type > &ocp, index_t p)
Initialize a Cyqlone solver for the given OCP.
Definition data.tpp:41

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::ny
const index_t ny
Definition cyqlone.hpp:570

cyqlone::CyqloneSolver::cost_gradient
void cost_gradient(Context &ctx, view<> ux, value_type α, view<> q, value_type β, mut_view<> grad_f) const
Compute the cost gradient, with optional scaling factors.
Definition mat-vec.tpp:115

cyqlone::CyqloneSolver::initialize_dynamics_constraints
matrix initialize_dynamics_constraints() const
Get a zero-initialized matrix for the dynamics constraints (or their multipliers).
Definition data.tpp:506

cyqlone::CyqloneSolver::transposed_general_constr
void transposed_general_constr(Context &ctx, view<> y, mut_view<> DCᵀy) const
Compute Gᵀy, where G is the general constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:105

cyqlone::CyqloneSolver::foreach_stage
void foreach_stage(Context &ctx, auto &&func, auto &&...xs) const
Call a function for each stage in the horizon, passing the stage index, the data batch index,...
Definition cyqlone.hpp:623

cyqlone::CyqloneSolver::sub_wrap_ceil_p
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
Definition indexing.tpp:82

cyqlone::CyqloneSolver::initialize_rhs
void initialize_rhs(const CyqloneStorage< value_type > &ocp, mut_view<> rhs) const
Initialize the right-hand side vector for the dynamics constraints of the OCP, using the custom Cyqlone storage format.
Definition data.tpp:147

cyqlone::CyqloneSolver::solve_reverse
void solve_reverse(Context &ctx, mut_view<> ux, mut_view<> λ)
Perform a reverse solve with the Cyqlone factorization.
Definition factor.tpp:255

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::riccati_Υ2
matrix< column_major > riccati_Υ2
Definition cyqlone.hpp:820

cyqlone::CyqloneSolver::unpack_variables
std::vector< value_type > unpack_variables(view<> ux) const
Definition data.tpp:461

cyqlone::CyqloneSolver::tricyqle_t
TricyqleSolver< VL, T, DefaultOrder, Ctx > tricyqle_t
Tricyqle solver type for solving block-tridiagonal systems in parallel.
Definition cyqlone.hpp:595

cyqlone::CyqloneSolver::solve_riccati_reverse
void solve_riccati_reverse(Context &ctx, mut_view<> ux, mut_view<> λ, mut_view<> work, std::optional< mut_view<> > Mᵀλ) const
[Modified Riccati factorization and fused forward solve]
Definition riccati.tpp:145

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::riccati_thread_assignment
index_t riccati_thread_assignment(Context &ctx) const
Definition cyqlone.hpp:972

cyqlone::CyqloneSolver::general_constr
void general_constr(Context &ctx, view<> ux, mut_view<> DCux) const
Compute the general constraints Gx, where G is the general constraint Jacobian matrix of the OCP.
Definition mat-vec.tpp:95

cyqlone::CyqloneSolver::add_wrap_ceil_P
index_t add_wrap_ceil_P(index_t a, index_t b) const
Definition indexing.tpp:102

cyqlone::CyqloneSolver::compute_schur
void compute_schur(Context &ctx, mut_view<> ux, mut_view<> λ)
[Cyqlone compute Schur]
Definition schur.tpp:31

cyqlone::CyqloneSolver::initialize_general_constraints
matrix initialize_general_constraints() const
Get a zero-initialized matrix for the general constraints (or their multipliers).
Definition data.tpp:511

cyqlone::CyqloneSolver::initialize_gradient
matrix initialize_gradient(const CyqloneStorage< value_type > &ocp) const
Initialize the gradient vector for the OCP cost function, using the custom Cyqlone storage format.
Definition data.tpp:437

cyqlone::CyqloneSolver::mut_batch_view
typename tricyqle_t::template mut_batch_view< O > mut_batch_view
Non-owning mutable view type for a single batch of v matrices.
Definition cyqlone.hpp:703

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::riccati_Υ1
matrix< column_major > riccati_Υ1
Definition cyqlone.hpp:815

cyqlone::CyqloneSolver::update_riccati_solve
void update_riccati_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Update the modified Riccati factorization of a single block column as described by Algorithm 3 in the...
Definition update.tpp:352

cyqlone::CyqloneSolver::sub_wrap_p
index_t sub_wrap_p(index_t a, index_t b) const
Subtract b from a modulo p.
Definition indexing.tpp:64

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::riccati_work
matrix< column_major > riccati_work
Definition cyqlone.hpp:799

cyqlone::CyqloneSolver::initialize_bounds
void initialize_bounds(const CyqloneStorage< value_type > &ocp, mut_view<> b_min, mut_view<> b_max) const
Initialize the lower and upper bounds for the general constraints of the OCP, using the custom Cyqlone storage format.
Definition data.tpp:204

cyqlone::CyqloneSolver::unpack_dynamics
std::vector< value_type > unpack_dynamics(view<> λ) const
Definition data.tpp:477

cyqlone::CyqloneSolver::add_wrap_ceil_p
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
Definition indexing.tpp:87

cyqlone::CyqloneSolver::mut_view
typename tricyqle_t::template mut_view< O > mut_view
Non-owning mutable view type for matrix.
Definition cyqlone.hpp:696

cyqlone::CyqloneSolver::pack_dynamics
void pack_dynamics(std::span< const value_type > λ_lin, mut_view<> λ) const
Definition data.tpp:316

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::ny_0
const index_t ny_0
Definition cyqlone.hpp:571

cyqlone::CyqloneSolver::update_tricyqle_params
void update_tricyqle_params(const TricyqleParams< value_type > &new_params)
Update the Tricyqle solver parameters.
Definition cyqlone.hpp:725

cyqlone::CyqloneSolver::pack_dynamics
matrix pack_dynamics(std::span< const value_type > λ_lin) const
Definition data.tpp:469

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::nu
const index_t nu
Definition cyqlone.hpp:569

cyqlone::CyqloneSolver::ceil_p
constexpr index_t ceil_p() const
The number of processors p rounded up to the next power of two.
Definition cyqlone.hpp:612

cyqlone::CyqloneSolver::foreach_stage_fwd
void foreach_stage_fwd(Context &ctx, auto &&func, auto &&...xs) const
Call a function for each stage in the horizon, passing the stage index, the data batch index,...
Definition cyqlone.hpp:636

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::riccati_LH
matrix< default_order > riccati_LH
Definition cyqlone.hpp:782

cyqlone::CyqloneSolver::factor_riccati_solve
void factor_riccati_solve(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
[Modified Riccati factorization and fused forward solve]
Definition riccati.tpp:23

cyqlone::CyqloneSolver::build_sparse_diag
SparseMatrix build_sparse_diag() const
Definition sparse.tpp:283

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::params
CyqloneParams< value_type > params
Definition cyqlone.hpp:711

cyqlone::CyqloneSolver::initialize_bounds
std::pair< matrix<>, matrix<> > initialize_bounds(const CyqloneStorage< value_type > &ocp) const
Initialize the lower and upper bounds for the general constraints of the OCP, using the custom Cyqlone storage format.
Definition data.tpp:445

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::column_major
static constexpr auto column_major
Definition cyqlone.hpp:686

cyqlone::CyqloneSolver::value_type
T value_type
Definition cyqlone.hpp:562

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::default_order
static constexpr auto default_order
Definition cyqlone.hpp:684

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::p
const index_t p
Definition cyqlone.hpp:601

cyqlone::CyqloneSolver::num_general_constraints
index_t num_general_constraints() const
Get the total number of general constraints in the OCP.
Definition cyqlone.hpp:585

cyqlone::CyqloneSolver::pack_variables
matrix pack_variables(std::span< const value_type > ux_lin) const
Definition data.tpp:453

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::work_Σ
matrix< column_major > work_Σ
Definition cyqlone.hpp:808

cyqlone::CyqloneSolver::get_params
CyqloneParams< value_type > get_params() const
Get the current Cyqlone solver parameters.
Definition cyqlone.hpp:714

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::tricyqle
tricyqle_t tricyqle
Definition cyqlone.hpp:747

cyqlone::CyqloneSolver::build_rhs
std::vector< value_type > build_rhs(view<> rq, view<> b, value_type scale_rq=-1, value_type scale_b=-1) const
Definition sparse.tpp:105

cyqlone::CyqloneSolver::factor_solve_impl
void factor_solve_impl(Context &ctx, value_type γ, view<> Σ, mut_view<> ux, mut_view<> λ)
[Cyqlone factorization and fused forward solve]
Definition factor.tpp:13

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::ny_N
const index_t ny_N
Definition cyqlone.hpp:572

cyqlone::CyqloneSolver::lp
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads, rounded up.
Definition cyqlone.hpp:610

cyqlone::CyqloneSolver::unpack_dynamics
void unpack_dynamics(view<> λ, std::span< value_type > λ_lin) const
Definition data.tpp:342

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::v
static constexpr index_t v
Definition cyqlone.hpp:603

cyqlone::CyqloneSolver::SharedContext
tricyqle_t::SharedContext SharedContext
Definition cyqlone.hpp:597

cyqlone::CyqloneSolver::sub_wrap_ceil_P
index_t sub_wrap_ceil_P(index_t a, index_t b) const
Definition indexing.tpp:92

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::riccati_V
matrix< default_order > riccati_V
Definition cyqlone.hpp:794

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::nx
const index_t nx
Definition cyqlone.hpp:568

cyqlone::CyqloneSolver< VL, real_t, DefaultOrder >::riccati_LAB
matrix< default_order > riccati_LAB
Definition cyqlone.hpp:788

cyqlone::CyqloneStorage
Storage for a linear-quadratic OCP with the initial states x₀ eliminated.
Definition cyqlone-storage.hpp:39

cyqlone::SparseMatrix
A sparse matrix in COO format.
Definition sparse.hpp:26

cyqlone::TricyqleSolver
Solver for block-tridiagonal systems using cyclic reduction (CR), parallel cyclic reduction (PCR), and preconditioned conjugate gradient (PCG) methods.
Definition cyqlone.hpp:66

cyqlone::TricyqleSolver::update_pcr
void update_pcr(batch_view<> fwd, batch_view<> bwd, batch_view<> Σ)
[Cyqlone update CR helper]
Definition update.tpp:180

cyqlone::TricyqleSolver::lp
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads p, rounded up.
Definition cyqlone.hpp:105

cyqlone::TricyqleSolver::factor_solve_impl
void factor_solve_impl(Context &ctx, mut_view<> λ, index_t stride=1)
Implementation of factor_solve.
Definition factor.tpp:29

cyqlone::TricyqleSolver::mul_precond
value_type mul_precond(batch_view<> r, mut_batch_view<> z, mut_batch_view<> w, batch_view< default_order > L, batch_view< default_order > K) const
Multiply a vector by the preconditioner for the final block tridiagonal system of size v.
Definition pcg.tpp:35

cyqlone::TricyqleSolver::solve_reverse_serial
void solve_reverse_serial(mut_view<> λ, mut_view<> work, index_t stride) const
[Cyqlone solve CR]
Definition factor.tpp:228

cyqlone::TricyqleSolver::work_Σ_extra
mut_batch_view< column_major > work_Σ_extra()
Definition update.tpp:706

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::lv
static constexpr index_t lv()
Definition cyqlone.hpp:111

cyqlone::TricyqleSolver::mut_batch_view
batmat::matrix::View< value_type, index_t, vl_t, vl_t, layer_stride, O > mut_batch_view
Non-owning mutable view type for a single batch of v matrices.
Definition cyqlone.hpp:165

cyqlone::TricyqleSolver::work_Σ_fwd
mut_batch_view< column_major > work_Σ_fwd(index_t l, index_t i)
Definition update.tpp:636

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::max_rank
const index_t max_rank
Definition cyqlone.hpp:76

cyqlone::TricyqleSolver::factor_L
void factor_L(index_t l, index_t i)
Update and factorize a block L in the Cholesky factor for CR level l+1 and column index i,...
Definition cr.tpp:71

cyqlone::TricyqleSolver::prefetch_L
void prefetch_L(batch_view< O > X) const
Definition cr.tpp:276

cyqlone::TricyqleSolver::get_solution
decltype(auto) get_solution(Context &ctx, mut_view<> λ, auto &&func) const
Get access to the solution computed by this thread using a user-provided function.
Definition cyqlone.hpp:206

cyqlone::TricyqleSolver::work_Ups_fwd_last
mut_batch_view< column_major > work_Ups_fwd_last()
Definition update.tpp:657

cyqlone::TricyqleSolver::work_Σ_bwd_last
mut_batch_view< column_major > work_Σ_bwd_last()
Definition update.tpp:689

cyqlone::TricyqleSolver::value_type
T value_type
Definition cyqlone.hpp:67

cyqlone::TricyqleSolver::init_subdiag
decltype(auto) init_subdiag(Context &ctx, auto &&func)
Initialize the subdiagonal blocks K of the block tridiagonal system using a user-provided function.
Definition cyqlone.hpp:185

cyqlone::TricyqleSolver::work_Σ_Q
mut_batch_view< column_major > work_Σ_Q(index_t l, index_t i)
Definition update.tpp:650

cyqlone::TricyqleSolver::work_Ups_extra
mut_batch_view< column_major > work_Ups_extra()
Definition update.tpp:699

cyqlone::TricyqleSolver::factor_skip_first
void factor_skip_first(Context &ctx)
Factorization-only variant of factor_solve_skip_first.
Definition cyqlone.hpp:380

cyqlone::TricyqleSolver::factor_solve_skip_first
void factor_solve_skip_first(Context &ctx, mut_view<> λ, index_t stride=1)
Fused factorization and forward solve.
Definition factor.tpp:48

cyqlone::TricyqleSolver::prefetch_L
void prefetch_L(index_t bi) const
Definition cr.tpp:291

cyqlone::TricyqleSolver::SharedContext
typename Context::shared_context_type SharedContext
Definition cyqlone.hpp:70

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_update_pcr_UY
matrix< column_major > work_update_pcr_UY
Definition cyqlone.hpp:351

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::column_major
static constexpr auto column_major
Definition cyqlone.hpp:148

cyqlone::TricyqleSolver::ν2
index_t ν2(index_t i) const
2-adic valuation ν₂.
Definition indexing.tpp:30

cyqlone::TricyqleSolver::solve_pcr
void solve_pcr(mut_batch_view<> λ)
Solve a linear system with the final block tridiagonal system of size v using the PCR factorization.
Definition cyqlone.hpp:452

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_update_pcr_L
matrix< column_major > work_update_pcr_L
Definition cyqlone.hpp:347

cyqlone::TricyqleSolver::layer_stride
batmat::matrix::DefaultStride layer_stride
Definition cyqlone.hpp:159

cyqlone::TricyqleSolver::work_Q_cr
mut_batch_view< column_major > work_Q_cr(index_t l, index_t i)
Definition update.tpp:628

cyqlone::TricyqleSolver::simd
batmat::datapar::deduced_simd< value_type, v > simd
Represents a SIMD vector of width v storing values of type value_type.
Definition cyqlone.hpp:114

cyqlone::TricyqleSolver::solve_y_forward
void solve_y_forward(index_t l, index_t iY, mut_view<> λ, mut_view<> w, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iY of λ at le...
Definition cr.tpp:177

cyqlone::TricyqleSolver::solve_u_backward
void solve_u_backward(index_t l, index_t iU, mut_view<> λ, mut_view<> w, index_t stride) const
Definition cr.tpp:210

cyqlone::TricyqleSolver::view
batmat::matrix::View< const value_type, index_t, vl_t, index_t, index_t, O > view
Non-owning immutable view type for matrix.
Definition cyqlone.hpp:155

cyqlone::TricyqleSolver::mut_view
batmat::matrix::View< value_type, index_t, vl_t, index_t, index_t, O > mut_view
Non-owning mutable view type for matrix.
Definition cyqlone.hpp:158

cyqlone::TricyqleSolver::get_solution
decltype(auto) get_solution(Context &ctx, Λ &&λ, F &&func) const
Get access to the solution computed by this thread using a user-provided function.
Definition cyqlone.hpp:211

cyqlone::TricyqleSolver::ν2p
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
Definition indexing.tpp:36

cyqlone::TricyqleSolver::add_wrap_ceil_p
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
Definition indexing.tpp:19

cyqlone::TricyqleSolver::solve_reverse
void solve_reverse(Context &ctx, mut_view<> λ, index_t stride=1)
Definition cyqlone.hpp:261

cyqlone::TricyqleSolver::init_diag
decltype(auto) init_diag(Context &ctx, auto &&func)
Initialize the diagonal blocks M of the block tridiagonal system using a user-provided function.
Definition cyqlone.hpp:177

cyqlone::TricyqleSolver::solve_forward
void solve_forward(Context &ctx, mut_view<> λ, index_t stride=1)
Perform only the forward solve as described by factor_solve.
Definition factor.tpp:126

cyqlone::TricyqleSolver::work_Σ_bwd
mut_batch_view< column_major > work_Σ_bwd(index_t l, index_t i)
Definition update.tpp:643

cyqlone::TricyqleSolver::sub_wrap_ceil_p
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
Definition indexing.tpp:8

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::merge_last_level_pcr
static constexpr bool merge_last_level_pcr
Definition cyqlone.hpp:428

cyqlone::TricyqleSolver::cr_thread_assignment
index_t cr_thread_assignment(index_t l, index_t c) const
Adjust thread assignment for non-power-of-two p: The diagonal blocks M(⌊p/2⌋2) are usually mapped to ...
Definition factor.tpp:277

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::circular
bool circular
Definition cyqlone.hpp:79

cyqlone::TricyqleSolver::ceil_P
constexpr index_t ceil_P() const
The number of parallel execution units P = p * v rounded up to the next power of two.
Definition cyqlone.hpp:109

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::pcr_U
matrix< default_order > pcr_U
Definition cyqlone.hpp:305

cyqlone::TricyqleSolver::work_Ups_bwd
mut_batch_view< column_major > work_Ups_bwd(index_t l, index_t i)
Definition update.tpp:620

cyqlone::TricyqleSolver::solve_forward_skip_first
void solve_forward_skip_first(Context &ctx, mut_view<> λ, index_t stride=1)
Solution-only variant of factor_solve_skip_first.
Definition cyqlone.hpp:382

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::pcr_L
matrix< default_order > pcr_L
Definition cyqlone.hpp:296

cyqlone::TricyqleSolver::cols_Ups_fwd
std::pair< index_t, index_t > cols_Ups_fwd(index_t l, index_t i) const
Definition update.tpp:558

cyqlone::TricyqleSolver::update_L
void update_L(index_t l, index_t i)
[Cyqlone update CR helper]
Definition update.tpp:23

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_update
matrix< column_major > work_update
Definition cyqlone.hpp:332

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::cr_Y
matrix< default_order > cr_Y
Definition cyqlone.hpp:282

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::m_update
std::vector< index_t > m_update
Definition cyqlone.hpp:323

cyqlone::TricyqleSolver::cols_Q_cr
std::pair< index_t, index_t > cols_Q_cr(index_t l, index_t i) const
Definition update.tpp:588

cyqlone::TricyqleSolver::update_pcr_level
void update_pcr_level(index_t m, mut_batch_view<> WYU, mut_batch_view<> WΣ)
Definition update.tpp:198

cyqlone::TricyqleSolver::solve_λ_backward
void solve_λ_backward(index_t biL, mut_view<> λ, view<> w, index_t stride) const
Definition cr.tpp:241

cyqlone::TricyqleSolver::get_solution
decltype(auto) get_solution(Context &ctx, view<> λ, auto &&func) const
Get access to the solution computed by this thread using a user-provided function.
Definition cyqlone.hpp:202

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_cr
matrix< column_major > work_cr
Definition cyqlone.hpp:286

cyqlone::TricyqleSolver::work_Ups_bwd_last
mut_batch_view< column_major > work_Ups_bwd_last()
Definition update.tpp:668

cyqlone::TricyqleSolver::work_Σ_fwd_last
mut_batch_view< column_major > work_Σ_fwd_last()
Definition update.tpp:679

cyqlone::TricyqleSolver::work_Ups_bwd_w
index_t work_Ups_bwd_w(index_t l, index_t i) const
Definition update.tpp:604

cyqlone::TricyqleSolver::set_update_rank_extra
void set_update_rank_extra(index_t m)
Definition update.tpp:547

cyqlone::TricyqleSolver::solve_λ_forward
void solve_λ_forward(index_t l, index_t iL, mut_view<> λ, view<> w, index_t stride) const
Apply the updates to block iL of the right-hand side from solve_u_forward and solve_y_forward,...
Definition cr.tpp:190

cyqlone::TricyqleSolver::work_Ups_fwd
mut_batch_view< column_major > work_Ups_fwd(index_t l, index_t i)
Definition update.tpp:612

cyqlone::TricyqleSolver::solve_y_backward
void solve_y_backward(index_t l, index_t iY, mut_view<> λ, index_t stride) const
Definition cr.tpp:225

cyqlone::TricyqleSolver::update_solve_cr
void update_solve_cr(Context &ctx, mut_view<> λ, index_t stride)
[Cyqlone update CR]
Definition update.tpp:297

cyqlone::TricyqleSolver::set_thread_update_rank
void set_thread_update_rank(Context &ctx, index_t c, index_t m)
[Cyqlone update Riccati]
Definition update.tpp:539

cyqlone::TricyqleSolver::align_t
std::integral_constant< index_t, v *alignof(value_type)> align_t
Integral constant type for the alignment of the batched matrix data structures.
Definition cyqlone.hpp:118

cyqlone::TricyqleSolver::factor_pcr
void factor_pcr()
Compute the parallel cyclic reduction factorization of the final block tridiagonal system of size v.
Definition pcr.tpp:28

cyqlone::TricyqleSolver::get_params
Params get_params() const
Get the current solver parameters.
Definition cyqlone.hpp:90

cyqlone::TricyqleSolver::ceil_p
constexpr index_t ceil_p() const
The number of processors p rounded up to the next power of two.
Definition cyqlone.hpp:107

cyqlone::TricyqleSolver::create_parallel_context
std::unique_ptr< SharedContext > create_parallel_context() const
Create a new parallel execution context, storing synchronization primitives and shared data for the p...
Definition cyqlone.hpp:122

cyqlone::TricyqleSolver::factor_solve
void factor_solve(Context &ctx, mut_view<> λ, index_t stride=1)
Fused factorization and forward solve.
Definition factor.tpp:117

cyqlone::TricyqleSolver::factor_U
void factor_U(index_t l, index_t iU)
Compute a block U in the Cholesky factor for the given CR level l and column index iU.
Definition cr.tpp:23

cyqlone::TricyqleSolver::batch_view
batmat::matrix::View< const value_type, index_t, vl_t, vl_t, layer_stride, O > batch_view
Non-owning immutable view type for a single batch of v matrices.
Definition cyqlone.hpp:162

cyqlone::TricyqleSolver::solve_pcg
void solve_pcg(mut_batch_view<> λ)
Solve a linear system with the final block tridiagonal system of size v using the preconditioned conjugate gradient (PCG) method.
Definition cyqlone.hpp:484

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::pcr_M
matrix< default_order > pcr_M
Definition cyqlone.hpp:309

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_update_Σ
matrix< column_major > work_update_Σ
Definition cyqlone.hpp:328

cyqlone::TricyqleSolver::solve_pcg
void solve_pcg(mut_batch_view<> λ, mut_batch_view<> work_pcg) const
Solve a linear system with the final block tridiagonal system of size v using the preconditioned conjugate gradient (PCG) method.
Definition pcg.tpp:54

cyqlone::TricyqleSolver::vl_t
std::integral_constant< index_t, v > vl_t
Integral constant type for the vector length.
Definition cyqlone.hpp:116

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::v
static constexpr index_t v
Definition cyqlone.hpp:103

cyqlone::TricyqleSolver::factor
void factor(Context &ctx)
Perform only the factorization as described by factor_solve.
Definition factor.tpp:122

cyqlone::TricyqleSolver::solve_reverse_parallel
void solve_reverse_parallel(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride) const
[Cyqlone solve CR]
Definition factor.tpp:179

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::m_update_u0
index_t m_update_u0
Definition cyqlone.hpp:325

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_pcg
matrix< column_major > work_pcg
Definition cyqlone.hpp:313

cyqlone::TricyqleSolver::prefetch_U
void prefetch_U(index_t l, index_t iU) const
Definition cr.tpp:297

cyqlone::TricyqleSolver::solve_reverse
void solve_reverse(Context &ctx, mut_view<> λ, mut_view<> work, index_t stride=1) const
Perform the backward solve phase, after the forward solve phase has been performed by factor_solve.
Definition factor.tpp:164

cyqlone::TricyqleSolver::prefetch
void prefetch(batch_view< O > X) const
[Cyqlone solve CR helper]
Definition cr.tpp:260

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_hyh
matrix< column_major > work_hyh
Definition cyqlone.hpp:336

cyqlone::TricyqleSolver::factor_pcr_level_parallel
void factor_pcr_level_parallel(Context &ctx)
Perform a single level of the PCR factorization.
Definition pcr.tpp:109

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::block_size
const index_t block_size
Definition cyqlone.hpp:75

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::params
Params params
Definition cyqlone.hpp:87

cyqlone::TricyqleSolver::init_rhs
decltype(auto) init_rhs(Context &ctx, mut_view<> b, auto &&func) const
Initialize the right-hand side of the linear system using a user-provided function.
Definition cyqlone.hpp:194

cyqlone::TricyqleSolver::solve_u_forward
void solve_u_forward(index_t l, index_t iU, mut_view<> λ, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iU of λ at le...
Definition cr.tpp:163

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::default_order
static constexpr auto default_order
Definition cyqlone.hpp:146

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::p
const index_t p
Definition cyqlone.hpp:101

cyqlone::TricyqleSolver::solve_pcr
void solve_pcr(mut_batch_view<> λ, mut_batch_view<> work_pcr) const
Solve a linear system with the final block tridiagonal system of size v using the PCR factorization.
Definition pcr.tpp:181

cyqlone::TricyqleSolver::update_params
void update_params(const Params &new_params)
Update the solver parameters.
Definition cyqlone.hpp:93

cyqlone::TricyqleSolver::work_Ups_fwd_w
index_t work_Ups_fwd_w(index_t l, index_t i) const
Definition update.tpp:593

cyqlone::TricyqleSolver::update_K
void update_K(index_t l, index_t i)
Compute a subdiagonal block K of the Schur complement for CR level l+1 and column index i,...
Definition cr.tpp:50

cyqlone::TricyqleSolver::solve_pcr_level
void solve_pcr_level(mut_batch_view<> λ, mut_batch_view<> work_pcr) const
Perform a single level of the PCR solve.
Definition pcr.tpp:194

cyqlone::TricyqleSolver::update_Y
void update_Y(index_t l, index_t i)
Definition update.tpp:157

cyqlone::TricyqleSolver::cols_Ups_bwd
std::pair< index_t, index_t > cols_Ups_bwd(index_t l, index_t i) const
Definition update.tpp:573

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::cr_U
matrix< default_order > cr_U
Definition cyqlone.hpp:277

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::pcr_Y
matrix< default_order > pcr_Y
Definition cyqlone.hpp:301

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::cr_L
matrix< default_order > cr_L
Definition cyqlone.hpp:272

cyqlone::TricyqleSolver::Params
TricyqleParams< value_type > Params
Definition cyqlone.hpp:68

cyqlone::TricyqleSolver< VL, T, DefaultOrder, Ctx >::work_update_pcr_Σ
matrix< column_major > work_update_pcr_Σ
Definition cyqlone.hpp:343

cyqlone::TricyqleSolver::matrix
batmat::matrix::Matrix< value_type, index_t, vl_t, index_t, O, align_t > matrix
Owning type for a batch of matrices (with batch size v).
Definition cyqlone.hpp:152

cyqlone::TricyqleSolver::update_U
void update_U(index_t l, index_t i)
Definition update.tpp:123

cyqlone::TricyqleSolver::factor_pcr_parallel
void factor_pcr_parallel(Context &ctx)
Compute the parallel cyclic reduction factorization of the final block tridiagonal system of size v.
Definition pcr.tpp:101

cyqlone::TricyqleSolver::mul_Mv
value_type mul_Mv(batch_view<> p, mut_batch_view<> Mp, batch_view< default_order > L, batch_view< default_order > K) const
Multiply a vector by the final block tridiagonal matrix of size v.
Definition pcg.tpp:23

cyqlone::TricyqleSolver::prefetch_Y
void prefetch_Y(index_t l, index_t iY) const
Definition cr.tpp:306

cyqlone::TricyqleSolver::clear_update_rank_extra
void clear_update_rank_extra()
Definition update.tpp:552

cyqlone::TricyqleSolver::factor_pcr_level
void factor_pcr_level()
Perform a single level of the PCR factorization.
Definition pcr.tpp:38

cyqlone::TricyqleSolver::factor_Y
void factor_Y(index_t l, index_t iY)
Compute a block Y in the Cholesky factor for the given CR level l and column index iY.
Definition cr.tpp:37

cyqlone::TricyqleSolver::Context
Ctx Context
Definition cyqlone.hpp:69

cyqlone::parallel::Context
Thread context for parallel execution.
Definition parallel.hpp:64

timing.hpp