develop/Doxygen/schur_8tpp_source.html

#include <cyqlone/cyqlone.hpp>

#include <cyqlone/linalg.hpp>

#include <cyqlone/tracing.hpp>


#include <batmat/assume.hpp>

#include <batmat/linalg/gemm.hpp>

#include <batmat/linalg/gemv.hpp>

#include <batmat/linalg/potrf.hpp>

#include <batmat/linalg/shift.hpp>

#include <batmat/linalg/trsm.hpp>

#include <batmat/linalg/trtri.hpp>

#include <utility>


namespace CYQLONE_NS(cyqlone) {


using namespace linalg;

using namespace batmat::linalg;


// Algorithm 2 “Cyqlone factorization”

// §4.3 “Computation of the Schur complement (step 3)”

//

// Build the Schur complement after factorizing the Riccati blocks, and/or update the right-hand

// side of the Schur complement after performing a forward solve of the Riccati blocks.

//

// See also: factor.tpp


//! [Cyqlone compute Schur]

// Per-thread Schur complement step for the interval `c` assigned to the calling thread
// (c = riccati_thread_assignment(ctx)).
//
// Template parameters:
//   Factor  If true, run the factorization part: invert LQ into T(c), compute the first coupling
//           block (written to tricyqle.cr_U or tricyqle.cr_Y), and build the diagonal block
//           M(c) in tricyqle.cr_L.batch(c). Depending on p and ν2p(c), M(c) is also
//           Cholesky-factorized here.
//   Solve   If true, run the right-hand-side part: update λ.batch(c * n) using the last nx rows
//           of ux.batch(d1_next), followed by a triangular solve with M when
//           ν2p(c) == 0 and p != 1.
//
// Arguments:
//   ctx  Context used to look up the thread assignment and to synchronize with the other threads
//        (arrive / wait / arrive_and_wait).
//   ux   Only read here (bottom nx rows of batch d1_next), and only if Solve is true.
//   λ    Batch c * n is passed to sub() and trsm(), and only if Solve is true.
//        NOTE(review): presumably both update λ in place -- confirm against the two-argument
//        overloads of sub() and trsm(), which are not visible in this file.
//
// The numbered comments ("7|", "8|", ...) refer to the line numbers of the pseudocode of
// Algorithm 2; the statement order below intentionally mirrors that pseudocode.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>

template <bool Factor, bool Solve>

// NOLINTNEXTLINE(*-cognitive-complexity) // Needs to match pseudocode structure


void CyqloneSolver<VL, T, DefaultOrder, Ctx>::compute_schur(Context &ctx, mut_view<> ux,

                                                            mut_view<> λ) {

    // Index of the interval handled by this thread, and of its successor (wraps around modulo p).
    const index_t c   = riccati_thread_assignment(ctx);

    const auto c_next = add_wrap_p(c, 1);

    //  7|  j₁ = n(c-1)+1,  jₙ = nc

    // Batch indices into ux/λ. dn_next is only used to derive d1_next, which is n - 1 batches
    // past dn_next.
    const auto dn = c * n, dn_next = c_next * n, d1_next = dn_next + n - 1;

    //  8|  i˃ = c,  i˂ = c-1

    // Note that i_bwd wraps modulo ceil_p(), whereas c_next above wraps modulo p.
    const index_t i_fwd = c, i_bwd = sub_wrap_ceil_p(c, 1);

    // Lower-triangular view of the diagonal block M(c); it is written in the Factor branch and
    // used by trsm in the Solve branch.
    auto M = tril(tricyqle.cr_L.batch(c));

    // 13|  W = [ LB(jₙ) ... LB(j₁) LA(j₁) ]    -- The order here is [ LA(j₁) LB(jₙ) ... LB(j₁) ]

    // W is the trailing nx + nu * n columns of this interval's riccati_LAB batch.
    auto W = riccati_LAB.batch(c).right_cols(nx + nu * n);

    if constexpr (Factor) {

        auto LH = riccati_LH.batch(c);

        auto LQ = tril(LH.bottom_right(nx, nx));

        //  9|  T(c) = LQ(j₁)⁻ᵀ

        BATMAT_ASSERT(nu >= 1); // T = LQ⁻ᵀ is upper triangular, stored one row up from LQ itself

        auto Tc = triu(LH.right_cols(nx).middle_rows(nu - 1, nx));

        // The braces delimit the region covered by the tracing macros.
        // NOTE(review): presumably the trace is scope-based -- confirm in tracing.hpp.
        {

            GUANAQO_TRACE("Invert Q", c);

            CYQ_TRACE_WRITE(T, c, 0);

            // Tc is an upper-triangular view, so its transpose is passed as the (lower-triangular)
            // destination of the triangular inverse.
            trtri(LQ, Tc.transposed());

        }

        // Signal that T(c) has been written. The matching ctx.wait() is deferred until after the
        // first coupling block below, which only needs this thread's own T(c) and LA(j₁).
        auto T_ready = ctx.arrive();

        auto LA1     = riccati_LAB.batch(c).middle_cols(nx * (n - 1), nx); // LA(j₁)

        // 10|  if ν2(i˂) > ν2(i˃)    K˂(i˃) = -T(c) LA(j₁)ᵀ    else    K˃(i˂) = -LA(j₁) T(c)ᵀ

        if (ν2p(i_bwd) > ν2p(i_fwd)) {

            GUANAQO_TRACE("Compute first U", i_fwd);

            CYQ_TRACE_WRITE(Kb, i_fwd, 0);

            trmm_neg(Tc, LA1.transposed(), tricyqle.cr_U.batch(i_fwd));

        } else {

            GUANAQO_TRACE("Compute first Y", i_bwd);

            CYQ_TRACE_WRITE(Kf, i_bwd, 0);

            // For i_fwd == 0 the result is written with the rotate/mask options (only when the
            // vector length v exceeds 1). For i_fwd == 0 and v == 1, nothing is computed here.
            // NOTE(review): presumably the rotation/mask handle the wrap-around across vector
            // lanes for the first interval -- confirm against the batmat option semantics.
            if (i_fwd > 0)

                trmm_neg(LA1, Tc.transposed(), tricyqle.cr_Y.batch(i_bwd));

            else if constexpr (v > 1)

                trmm_neg(LA1, Tc.transposed(), tricyqle.cr_Y.batch(i_bwd), //

                         with_rotate_C<-1>, with_rotate_D<-1>, with_mask_D<-1>);

        }

        // 11|  -- sync --

        //      Wait for the inversion in the next interval

        ctx.wait(std::move(T_ready));

        //      Each column of the cyclic part with coupling equations is updated by two threads:

        //      one for the forward, and one for the backward coupling. Update the diagonal blocks

        //      of the coupling equations, first forward in time ...

        auto R̂ŜQ̂_next = riccati_LH.batch(c_next);

        // 12|  M(c)˂ = T(c+1) T(c+1)ᵀ

        // Same sub-block as Tc above, but taken from the next interval's riccati_LH batch.
        auto Tc_next = triu(R̂ŜQ̂_next.right_cols(nx).middle_rows(nu - 1, nx));

        {

            CYQ_TRACE_READ(T, c_next, 0);

            GUANAQO_TRACE("Compute TTᵀ", c_next);

            // The rotated variant is used only when c_next wrapped around to 0 and v != 1.
            if (c_next > 0 || v == 1)

                trmm(Tc_next, Tc_next.transposed(), M);

            else

                trmm(Tc_next, Tc_next.transposed(), M, with_rotate_C<-1>, with_rotate_D<-1>);

        }

        //      And finally backward in time, optionally fused with the factorization.

        // Three cases: single-threaded (factor into the separate pcr_L storage), ν2p(c) == 0
        // (fused update + in-place factorization), or otherwise only accumulate WWᵀ into M.
        // NOTE(review): in the last case M is left unfactorized here; presumably it is processed
        // later by the block-tridiagonal solver -- confirm in the tricyqle implementation.
        if (p == 1) { // no multi-threading

            GUANAQO_TRACE("Factor M last", c);

            CYQ_TRACE_WRITE(L, c, 0);

            auto L0 = tril(tricyqle.pcr_L.batch(0));

            // 13|  M(c)˃ = WWᵀ

            // 14|  M(c) = M(c)˂ + M(c)˃

            syrk_add(W, M);

            // 16|  L(c) = chol(M(c))

            potrf(M, L0); // Final block is stored separately (for PCR/PCG later)

        } else if (ν2p(i_fwd) == 0) {

            GUANAQO_TRACE("Factor M", c);

            CYQ_TRACE_WRITE(L, c, 0);

            CYQ_TRACE_WRITE(L, c, 1);

            // 13|  M(c)˃ = WWᵀ

            // 14|  M(c) = M(c)˂ + M(c)˃

            // 16|  L(c) = chol(M(c))

            syrk_add_potrf(W, M);

        } else {

            GUANAQO_TRACE("Compute WWᵀ", c);

            CYQ_TRACE_WRITE(M, c, 0);

            // 13|  M(c)˃ = WWᵀ

            // 14|  M(c) = M(c)˂ + M(c)˃

            syrk_add(W, M);

        }

    }

    if constexpr (Solve) {

        // Factor is a template parameter, so this condition is a compile-time constant. Without
        // the Factor branch no synchronization has happened in this function yet, hence the
        // explicit barrier.
        // NOTE(review): when Factor is true, the arrive/wait pair above is presumably what makes
        // x_next available -- confirm.
        if (!Factor)

            ctx.arrive_and_wait(); // Wait for x_next

        {

            GUANAQO_TRACE("Update λ", dn);

            // x_next is the last nx rows of batch d1_next of ux.
            auto x_next = ux.batch(d1_next).bottom_rows(nx);

            // Same wrap-around distinction as for TTᵀ above: the rotated variant is used only when
            // c_next == 0 and v != 1.
            // NOTE(review): the two-argument sub() presumably computes λ ← λ - x_next in place
            // (the three-argument form is documented as C = A - B, with Rotate affecting B).
            if (c_next > 0 || v == 1)

                sub(λ.batch(dn), x_next);

            else

                sub(λ.batch(dn), x_next, with_rotate<1>);

        }

        {

            // TODO: λ(dn) here has a different thread assignment than in TricyqleSolver

            // The trace is emitted unconditionally, even when the condition below is false and no
            // solve is performed. The condition matches the branch above in which M was
            // factorized in place by syrk_add_potrf.
            GUANAQO_TRACE("Solve λ", dn);

            if (ν2p(i_fwd) == 0 && p != 1)

                trsm(M, λ.batch(dn));

        }

    }

}


//! [Cyqlone compute Schur]


} // namespace CYQLONE_NS(cyqlone)

BATMAT_ASSERT
#define BATMAT_ASSERT(x)

cyqlone.hpp
The main header for the Cyqlone and Tricyqle linear solvers.

batmat::linalg::syrk_add_potrf
void syrk_add_potrf(VA &&A, Structured< VC, SC > C, Structured< VD, SC > D, simdified_value_t< VA > regularization=0)

batmat::linalg::trmm_neg
void trmm_neg(Structured< VA, SA > A, Structured< VB, SB > B, Structured< VD, SD > D, Opts... opts)

batmat::linalg::trsm
void trsm(Structured< VA, SA > A, VB &&B, VD &&D, with_rotate_B_t< RotB >={})

batmat::linalg::trtri
void trtri(Structured< VA, MatrixStructure::LowerTriangular > A, Structured< VD, MatrixStructure::LowerTriangular > D)

batmat::linalg::trmm
void trmm(Structured< VA, SA > A, Structured< VB, SB > B, Structured< VD, SD > D, Opts... opts)

batmat::linalg::syrk_add
void syrk_add(VA &&A, Structured< VC, SD > C, Structured< VD, SD > D, Opts... opts)

batmat::linalg::potrf
void potrf(Structured< VC, SC > C, Structured< VD, SC > D, simdified_value_t< VC > regularization=0)

batmat::linalg::triu
constexpr auto triu(M &&m)

cyqlone::linalg::sub
void sub(VA &&A, VB &&B, VC &&C, with_rotate_t< Rotate >={})
Subtract two matrices or vectors C = A - B. Rotate affects B.
Definition linalg.hpp:401

batmat::linalg::tril
constexpr auto tril(M &&m)

GUANAQO_TRACE
#define GUANAQO_TRACE(name, instance,...)

linalg.hpp

batmat::linalg::with_rotate_D
constexpr with_rotate_D_t< I > with_rotate_D

batmat::linalg::with_rotate
constexpr with_rotate_t< I > with_rotate

batmat::linalg::with_mask_D
constexpr with_mask_D_t< I > with_mask_D

batmat::linalg::with_rotate_C
constexpr with_rotate_C_t< I > with_rotate_C

cyqlone::CyqloneSolver::n
const index_t n
Number of stages per thread per vector lane (rounded up).
Definition cyqlone.hpp:605

cyqlone::CyqloneSolver::ν2p
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
Definition indexing.tpp:125

cyqlone::CyqloneSolver::add_wrap_p
index_t add_wrap_p(index_t a, index_t b) const
Add b to a modulo p.
Definition indexing.tpp:73

cyqlone::CyqloneSolver::Context
tricyqle_t::Context Context
Definition cyqlone.hpp:596

cyqlone::CyqloneSolver::sub_wrap_ceil_p
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
Definition indexing.tpp:82

cyqlone::CyqloneSolver::riccati_thread_assignment
index_t riccati_thread_assignment(Context &ctx) const
Definition cyqlone.hpp:972

cyqlone::CyqloneSolver::compute_schur
void compute_schur(Context &ctx, mut_view<> ux, mut_view<> λ)
[Cyqlone compute Schur]
Definition schur.tpp:31

cyqlone::CyqloneSolver::mut_view
typename tricyqle_t::template mut_view< O > mut_view
Non-owning mutable view type for matrix.
Definition cyqlone.hpp:696

cyqlone::CyqloneSolver::nu
const index_t nu
Number of controls of the OCP.
Definition cyqlone.hpp:569

cyqlone::CyqloneSolver::riccati_LH
matrix< default_order > riccati_LH
Cholesky factors of the Hessian blocks for the Riccati recursion.
Definition cyqlone.hpp:782

cyqlone::CyqloneSolver::p
const index_t p
Number of processors/threads.
Definition cyqlone.hpp:601

cyqlone::CyqloneSolver::tricyqle
tricyqle_t tricyqle
Block-tridiagonal solver (CR/PCR/PCG).
Definition cyqlone.hpp:747

cyqlone::CyqloneSolver::v
static constexpr index_t v
Vector length.
Definition cyqlone.hpp:603

cyqlone::CyqloneSolver::nx
const index_t nx
Number of states of the OCP.
Definition cyqlone.hpp:568

cyqlone::CyqloneSolver::riccati_LAB
matrix< default_order > riccati_LAB
Storage for the matrices LB(j), Acl(j) and LA(j₁) for the Riccati recursion.
Definition cyqlone.hpp:788

tracing.hpp

CYQ_TRACE_WRITE
#define CYQ_TRACE_WRITE(...)
Definition tracing.hpp:62

CYQ_TRACE_READ
#define CYQ_TRACE_READ(...)
Definition tracing.hpp:63