// develop/Doxygen/update_8tpp_source.html

#include <cyqlone/cyqlone.hpp>

#include <cyqlone/tracing.hpp>


#include <batmat/assume.hpp>

#include <batmat/linalg/compress.hpp>

#include <batmat/linalg/copy.hpp>

#include <batmat/linalg/gemm-diag.hpp>

#include <batmat/linalg/gemm.hpp>

#include <batmat/linalg/hyhound.hpp>

#include <batmat/linalg/simdify.hpp>

#include <batmat/loop.hpp>


#include <numeric>


namespace CYQLONE_NS(cyqlone) {


using namespace batmat::linalg;


// Algorithm 4 “Cyqlone factorization updates”


//! [Cyqlone update CR helper]

/// Update the diagonal block for column @p i at cyclic-reduction level @p l.
///
/// For levels below the last one (`l < lp()`), a single `hyhound_diag` call is applied to the
/// lower triangle of `cr_L.batch(i)`, using the workspaces `work_Q_cr(l, i)`, `work_Σ_Q(l, i)`
/// and `work_hyh.batch(i)`, after which the function returns.
///
/// Otherwise (last level), the function
///  - optionally calls `update_pcr` (PCR solve method, sufficiently low rank, and `v > 1`),
///  - updates or recomputes `cr_Y.batch(0)` (only if `v > 1`),
///  - updates `cr_L.batch(0)` through `syrk_diag_add` (PCR solve method only),
///  - updates `pcr_L.batch(0)` through `hyhound_diag` (only when `update_pcr` was not called),
///  - calls `factor_pcr` if the PCR solve method is used and the rank was too high to update.
///
/// @note In the last level, the forward workspaces (`Σ_fwd` and `Υ0_fwd`) are rotated in place.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>


void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_L(index_t l, index_t i) {

    // Intermediate CR levels: one hyhound_diag call on L(i), then return early.
    if (l < lp()) {

        CYQ_TRACE_READ(Upf, i, 0);

        CYQ_TRACE_READ(Upb, i, 0);

        GUANAQO_TRACE("Update L", i);

        CYQ_TRACE_WRITE(Q, i, 0);

        CYQ_TRACE_WRITE(Q, i, 1);

        auto L   = tril(cr_L.batch(i));

        auto UpQ = work_Q_cr(l, i);

        auto Σ   = work_Σ_Q(l, i);

        auto WQ  = work_hyh.batch(i);

        // 16|  [ L̃(i) | 0 ] = [ L(i) | Υ˃(i)  Υ˂(i) ] Q̆(i),  blkdiag(-I, 𝒮(i;l+1))-orthogonal

        hyhound_diag(L, UpQ, Σ, WQ);

        return;

    }


    // Last level

    auto M0 = tril(cr_L.batch(0)), L0 = tril(pcr_L.batch(0));

    auto Y0   = cr_Y.batch(0);

    auto Ypen = cr_Y.batch(p / 2), Upen = cr_U.batch(p / 2); // Subdiag blocks of penultimate level


    auto Υ0_bwd = work_Ups_bwd_last(), Υ0_fwd = work_Ups_fwd_last();

    auto Σ_bwd = work_Σ_bwd_last(), Σ_fwd = work_Σ_fwd_last();

    // The forward and backward ranks may only differ when an extra rank for u(0) has been set.
    BATMAT_ASSERT(Σ_bwd.rows() == Σ_fwd.rows() || m_update_u0 >= 0);


    // For p=2, v=4, the update of the last level looks like:

    //

    // [ Υ˂(0)                Υ˃(0) | L(0)                   ]

    // [ Υ˃(2)  Υ˂(2)               | Y(0)  L(2)             ]

    // [        Υ˃(4)  Υ˂(4)        |       Y(2)  L(4)       ]

    // [               Υ˃(6)  Υ˂(6) |             Y(4)  L(6) ]

    //

    // where the blocks are stored as follows:

    //  Υ0_bwd = [ Υ˂(0)  Υ˂(2)  Υ˂(4)  Υ˂(6) ]

    //  Υ0_fwd = [ Υ˃(2)  Υ˃(4)  Υ˃(6)  Υ˃(0) ]

    //  L0     = [ L(0)   L(2)   L(4)   L(6) ]

    //  Y0     = [ Y(0)   Y(2)   Y(4)   -    ]

    //

    // Note that Υ˂ and Υ˃ are aligned by column, not by row. To apply the updates (row-wise),

    // we therefore need to rotate Υ0_fwd by one block to the right first.


    // Check the rank to decide whether to update or recompute

    const index_t nj      = std::max(Σ_fwd.rows(), Σ_bwd.rows());

    auto pcr_update_thres = params.pcr_max_update_fraction * static_cast<double>(block_size);

    auto y0_update_thres  = params.cr_max_update_fraction_Y0 * static_cast<double>(block_size);

    bool update           = static_cast<double>(nj) < pcr_update_thres;

    bool update_y         = static_cast<double>(nj) < y0_update_thres;

    // The PCR update path additionally requires v > 1; with v == 1 and a low rank, neither
    // update_pcr nor factor_pcr is called below.
    bool do_update_pcr    = params.solve_method == SolveMethod::PCR && update && v > 1;

    bool do_refactor_pcr  = params.solve_method == SolveMethod::PCR && !update;


    CYQ_TRACE_READ(Upf, 0, 0);

    CYQ_TRACE_READ(Upb, 0, 0);

    // Perform the PCR update

    if (do_update_pcr)

        update_pcr(Υ0_fwd, Υ0_bwd, Σ_bwd);


    { // Update or recompute the matrices Y(0), M(0) and L(0) in the last CR level

        GUANAQO_TRACE("Update L", i);

        // Update or recompute the subdiagonal block Y of the last CR level.

        // If there's only a single thread, we always update because there is no previous CR level

        // to recompute from (we would need to recompute the Riccati products, which is slow).

        // Otherwise, we only update if the rank is sufficiently low.

        if constexpr (v > 1) {

            if (update_y || p == 1)

                gemm_diag_add(Υ0_fwd, Υ0_bwd.transposed(), Y0, Σ_fwd);

            else

                gemm_neg(Ypen, Upen.transposed(), Y0);

        }

        // If at some point in the future we need to refactor PCR, we may need Y(0). So we just

        // always update it here. Alternatively, we could recompute it when needed, but that would

        // complicate the bookkeeping. Besides, we need Y(0) for the PCG case anyway.


        // Make sure the diagonal block M of the last CR level is up to date (it is needed for PCR).

        // This is done in two steps, the backward and the forward updates, the latter of which

        // requires a rotation first.

        if (params.solve_method == SolveMethod::PCR)

            syrk_diag_add(Υ0_bwd, M0, Σ_bwd);

        // When using PCG, we need the Cholesky factors L(0) of M(0) for the preconditioner, so

        // update them here. Like with the update of M(0), we do this in two steps.

        if (!do_update_pcr)

            hyhound_diag(L0, Υ0_bwd, Σ_bwd);

        // Rotate and repeat for the forward update.

        batmat::linalg::copy(Σ_fwd, Σ_fwd, with_rotate<-1>);

        batmat::linalg::copy(Υ0_fwd, Υ0_fwd, with_rotate<-1>);

        if (params.solve_method == SolveMethod::PCR)

            syrk_diag_add(Υ0_fwd, M0, Σ_fwd);

        if (!do_update_pcr)

            hyhound_diag(L0, Υ0_fwd, Σ_fwd);

        // TODO: we should actually merge these two hyhound_diag calls to make sure that the

        //       intermediate matrix after the backward update does not become indefinite

        //       (although this shouldn't be an issue for QPALM, at least not in exact arithmetic).

        //       We already have the code for this in update_pcr_level.

    }


    // Finally, recompute the PCR factorization if we did not do an update.

    if (do_refactor_pcr)

        factor_pcr(); // TODO: use parallel variant (when doing so, synchronize in update_solve_cr)

}


/// Apply the transformation stored for column @p i at CR level @p l to the block `cr_U.batch(i)`
/// and to the backward update matrix, writing the result into the workspace of level `l + 1`.
///
/// When `v == 1` and `i >= p`, no transformation is applied: the backward (and, where
/// applicable, the forward) update matrices are simply copied to the next level's workspace.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_U(index_t l, index_t i) {
    const index_t i_bwd = sub_wrap_ceil_p(i, 1 << l);
    CYQ_TRACE_READ(Upb, i_bwd, 0);
    CYQ_TRACE_READ(Q, i, 1);
    GUANAQO_TRACE("Update U", i);
    CYQ_TRACE_WRITE(Upb, i_bwd, 0);
    // Backward update matrices at the current and at the next level
    auto Υb_cur  = work_Ups_bwd(l, i_bwd);
    auto Υb_next = work_Ups_bwd(l + 1, i_bwd);
    if constexpr (v == 1) {
        if (i >= p) { // happens in cases where p is not a power of two
            // No Q̆(i) exists for this column: forward the update matrices unchanged
            // (skipping the copy if both levels share the same storage).
            if (Υb_cur.data() != Υb_next.data())
                copy(Υb_cur, Υb_next);
            // With an odd number of threads, update_Y is never called for this column i,
            // so the forward update matrices have to be forwarded here too.
            index_t i_fwd = add_wrap_ceil_p(i, 1 << l);
            if (i_fwd >= p)
                i_fwd = 0;
            const bool fwd_is_zero = i_fwd == 0 && m_update_u0 >= 0; // Υ˃(0) = 0
            if (!fwd_is_zero) {
                auto Υf_cur  = work_Ups_fwd(l, i_fwd);
                auto Υf_next = work_Ups_fwd(l + 1, i_fwd);
                if (Υf_cur.data() != Υf_next.data())
                    copy(Υf_cur, Υf_next);
            }
            return;
        }
    }
    auto UpQ = work_Q_cr(l, i);
    auto Σ   = work_Σ_Q(l, i);
    auto WQ  = work_hyh.batch(i);
    auto U   = cr_U.batch(i);
    // 18|  [ Ũ(i) | Υ˂(i-2^l;l+1) ] = [ U(i) | Υ˂(i-2^l;l)  0 ] Q̆(i)
    hyhound_diag_apply(U, Υb_cur, Υb_next, UpQ, Σ, WQ, 0);
}


/// Apply the transformation stored for column @p i at CR level @p l to the block `cr_Y.batch(i)`
/// and to the forward update matrix, writing the result into the workspace of level `l + 1`.
///
/// Nothing is done when the forward index wraps around to zero while an extra update rank for
/// u(0) is set (`m_update_u0 >= 0`).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_Y(index_t l, index_t i) {
    index_t i_fwd = add_wrap_ceil_p(i, 1 << l);
    CYQ_TRACE_READ(Upf, i_fwd, 0);
    CYQ_TRACE_READ(Q, i, 0);
    GUANAQO_TRACE("Update Y", i);
    CYQ_TRACE_WRITE(Upf, i_fwd, 0);
    if (i_fwd >= p)
        i_fwd = 0;
    const bool fwd_is_zero = i_fwd == 0 && m_update_u0 >= 0;
    if (fwd_is_zero)
        return; // Υ˃(0) = 0
    auto UpQ = work_Q_cr(l, i);
    auto Σ   = work_Σ_Q(l, i);
    auto WQ  = work_hyh.batch(i);
    auto Y   = cr_Y.batch(i);
    // Forward update matrices at the current and at the next level
    auto Υf_cur  = work_Ups_fwd(l, i_fwd);
    auto Υf_next = work_Ups_fwd(l + 1, i_fwd);
    // Difference in width between the next-level and the current-level forward matrices
    const auto extra_cols = Υf_next.cols() - Υf_cur.cols();
    // 20|  [ Ỹ(i) | Υ˃(i+2^l;l+1) ] = [ Y(i) | 0  Υ˃(i+2^l;l) ] Q̆(i)
    hyhound_diag_apply(Y, Υf_cur, Υf_next, UpQ, Σ, WQ, extra_cols);
}


//! [Cyqlone update CR helper]


//! [PCR update]

/// Update the PCR factorization: copy the update matrices and diagonal entries into the PCR
/// workspaces, then call `update_pcr_level<Level>` for each `Level` in `[0, lv())`, in order.
///
/// @param fwd   Forward update matrices (copied into the workspace with a rotation of -1).
/// @param bwd   Backward update matrices; must have the same number of columns as @p fwd.
/// @param Σbwd  Diagonal entries, copied into the bottom rows of the Σ workspace.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_pcr(batch_view<> fwd, batch_view<> bwd,
                                                          batch_view<> Σbwd) {
    const index_t m = fwd.cols();
    BATMAT_ASSUME(m == bwd.cols());
    const index_t full_width = VL * m, half_width = full_width / 2;
    auto WYU = work_update_pcr_UY.left_cols(full_width).batch(0);
    // WY is the left half and WU the right half of WYU: both start in the middle and grow
    // outwards.
    auto WY = WYU.left_cols(half_width);
    auto WU = WYU.right_cols(half_width);
    auto Σ  = work_update_pcr_Σ.top_rows(full_width).batch(0);
    batmat::linalg::copy(bwd, WU.left_cols(m));
    batmat::linalg::copy(fwd, WY.right_cols(m), with_rotate<-1>);
    batmat::linalg::copy(Σbwd, Σ.bottom_rows(m));
    // Unroll the levels at compile time, since the level is a template argument.
    auto run_all_levels = [&]<index_t... Levels>(std::integer_sequence<index_t, Levels...>) {
        (this->template update_pcr_level<Levels>(m, WYU, Σ), ...);
    };
    run_all_levels(std::make_integer_sequence<index_t, TricyqleSolver::lv()>{});
}


/// Apply the factorization update to a single PCR level.
///
/// For levels with `Level + 1 < lv()`, this calls `hyhound_diag_cyclic` on `pcr_L.batch(Level)`,
/// `pcr_Y.batch(Level)` and `pcr_U.batch(Level)`. For the final level, it calls `hyhound_diag_2`
/// on `pcr_L.batch(Level)` and `pcr_U.batch(Level)`, followed by `hyhound_diag` on
/// `pcr_L.batch(Level + 1)`.
///
/// @tparam Level  PCR level to update.
/// @param m    Number of columns of the update matrices at level zero; this level operates on
///             `m << Level` columns per update matrix.
/// @param WYU  Workspace containing the update matrices (rotated and overwritten in place).
/// @param WΣ   Workspace containing the diagonal entries; the bottom `2 * (m << Level)` rows
///             are used (rotated and overwritten in place).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>

template <index_t Level>


void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_pcr_level(index_t m, mut_batch_view<> WYU,

                                                                mut_batch_view<> WΣ) {

    constexpr index_t l = Level;

    // The algorithm requires the update matrices that are not reduced in the current level to be

    // offset by 2^l. We could do this by first rotating them by 2^l, applying the Householder

    // transformations, and then rotating them back. However, this would be inefficient, so instead

    // we leave the workspace rotated by 2^l from the previous level, and adjust the rotations in

    // the next level.

    // prev_rot is zero at level 0 (1 >> 1), and 2^(l-1) otherwise.
    constexpr index_t rot = 1 << l, prev_rot = rot >> 1;

    const index_t ml = m << l;

    GUANAQO_TRACE("Update PCR", l);

    auto Σ = WΣ.bottom_rows(2 * ml);

    // Undo the rotation left over from the previous level (nothing to undo at level 0), ...
    if constexpr (prev_rot != 0)

        batmat::linalg::copy(Σ.bottom_rows(ml), Σ.bottom_rows(ml), with_rotate<+prev_rot>);

    // ... then fill the top half with a copy of the bottom half, rotated by -rot.
    batmat::linalg::copy(Σ.bottom_rows(ml), Σ.top_rows(ml), with_rotate<-rot>);

    if constexpr (l + 1 < lv()) {

        //          S(-1)    S(0)

        //  WL = [ Υ˃(0)  | Υ˂(0)  ]

        //  WY = [   0    | Υ˃(+1) ]

        //  WU = [ Υ˂(-1) |   0    ]

        auto WL  = work_update_pcr_L.left_cols(2 * ml).batch(0);

        auto WU0 = WYU.right_cols(VL * m / 2).left_cols(2 * ml);

        auto W0Y = WYU.left_cols(VL * m / 2).right_cols(2 * ml);

        auto WY  = W0Y.right_cols(ml);

        auto WU  = WU0.left_cols(ml);

        // undo workspace rotation

        batmat::linalg::copy(WY, WL.left_cols(ml), with_rotate<-prev_rot>);

        batmat::linalg::copy(WU, WL.right_cols(ml), with_rotate<+prev_rot>);

        // rotate element k-2^l to position k (but the workspace is already at -prev_rot)

        batmat::linalg::copy(WU, WU, with_rotate<-rot + prev_rot>);

        // rotate element k+2^l to position k (but the workspace is already at +prev_rot)

        batmat::linalg::copy(WY, WY, with_rotate<+rot - prev_rot>);

        // [ L̃(k;l) |       0       ]   [ L(k;l) | Υ˃(k;l)      Υ˂(k;l)     ]

        // [ Ũ(k;l) | Υ˂(k-2^l;l+1) ] = [ U(k;l) | Υ˂(k-2^l;l)     0        ] Q̆(k;l)

        // [ Ỹ(k;l) | Υ˃(k+2^l;l+1) ] = [ Y(k;l) |    0         Υ˃(k+2^l;l) ]

        hyhound_diag_cyclic(tril(pcr_L.batch(l)), WL, //

                            pcr_Y.batch(l), WY, W0Y,  //

                            pcr_U.batch(l), WU, WU0, Σ);

    } else {

        // Final PCR level: WL is the same view as WYU, so the rotations below are in place.
        auto WL = WYU;

        auto WU = work_update_pcr_L.left_cols(2 * ml).batch(0);

        // undo workspace rotation

        batmat::linalg::copy(WYU.left_cols(ml), WL.left_cols(ml), with_rotate<-prev_rot>);

        batmat::linalg::copy(WYU.right_cols(ml), WL.right_cols(ml), with_rotate<+prev_rot>);

        //           S(-1)    S(0)

        //  WL =  [ Υ˃(0)  | Υ˂(0)  ]

        //  WYU = [ Υ˃(+1) | Υ˂(-1) ]

        // rotate element k±2^l to position k

        batmat::linalg::copy(WL.left_cols(ml), WU.right_cols(ml), with_rotate<rot>);

        batmat::linalg::copy(WL.right_cols(ml), WU.left_cols(ml), with_rotate<rot>);

        // [ L̃(k;l) |       0       ]   [ L(k;l) | Υ˃(k;l)      Υ˂(k;l)     ]

        // [ Ũ(k;l) | Υ˂(k-2^l;l+1) ] = [ U(k;l) | Υ˂(k-2^l;l)  Υ˃(k+2^l;l) ] Q̆(k;l)

        hyhound_diag_2(tril(pcr_L.batch(l)), WL, pcr_U.batch(l), WU, Σ);

        batmat::linalg::copy(WU, WU, with_rotate<rot>); // undo rotation

        batmat::linalg::copy(Σ, Σ, with_rotate<+rot>);

        // Final diagonal block

        // [ L̃(k;l+1) |   0   ] = [ L(k;l+1) | Υ˃(k;l+1)  Υ˂(k;l+1) ] Q̆(k;l+1)

        hyhound_diag(tril(pcr_L.batch(l + 1)), WU, Σ);

    }

}


//! [PCR update]


//! [Cyqlone update]

/// Shared implementation of update() and update_solve(): update the Riccati factorization for
/// the change @p ΔΣ, synchronize all threads, and then update the CR factorization of the Schur
/// complement.
///
/// @tparam Solve  If true, @p ux and @p λ are additionally processed as part of a solve.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Solve>
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::update_solve_impl(Context &ctx, view<> ΔΣ,
                                                                mut_view<> ux, mut_view<> λ) {
    //  2|  Υ˃(c;0), Υ˂(c-1;0), 𝒮(c;0) = update-block-column-riccati(c)
    //  3|  update-schur(c)
    update_riccati_solve<Solve>(ctx, ΔΣ, ux, λ);
    //  5|  -- sync --
    ctx.arrive_and_wait(); // wait for Υ˃, Υ˂, x_next
    if constexpr (Solve) {
        const index_t c    = ctx.index; // different assignment than compute_schur
        const auto c_next  = add_wrap_p(c, 1);
        const auto dn      = c * n; // see compute_schur
        const auto dn_next = c_next * n;
        const auto d1_next = dn_next + n - 1;
        auto x_next        = ux.batch(d1_next).bottom_rows(nx);
        // Subtract x_next from λ(dn); a rotation by one is only applied when the next column
        // index wrapped around to zero and v != 1.
        if (c_next > 0 || v == 1)
            sub(λ.batch(dn), x_next);
        else
            sub(λ.batch(dn), x_next, with_rotate<1>);
    }
    // Update the block-tridiagonal Schur complement using CR
    tricyqle.template update_solve_cr<Solve>(ctx, λ, n);
}


//! [Cyqlone update]


/// Update the factorization for the change @p ΔΣ without solving: forwards to
/// update_solve_impl<false> with empty views for the unused solve arguments.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::update(Context &ctx, view<> ΔΣ) {
    this->template update_solve_impl<false>(ctx, ΔΣ, {}, {});
}


/// Update the factorization for the change @p ΔΣ and process @p ux and @p λ as part of a solve:
/// forwards to update_solve_impl<true>.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::update_solve(Context &ctx, view<> ΔΣ, mut_view<> ux,
                                                           mut_view<> λ) {
    this->template update_solve_impl<true>(ctx, ΔΣ, ux, λ);
}


//! [Cyqlone update CR]

/// Update the cyclic-reduction factorization, level by level, optionally combined with solve
/// steps on @p λ.
///
/// Each thread handles the columns derived from `ctx.index`; `ctx.arrive_and_wait()` is called
/// twice per level to separate the U/Y updates from the L updates of the next level.
///
/// @tparam Solve   If true, `trsm` and the `solve_*_forward` functions are applied to @p λ after
///                 the corresponding updates, and `solve_pcr` or `solve_pcg` is called at the end
///                 (depending on `params.solve_method`).
/// @param ctx      Thread context providing `index` and the synchronization primitive.
/// @param λ        Vector the solve steps operate on (only accessed if @p Solve is true).
/// @param stride   Factor applied to the column index to select the batch of @p λ.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>

template <bool Solve>


void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_solve_cr(Context &ctx, mut_view<> λ,

                                                               index_t stride) {

    const index_t c = ctx.index;

    //  6|  if ν₂(c) = 0:  update-L(0, c)

    if (ν2p(c) == 0) {

        update_L(0, c);

        if constexpr (Solve)

            if (p != 1)

                trsm(tril(cr_L.batch(c)), λ.batch(c * stride));

    }

    //  7|  for l = 0 ... log₂(P)-1

    for (index_t l = 0; l < lp(); ++l) {

        const auto c_ = cr_thread_assignment(l, c);

        //  8|  iU = c+1, iY = c+1-2^l

        const auto iU = add_wrap_ceil_p(c_, 1), iY = sub_wrap_ceil_p(c_, (1 << l) - 1);

        //  9|  -- sync --

        ctx.arrive_and_wait(); // wait for Q̆

        // 10|  if ν₂(iU) = l:  update-U(l, iU)

        if (ν2p(iU) == l) {

            update_U(l, iU);

            if constexpr (Solve)

                solve_u_forward(l, iU, λ, stride);

        }

        // 11|  elif ν₂(iY) = l:  update-Y(l, iY)

        else if (ν2p(iY) == l) {

            update_Y(l, iY);

            if constexpr (Solve)

                solve_y_forward(l, iY, λ, work_cr, stride);

        }

        // 12|  -- sync --

        ctx.arrive_and_wait(); // wait for Υ˃, Υ˂

        // 13|  if ν₂(iY) = l+1:  update-L(l+1, iY)

        if (ν2p(iY) == l + 1)

            update_L(l + 1, iY);

        // Solve step for this level; note that it is guarded by iU but operates on index iY.
        if (ν2p(iU) == l)

            if constexpr (Solve)

                solve_λ_forward(l, iY, λ, work_cr, stride);

    }

    if constexpr (Solve) {

        ctx.arrive_and_wait();

        // TODO: synchronize here if switching to parallel PCR factor in update_L

        // Only the thread(s) satisfying this condition perform the final PCR/PCG solve.
        if (ν2p(c + 1) + 1 == lp() || p == 1)

            params.solve_method == SolveMethod::PCR

                ? solve_pcr(λ.batch(0), work_pcg.batch(0).left_cols(1))

                : solve_pcg(λ.batch(0), work_pcg.batch(0));

    }

}


//! [Cyqlone update CR]


// Algorithm 3 “Factorization update of a single modified Riccati block column”


//! [Cyqlone update Riccati]

/// Update the Riccati factorization of the block column assigned to the calling thread for a
/// change @p ΔΣ, and write the resulting update matrices Υ˃, Υ˂ and diagonal 𝒮 into the
/// workspaces of `tricyqle` for the subsequent CR update.
///
/// The stages of the block column are visited in a loop of `n` iterations. In each iteration the
/// blocks of `riccati_LH` and `riccati_LAB` for that stage are updated in place, and (except in
/// the last iteration) the update matrices are propagated to the next iteration, alternating
/// between the workspaces `riccati_Υ1` and `riccati_Υ2`. In the last iteration, the update rank
/// is communicated through `tricyqle.set_thread_update_rank`.
///
/// @tparam Solve  If true, solve steps on @p ux and @p λ are interleaved with the update.
/// @param ctx     Thread context, used to determine the block column `c`.
/// @param ΔΣ      Change of the diagonal; only entries selected by `compress_masks` contribute.
/// @param ux      Vector the solve steps operate on (only accessed if @p Solve is true).
/// @param λ       Vector the solve steps operate on (only accessed if @p Solve is true).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>

template <bool Solve>

// NOLINTNEXTLINE(*-cognitive-complexity) // Needs to match pseudocode structure


void CyqloneSolver<VL, T, DefaultOrder, Ctx>::update_riccati_solve(Context &ctx, view<> ΔΣ,

                                                                   mut_view<> ux, mut_view<> λ) {

    const index_t c = riccati_thread_assignment(ctx);

    //  3|  j₁ = n(c-1)+1, jₙ = nc

    const index_t dn  = c * n; // data batch index

    const index_t jn  = c * n; // stage index

    const index_t nux = nu + nx, nyM = std::max(ny, ny_0 + ny_N);

    auto LHs = riccati_LH.batch(c);

    auto B̂s = riccati_LAB.batch(c).right_cols(n * nu), Âs = riccati_LAB.batch(c).left_cols(n * nx);

    auto Υ1 = riccati_Υ1.batch(c), Υ2 = riccati_Υ2.batch(c);

    auto 𝑆 = work_Σ.batch(c); // \mathcal{S}_j in the paper


    // u(0) is mostly independent, since there is no coupling S(0) or A(0). Without vectorization

    // (v=1), we can handle it as a special case. This not only saves computation during the Riccati

    // update, but also introduces structural zeros that can be exploited during the CR updates.

    // Its contribution just has to be applied to LB(0) (which is done in this function), and to

    // M(0)/L(0) (which is done in update_L).

    const bool isolate_u0 = v == 1 && dn == 0;


    index_t m    = 0; // Total update rank so far

    index_t mu0  = 0; // Update rank for u(0)

    // The first iteration (i = 0) uses the Υ2 workspace; the isolated u(0) update is stored in
    // its rightmost ny_0 columns.
    auto Υ_first = Υ2.left_cols(nyM), Υu0_first = Υ2.right_cols(ny_0);

    if (!isolate_u0) {

        GUANAQO_TRACE("Riccati update compress", jn);

        //  4|  [ Υu(jₙ) ]   [ D(jₙ)ᵀ ]

        //   |  [ Υx(jₙ) ] = [ C(jₙ)ᵀ ],    𝑆(jₙ) = ΔΣ(jₙ)

        //   |  [ Υλ(jₙ) ]   [   0    ]

        //  6|  m(j) = rank 𝑆(j)

        // Note that we only need to consider the columns corresponding to changing constraints,

        // i.e. where ΔΣ is nonzero, which is why we compress them.

        auto Υux = Υ_first.top_rows(nu + nx); // we don't know the number of columns yet

        if (nyM > 0)

            m = compress_masks(data_Gᵀ.batch(dn), ΔΣ.batch(dn), //

                               Υux, 𝑆.top_rows(nyM));

        auto Υλ = Υ_first.bottom_left(nx, m);

        Υλ.set_constant(0);

    } else {

        // Exploit the block-diagonal structure of G₀ = [ D₀ 0 ]  ny_0

        //                                              [ 0  Cₙ]  ny_N

        auto D0ᵀ = data_Gᵀ.batch(dn).top_left(nu, ny_0),

             C0ᵀ = data_Gᵀ.batch(dn).bottom_rows(nx).middle_cols(ny_0, ny_N);

        auto Υu0 = Υu0_first.top_rows(nu), Υx = Υ_first.middle_rows(nu, nx).left_cols(ny_N);

        if (ny_0 > 0)

            mu0 = compress_masks(D0ᵀ, ΔΣ.batch(dn).top_rows(ny_0), //

                                 Υu0, 𝑆.bottom_rows(ny_0));

        if (ny_N > 0)

            m = compress_masks(C0ᵀ, ΔΣ.batch(dn).middle_rows(ny_0, ny_N), //

                               Υx, 𝑆.top_rows(ny_N));

        auto Υλ = Υ_first.bottom_left(nx, m), Υλ0 = Υu0_first.bottom_left(nx, mu0);

        Υλ.set_constant(0);

        Υλ0.set_constant(0);

    }

    // Views of the isolated u(0) update, now that its rank mu0 is known (empty if mu0 == 0).
    auto Υu0 = Υu0_first.top_left(nu, mu0), Υλ0 = Υu0_first.bottom_left(nx, mu0);

    auto 𝑆u0 = 𝑆.bottom_rows(ny_0).top_rows(mu0);


    // Iterate over all stages in the interval (in reverse order)

    for (index_t i = 0; i < n; ++i) {

        //  5|  for j = jₙ downto j₁

        const index_t j  = sub_wrap_ceil_N(jn, i); // stage index j ≡ jₙ - i mod N

        const index_t di = dn + i;                 // data batch index

        auto LH = LHs.middle_cols(i * nux, nux), LRS = LH.left_cols(nu);

        auto LR = tril(LRS.top_rows(nu)), LQ = tril(LH.bottom_right(nx, nx));

        auto LB = B̂s.middle_cols(i * nu, nu), Acl = Âs.middle_cols(i * nx, nx);


        // mj is the rank at the start of this iteration; m may grow below when the constraints
        // of the next stage are compressed.
        index_t mj = m;

        auto Υ     = (i & 1 ? Υ1 : Υ2).left_cols(mj); // alternate between Υ1 and Υ2 workspaces

        auto Υux = Υ.top_rows(nu + nx), Υλ = Υ.bottom_rows(nx);

        if (!isolate_u0 || i != 0) {

            GUANAQO_TRACE("Riccati update RS", j);

            if (mj > 0)

                //  7|  [ L̃R(j)    0   ]   [ LR(j)  Υu(j) ]

                //   |  [ L̃S(j)  Φx(j) ] = [ LS(j)  Υx(j) ] Q̆u(j),  blkdiag(I, 𝑆(j))-orthogonal

                //   |  [ L̃B(j)  Φλ(j) ]   [ LB(j)  Υλ(j) ]

                hyhound_diag_2(tril(LRS), Υux, //

                               LB, Υλ, 𝑆.top_rows(mj));

        } else {

            GUANAQO_TRACE("Riccati update R", j);

            if (mu0 > 0)

                // Same as above, but using LS(j) = 0 = L̃S(j), Υx(j) = 0 = Φx(j)

                hyhound_diag_2(LR, Υu0, //

                               LB, Υλ0, 𝑆u0);

        }

        // Φx and Φλ are views of the same storage as Υ (rows nu..nu+nx and the bottom nx rows).
        auto Φx = Υ.middle_rows(nu, nx), Φλ = Υ.bottom_rows(nx);

        if constexpr (Solve) {

            // Solve u ← LR̂⁻¹ u, x ← x - Ŝ u

            auto ui = ux.batch(di).top_rows(nu), xi = ux.batch(di).bottom_rows(nx);

            trsm(LR, ui);

            auto S = LRS.bottom_rows(nx);

            gemv_sub(S, ui, xi);

            auto λ_last = λ.batch(dn);

            gemv_add(LB, ui, λ_last);

        }

        //  8|  if j > j₁

        if (i + 1 < n) {

            [[maybe_unused]] const auto j_next = sub_wrap_ceil_N(j, 1);

            const auto di_next                 = dn + i + 1;

            auto Υ_next                        = (i & 1 ? Υ2 : Υ1).left_cols(mj + nyM);

            auto Υux_next = Υ_next.top_rows(nu + nx), Υλ_next = Υ_next.bottom_rows(nx);

            auto F_next = data_F.batch(di_next);

            if (mj > 0) {

                GUANAQO_TRACE("Riccati update prop", j_next);

                // 10|  [ Υu(j-1) ]   [ B(j-1)ᵀ Φx(j)   D(j-1)ᵀ ]

                //   |  [ Υx(j-1) ] = [ A(j-1)ᵀ Φx(j)   C(j-1)ᵀ ]

                //   |  [ Υλ(j-1) ]   [    Φλ(j)          0     ]

                // Left block column first

                gemm(F_next.transposed(), Φx, Υux_next.left_cols(mj));

                copy(Φλ, Υλ_next.left_cols(mj));

                // TODO: we may not have to copy Φλ every time. In fact, we can already write it in

                //       the CR workspace.

            }

            {

                GUANAQO_TRACE("Riccati update compress", j_next);

                // Now the right block column, again compressing to only the changing constraints

                if (nyM > 0)

                    m += compress_masks(data_Gᵀ.batch(di_next), ΔΣ.batch(di_next),

                                        Υux_next.right_cols(nyM), 𝑆.middle_rows(mj, nyM));

                Υλ_next.middle_cols(mj, m - mj).set_constant(0);

            }

            if (mj > 0) {

                GUANAQO_TRACE("Riccati update Q", j);

                //  9|  Ãcl(j) = Acl(j) + Φλ(j) 𝑆(j) Φx(j)ᵀ

                gemm_diag_add(Φλ, Φx.transposed(), Acl, 𝑆.top_rows(mj));

                // 12|  [ L̃Q(j)  0 ] = [ LQ(j)  Φx(j) ] Q̆x(j),  blkdiag(I, 𝑆(j))-orthogonal

                hyhound_diag(LQ, Φx, 𝑆.top_rows(mj));

            }

            if constexpr (Solve) {

                auto xi = ux.batch(di).bottom_rows(nx), ux_next = ux.batch(di_next),

                     λ_next = λ.batch(di_next), λ_last = λ.batch(dn);

                gemv_add(Acl, λ_next, λ_last); // λ(jn) += Â λ(j-1)

                auto w = tricyqle.work_cr.batch(c).left_cols(1);

                trmm(LQ.transposed(), λ_next, w);          // w = LQᵀ(j) λ(j-1)

                trmm(LQ, w);                               // w = LQ(j) LQᵀ(j) λ(j-1)

                sub(xi, w, w);                             // w = x(j) - LQ(j) LQᵀ(j) λ(j-1)

                gemv_add(F_next.transposed(), w, ux_next); // u(j-1) += BAᵀ(j-1) w

            }

        } else {

            // Last iteration (i == n - 1): hand the update matrices over to the CR workspaces.
            const auto c_prev = sub_wrap_p(c, 1); // c-1

            // Communicate the update ranks mj to all threads and compute the column offsets in the

            // global update workspace we'll write Υ(c) and Υ(c-1) to.

            tricyqle.set_thread_update_rank(ctx, c_prev, mj);

            const index_t i_fwd = c, i_bwd = c_prev;

            const bool rotate = c == 0;

            GUANAQO_TRACE("Riccati update Q", j);

            CYQ_TRACE_WRITE(Upf, i_fwd, 0);

            CYQ_TRACE_WRITE(Upb, i_bwd, 0);

            if (mj > 0) {

                auto Tc    = LH.block(nu - 1, nu, nx, nx); // T(c) = LQ(j₁)⁻ᵀ, see compute_schur

                auto Υ_fwd = tricyqle.work_Ups_fwd(0, i_fwd).left_cols(mj),

                     Υ_bwd_prev = tricyqle.work_Ups_bwd(0, i_bwd).left_cols(mj);

                auto 𝒮cr = tricyqle.work_Σ_fwd(0, i_fwd).top_rows(mj); // \mathscr{S}_c in the paper

                // 12|  [ L̃Q(j)  0 ] = [ LQ(j)  Φx(j) ] Q̆x(j),  blkdiag(I, 𝑆(j))-orthogonal


                // Fused with:

                // 14|  [ L̃A(j₁)  Υ˃(c)   ] = [ LA(j₁)  Φλ(j₁) ] Q̆x(j₁),


                //   |  [ -T̃(c)   Υ˂(c-1) ]   [ -T(c)     0    ]

                hyhound_diag_riccati(LQ, Φx,                  //


                                     Acl, Φλ, Υ_fwd,          //

                                     Tc, /*0*/ Υ_bwd_prev,    // note the lack of a minus sign ...


                                     𝑆.top_rows(mj), rotate); //


                negate(Υ_bwd_prev);                           // which is fixed here (TODO: fuse)


                // 13|  𝒮(c) = 𝑆(j₁)


                rotate ? negate(𝑆.top_rows(mj), 𝒮cr, with_rotate<1>) //


                       : negate(𝑆.top_rows(mj), 𝒮cr);


                // We negate 𝒮(c) because in the CR update, we need blkdiag(-I, 𝒮(c))-orthogonal


                // or blkdiag(I, -𝒮(c))-orthogonal transformations.


            }


            if constexpr (Solve) {


                auto xi = ux.batch(di).bottom_rows(nx), λ_last = λ.batch(dn);


                trsm(LQ, xi);


                gemv_add(Acl, xi, λ_last);


                trsm(LQ.transposed(), xi);


            }


            // Only the thread handling data batch 0 sets or clears the extra rank for u(0).
            if (dn == 0) {


                // Add the contribution from the isolated update for u(0) as well


                if (isolate_u0) {

                    tricyqle.set_update_rank_extra(mu0);

                    copy(Υλ0, tricyqle.work_Ups_extra());


                    negate(𝑆u0, tricyqle.work_Σ_extra());


                } else {


                    tricyqle.clear_update_rank_extra();


                }

            }

        }


    }


}

//! [Cyqlone update Riccati]


/// Store the update rank @p m for column @p c, then have `ctx.run_single_sync` replace
/// `m_update` by its inclusive prefix sum.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::set_thread_update_rank(Context &ctx, index_t c,
                                                                      index_t m) {
    m_update[c] = m;
    // Turn the per-column ranks into cumulative column offsets (in place).
    ctx.run_single_sync([this] {
        auto first = begin(m_update);
        auto last  = end(m_update);
        std::inclusive_scan(first, last, first);
    });
}


/// Record @p m as the rank of the extra update for u(0) (stored in `m_update_u0`).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::set_update_rank_extra(index_t m) {
    this->m_update_u0 = m;
}


/// Mark the extra update for u(0) as absent by resetting `m_update_u0` to -1 (other code in this
/// file tests `m_update_u0 >= 0` to detect its presence).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::clear_update_rank_extra() {
    this->m_update_u0 = -1;
}


/// Column range `{start, end}` of the forward update matrix for column @p i at level @p l, read
/// from the inclusive prefix sums in `m_update`.
///
/// The range ends at column @p i (or at `p` when `i == 0`) and starts at the preceding multiple
/// of `2^l`.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] std::pair<index_t, index_t>
TricyqleSolver<VL, T, DefaultOrder, Ctx>::cols_Ups_fwd(index_t l, index_t i) const {
    BATMAT_ASSUME(ν2p(i) >= l); // i % offset = 0
    const index_t offset     = 1 << l;
    const index_t floor_mask = offset - 1;
    // Index zero wraps around to p.
    index_t ip = i;
    if (i == 0)
        ip = p;
    // m_update holds inclusive sums, hence the index ip - 1 for the end of the block.
    const index_t end = m_update[ip - 1];
    // Round ip - 1 down to a multiple of offset to find where the block starts.
    const index_t i_start = (ip - 1) & ~floor_mask;
    index_t start         = 0;
    if (i_start > 0)
        start = m_update[i_start - 1];
    return {start, end};
}


/// Column range `{start, end}` of the backward update matrix for column @p i at level @p l, read
/// from the inclusive prefix sums in `m_update`.
///
/// The range starts at column @p i and ends where the block at `i + 2^l` begins (clamped to `p`).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] std::pair<index_t, index_t>
TricyqleSolver<VL, T, DefaultOrder, Ctx>::cols_Ups_bwd(index_t l, index_t i) const {
    BATMAT_ASSUME(ν2p(i) >= l); // i % offset = 0
    const index_t offset = 1 << l;
    // First index of the following block; clamping is needed when p is not a power of two.
    const index_t i_end = std::min(i + offset, p);
    // m_update holds inclusive sums, hence the index i_end - 1.
    const index_t end = m_update[i_end - 1];
    // The current block begins at i itself.
    index_t start = 0;
    if (i > 0)
        start = m_update[i - 1];
    return {start, end};
}


/// Column range spanning both the forward and the backward update matrices of column @p i at
/// level @p l: from the start of the forward range to the end of the backward range.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] std::pair<index_t, index_t>
TricyqleSolver<VL, T, DefaultOrder, Ctx>::cols_Q_cr(index_t l, index_t i) const {
    const auto fwd_range = cols_Ups_fwd(l, i);
    const auto bwd_range = cols_Ups_bwd(l, i);
    return {fwd_range.first, bwd_range.second};
}


/// Index of the workspace that holds the forward update matrix of column @p i at level @p l
/// (callers in this file reduce it with `& 3` before indexing `work_update`).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] index_t TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Ups_fwd_w(index_t l,
                                                                               index_t i) const {
    const index_t offset     = 1 << l;
    const index_t floor_mask = offset - 1;
    if (i == 0 && l + 2 <= lp()) {
        // Use the index just past the beginning of the last block, so that we don't overlap
        // with that block.
        const index_t last_block_begin = (p - 1) & ~floor_mask;
        i                              = last_block_begin + offset;
    }
    if (i == 0)
        return l + 2;
    return std::min(l + 2, ν2(i));
}


/// Index of the workspace that holds the backward update matrix of column @p i at level @p l
/// (callers in this file reduce it with `& 3` before indexing `work_update`).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] index_t TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Ups_bwd_w(index_t l,
                                                                               index_t i) const {
    // Keep Υ˃(0) @ [l+2] and Υ˂(0) @ [l] in separate workspaces at the last level
    const bool last_level = l == lp();
    if (last_level)
        return l;
    if (i == 0)
        return l + 2;
    return std::min(l + 2, ν2(i));
}


/// View of the forward update matrix of column @p i at level @p l: the columns given by
/// cols_Ups_fwd() of the workspace selected by work_Ups_fwd_w().
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Ups_fwd(index_t l, index_t i)
    -> mut_batch_view<column_major> {
    const auto col_range    = cols_Ups_fwd(l, i);
    const index_t workspace = work_Ups_fwd_w(l, i);
    const index_t first_col = col_range.first;
    const index_t num_cols  = col_range.second - col_range.first;
    return work_update.batch(workspace & 3).middle_cols(first_col, num_cols);
}


/// View of the backward update matrix of column @p i at level @p l: the columns given by
/// cols_Ups_bwd() of the workspace selected by work_Ups_bwd_w().
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Ups_bwd(index_t l, index_t i)
    -> mut_batch_view<column_major> {
    const auto col_range    = cols_Ups_bwd(l, i);
    const index_t workspace = work_Ups_bwd_w(l, i);
    const index_t first_col = col_range.first;
    const index_t num_cols  = col_range.second - col_range.first;
    return work_update.batch(workspace & 3).middle_cols(first_col, num_cols);
}


/// View of the combined update matrices of column @p i at level @p l: the columns given by
/// cols_Q_cr() of the workspace with index `l & 3`.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Q_cr(index_t l, index_t i)
    -> mut_batch_view<column_major> {
    const auto col_range    = cols_Q_cr(l, i);
    const index_t workspace = l;
    const index_t first_col = col_range.first;
    const index_t num_cols  = col_range.second - col_range.first;
    return work_update.batch(workspace & 3).middle_cols(first_col, num_cols);
}


/// Rows of `work_update_Σ.batch(0)` that correspond to the forward update matrix of column @p i
/// at level @p l (same index range as cols_Ups_fwd()).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Σ_fwd(index_t l, index_t i)
    -> mut_batch_view<column_major> {
    const auto row_range    = cols_Ups_fwd(l, i);
    const index_t first_row = row_range.first;
    const index_t num_rows  = row_range.second - row_range.first;
    return work_update_Σ.batch(0).middle_rows(first_row, num_rows);
}


// Rows of work_update_Σ (batch 0) matching the column range that
// cols_Ups_bwd(l, i) reports for the backward update matrix.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Σ_bwd(index_t l, index_t i)
    -> mut_batch_view<column_major> {
    const auto [first, last] = cols_Ups_bwd(l, i);
    const index_t count      = last - first;
    return work_update_Σ.batch(0).middle_rows(first, count);
}


// Rows of work_update_Σ (batch 0) matching the column range that
// cols_Q_cr(l, i) reports for the combined update matrix.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Σ_Q(index_t l, index_t i)
    -> mut_batch_view<column_major> {
    const auto [first, last] = cols_Q_cr(l, i);
    const index_t count      = last - first;
    return work_update_Σ.batch(0).middle_rows(first, count);
}


// Forward update matrix of block 0 at the last level (l = lp()).
// When m_update_u0 is non-negative, the returned view has zero columns;
// otherwise it spans the column range reported by cols_Ups_fwd(lp(), 0).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Ups_fwd_last()
    -> mut_batch_view<column_major> {
    const index_t level      = lp();
    const index_t block      = 0;
    const auto [first, last] = cols_Ups_fwd(level, block);
    const index_t slot       = work_Ups_fwd_w(level, block) & 3;
    // Empty view if the extra rank m_update_u0 is in use.
    const index_t width = m_update_u0 >= 0 ? index_t{0} : last - first;
    return work_update.batch(slot).middle_cols(first, width);
}


// Backward update matrix of block 0 at the last level (l = lp()).
// Spans the column range reported by cols_Ups_bwd(lp(), 0), widened by
// m_update_u0 columns when m_update_u0 is non-negative.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Ups_bwd_last()
    -> mut_batch_view<column_major> {
    const index_t level      = lp();
    const index_t block      = 0;
    const auto [first, last] = cols_Ups_bwd(level, block);
    const index_t slot       = work_Ups_bwd_w(level, block) & 3;
    index_t width            = last - first;
    if (m_update_u0 >= 0)
        width += m_update_u0; // include extra columns in Υ˂(0) in the last level
    return work_update.batch(slot).middle_cols(first, width);
}


// Rows of work_update_Σ (batch 0) that go with work_Ups_fwd_last():
// zero rows when m_update_u0 is non-negative, otherwise the range reported by
// cols_Ups_fwd(lp(), 0).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Σ_fwd_last()
    -> mut_batch_view<column_major> {
    const index_t level      = lp();
    const index_t block      = 0;
    const auto [first, last] = cols_Ups_fwd(level, block);
    // Empty view if the extra rank m_update_u0 is in use.
    const index_t count = m_update_u0 >= 0 ? index_t{0} : last - first;
    return work_update_Σ.batch(0).middle_rows(first, count);
}


// Rows of work_update_Σ (batch 0) that go with work_Ups_bwd_last():
// the range reported by cols_Ups_bwd(lp(), 0), extended by m_update_u0 rows
// when m_update_u0 is non-negative.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Σ_bwd_last()
    -> mut_batch_view<column_major> {
    const index_t level      = lp();
    const index_t block      = 0;
    const auto [first, last] = cols_Ups_bwd(level, block);
    index_t count            = last - first;
    if (m_update_u0 >= 0)
        count += m_update_u0; // extra rows, mirroring the extra columns of the bwd matrix
    return work_update_Σ.batch(0).middle_rows(first, count);
}


// The trailing m_update_u0 columns of work_Ups_bwd_last(), i.e. the extra
// columns appended at the last level. Requires m_update_u0 >= 0.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Ups_extra()
    -> mut_batch_view<column_major> {
    BATMAT_ASSERT(m_update_u0 >= 0);
    auto bwd_last = work_Ups_bwd_last();
    return bwd_last.right_cols(m_update_u0);
}


// The trailing m_update_u0 rows of work_Σ_bwd_last(), i.e. the extra rows
// appended at the last level. Requires m_update_u0 >= 0.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
[[nodiscard]] auto TricyqleSolver<VL, T, DefaultOrder, Ctx>::work_Σ_extra()
    -> mut_batch_view<column_major> {
    BATMAT_ASSERT(m_update_u0 >= 0);
    auto Σ_bwd_last = work_Σ_bwd_last();
    return Σ_bwd_last.bottom_rows(m_update_u0);
}


} // namespace CYQLONE_NS(cyqlone)


BATMAT_ASSUME
#define BATMAT_ASSUME(x)

BATMAT_ASSERT
#define BATMAT_ASSERT(x)

cyqlone.hpp
The main header for the Cyqlone and Tricyqle linear solvers.

cyqlone::SolveMethod::PCR
@ PCR
Parallel Cyclic Reduction (direct).
Definition cyqlone-params.hpp:14

batmat::linalg::gemm_diag_add
void gemm_diag_add(VA &&A, VB &&B, VC &&C, VD &&D, Vd &&d, Opts... opts)

batmat::linalg::trsm
void trsm(Structured< VA, SA > A, VB &&B, VD &&D, with_rotate_B_t< RotB >={})

batmat::linalg::gemm_neg
void gemm_neg(VA &&A, VB &&B, VD &&D, TilingOptions packing={}, Opts... opts)

batmat::linalg::hyhound_diag_apply
void hyhound_diag_apply(VL &&L, VA &&A, VD &&D, VB &&B, Vd &&d, VW &&W, index_t kA_in_offset=0)

batmat::linalg::gemv_add
void gemv_add(VA &&A, VB &&B, VC &&C, VD &&D, Opts... opts)

batmat::linalg::gemm
void gemm(VA &&A, VB &&B, VD &&D, TilingOptions packing={}, Opts... opts)

batmat::linalg::hyhound_diag_riccati
void hyhound_diag_riccati(Structured< VL11, SL > L11, VA1 &&A1, VL21 &&L21, VA2 &&A2, VA2o &&A2_out, VLu1 &&Lu1, VAuo &&Au_out, Vd &&d, bool shift_A_out=false)

batmat::linalg::trmm
void trmm(Structured< VA, SA > A, Structured< VB, SB > B, Structured< VD, SD > D, Opts... opts)

batmat::linalg::compress_masks
index_t compress_masks(VA &&Ain, VS &&Sin, VAo &&Aout, VSo &&Sout)

cyqlone::linalg::negate
void negate(VA &&A, VB &&B, with_rotate_t< Rotate >={})
Negate a matrix or vector B = -A.
Definition linalg.hpp:386

batmat::linalg::syrk_diag_add
void syrk_diag_add(VA &&A, Structured< VC, SC > C, Structured< VD, SC > D, Vd &&d, Opts... opts)

batmat::linalg::copy
void copy(VA &&A, VB &&B, Opts... opts)

batmat::linalg::gemv_sub
void gemv_sub(VA &&A, VB &&B, VC &&C, VD &&D, Opts... opts)

batmat::linalg::hyhound_diag_2
void hyhound_diag_2(Structured< VL1, SL > L1, VA1 &&A1, VL2 &&L2, VA2 &&A2, Vd &&d)

batmat::linalg::hyhound_diag
void hyhound_diag(Structured< VL, SL > L, VA &&A, Vd &&d)

batmat::linalg::hyhound_diag_cyclic
void hyhound_diag_cyclic(Structured< VL11, SL > L11, VA1 &&A1, VL21 &&L21, VA2 &&A22, VA2o &&A2_out, VU &&L31, VA3 &&A31, VA3o &&A3_out, Vd &&d)

cyqlone::linalg::sub
void sub(VA &&A, VB &&B, VC &&C, with_rotate_t< Rotate >={})
Subtract two matrices or vectors C = A - B. Rotate affects B.
Definition linalg.hpp:401

batmat::linalg::tril
constexpr auto tril(M &&m)

rot
datapar::simd< F, Abi > rot(datapar::simd< F, Abi > x, int s)

GUANAQO_TRACE
#define GUANAQO_TRACE(name, instance,...)

batmat::linalg::with_rotate
constexpr with_rotate_t< I > with_rotate

batmat::matrix::View::bottom_rows
row_slice_view_type bottom_rows(index_type n) const

batmat::matrix::View::cols
constexpr index_type cols() const

batmat::matrix::View::right_cols
col_slice_view_type right_cols(index_type n) const

batmat::matrix::View::left_cols
col_slice_view_type left_cols(index_type n) const

batmat::matrix::View::batch
batch_view_type batch(index_type b) const

cyqlone::CyqloneSolver::n
const index_t n
Number of stages per thread per vector lane (rounded up).
Definition cyqlone.hpp:605

cyqlone::CyqloneSolver::update
void update(Context &ctx, view<> ΔΣ)
Perform factorization updates of the Cyqlone factorization as described by Algorithm 4 in the paper.
Definition update.tpp:284

cyqlone::CyqloneSolver::view
typename tricyqle_t::template view< O > view
Non-owning immutable view type for matrix.
Definition cyqlone.hpp:693

cyqlone::CyqloneSolver::data_F
matrix< default_order > data_F
Stage-wise dynamics matrices F(j) = [ B(j) A(j) ] of the OCP.
Definition cyqlone.hpp:766

cyqlone::CyqloneSolver::data_Gᵀ
matrix< default_order > data_Gᵀ
Stage-wise constraint Jacobians G(j)ᵀ = [ D(j) C(j) ]ᵀ of the OCP.
Definition cyqlone.hpp:770

cyqlone::CyqloneSolver::update_solve
void update_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Fused variant of update and solve_forward.
Definition update.tpp:289

cyqlone::CyqloneSolver::update_solve_impl
void update_solve_impl(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
[PCR update]
Definition update.tpp:263

cyqlone::CyqloneSolver::sub_wrap_ceil_N
index_t sub_wrap_ceil_N(index_t a, index_t b) const
Subtract b from a modulo N_horiz.
Definition indexing.tpp:53

cyqlone::CyqloneSolver::add_wrap_p
index_t add_wrap_p(index_t a, index_t b) const
Add b to a modulo p.
Definition indexing.tpp:73

cyqlone::CyqloneSolver::Context
tricyqle_t::Context Context
Definition cyqlone.hpp:596

cyqlone::CyqloneSolver::ny
const index_t ny
Number of general constraints of the OCP per stage.
Definition cyqlone.hpp:570

cyqlone::CyqloneSolver::riccati_Υ2
matrix< column_major > riccati_Υ2
Alternate workspace to riccati_Υ1.
Definition cyqlone.hpp:820

cyqlone::CyqloneSolver::riccati_thread_assignment
index_t riccati_thread_assignment(Context &ctx) const
Definition cyqlone.hpp:972

cyqlone::CyqloneSolver::riccati_Υ1
matrix< column_major > riccati_Υ1
Workspace to store the update matrices Υu, Υx, Υλ, Φu, Φx and Φλ during the factorization update of t...
Definition cyqlone.hpp:815

cyqlone::CyqloneSolver::update_riccati_solve
void update_riccati_solve(Context &ctx, view<> ΔΣ, mut_view<> ux, mut_view<> λ)
Update the modified Riccati factorization of a single block column as described by Algorithm 3 in the...
Definition update.tpp:352

cyqlone::CyqloneSolver::sub_wrap_p
index_t sub_wrap_p(index_t a, index_t b) const
Subtract b from a modulo p.
Definition indexing.tpp:64

cyqlone::CyqloneSolver::mut_view
typename tricyqle_t::template mut_view< O > mut_view
Non-owning mutable view type for matrix.
Definition cyqlone.hpp:696

cyqlone::CyqloneSolver::ny_0
const index_t ny_0
Number of general constraints at stage 0, D(0) u(0).
Definition cyqlone.hpp:571

cyqlone::CyqloneSolver::nu
const index_t nu
Number of controls of the OCP.
Definition cyqlone.hpp:569

cyqlone::CyqloneSolver::riccati_LH
matrix< default_order > riccati_LH
Cholesky factors of the Hessian blocks for the Riccati recursion.
Definition cyqlone.hpp:782

cyqlone::CyqloneSolver::work_Σ
matrix< column_major > work_Σ
Compressed representation of the nonzero diagonal elements of the matrix Σ, populated for each thread...
Definition cyqlone.hpp:808

cyqlone::CyqloneSolver::tricyqle
tricyqle_t tricyqle
Block-tridiagonal solver (CR/PCR/PCG).
Definition cyqlone.hpp:747

cyqlone::CyqloneSolver::ny_N
const index_t ny_N
Number of general constraints at the final stage, C(N) x(N).
Definition cyqlone.hpp:572

cyqlone::CyqloneSolver::v
static constexpr index_t v
Vector length.
Definition cyqlone.hpp:603

cyqlone::CyqloneSolver::nx
const index_t nx
Number of states of the OCP.
Definition cyqlone.hpp:568

cyqlone::CyqloneSolver::riccati_LAB
matrix< default_order > riccati_LAB
Storage for the matrices LB(j), Acl(j) and LA(j₁) for the Riccati recursion.
Definition cyqlone.hpp:788

cyqlone::TricyqleSolver::update_pcr
void update_pcr(batch_view<> fwd, batch_view<> bwd, batch_view<> Σ)
[Cyqlone update CR helper]
Definition update.tpp:180

cyqlone::TricyqleSolver::lp
constexpr index_t lp() const
log₂(p), logarithm of the number of processors/threads p, rounded up.
Definition cyqlone.hpp:105

cyqlone::TricyqleSolver::work_Σ_extra
mut_batch_view< column_major > work_Σ_extra()
Definition update.tpp:706

cyqlone::TricyqleSolver::lv
static constexpr index_t lv()
log₂(v), logarithm of the vector length v.
Definition cyqlone.hpp:111

cyqlone::TricyqleSolver::mut_batch_view
batmat::matrix::View< value_type, index_t, vl_t, vl_t, layer_stride, O > mut_batch_view
Non-owning mutable view type for a single batch of v matrices.
Definition cyqlone.hpp:165

cyqlone::TricyqleSolver::work_Σ_fwd
mut_batch_view< column_major > work_Σ_fwd(index_t l, index_t i)
Definition update.tpp:636

cyqlone::TricyqleSolver::work_Ups_fwd_last
mut_batch_view< column_major > work_Ups_fwd_last()
Definition update.tpp:657

cyqlone::TricyqleSolver::work_Σ_bwd_last
mut_batch_view< column_major > work_Σ_bwd_last()
Definition update.tpp:689

cyqlone::TricyqleSolver::work_Σ_Q
mut_batch_view< column_major > work_Σ_Q(index_t l, index_t i)
Definition update.tpp:650

cyqlone::TricyqleSolver::work_Ups_extra
mut_batch_view< column_major > work_Ups_extra()
Definition update.tpp:699

cyqlone::TricyqleSolver::work_update_pcr_UY
matrix< column_major > work_update_pcr_UY
Update matrices to apply to the subdiagonal blocks U and Y during PCR updates.
Definition cyqlone.hpp:351

cyqlone::TricyqleSolver::ν2
index_t ν2(index_t i) const
2-adic valuation ν₂.
Definition indexing.tpp:30

cyqlone::TricyqleSolver::work_update_pcr_L
matrix< column_major > work_update_pcr_L
Update matrices to apply to the diagonal blocks L during PCR updates.
Definition cyqlone.hpp:347

cyqlone::TricyqleSolver::work_Q_cr
mut_batch_view< column_major > work_Q_cr(index_t l, index_t i)
Definition update.tpp:628

cyqlone::TricyqleSolver::solve_y_forward
void solve_y_forward(index_t l, index_t iY, mut_view<> λ, mut_view<> w, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iY of λ at le...
Definition cr.tpp:177

cyqlone::TricyqleSolver::mut_view
batmat::matrix::View< value_type, index_t, vl_t, index_t, index_t, O > mut_view
Non-owning mutable view type for matrix.
Definition cyqlone.hpp:158

cyqlone::TricyqleSolver::ν2p
index_t ν2p(index_t i) const
2-adic valuation modulo p, i.e. ν2p(0) = ν2p(p) = lp().
Definition indexing.tpp:36

cyqlone::TricyqleSolver::add_wrap_ceil_p
index_t add_wrap_ceil_p(index_t a, index_t b) const
Add b to a modulo ceil_p().
Definition indexing.tpp:19

cyqlone::TricyqleSolver::work_Σ_bwd
mut_batch_view< column_major > work_Σ_bwd(index_t l, index_t i)
Definition update.tpp:643

cyqlone::TricyqleSolver::sub_wrap_ceil_p
index_t sub_wrap_ceil_p(index_t a, index_t b) const
Subtract b from a modulo ceil_p().
Definition indexing.tpp:8

cyqlone::TricyqleSolver::cr_thread_assignment
index_t cr_thread_assignment(index_t l, index_t c) const
Adjust thread assignment for non-power-of-two p: The diagonal blocks M(⌊p/2⌋2) are usually mapped to ...
Definition factor.tpp:277

cyqlone::TricyqleSolver::pcr_U
matrix< default_order > pcr_U
Subdiagonal blocks U of the PCR Cholesky factorizations.
Definition cyqlone.hpp:305

cyqlone::TricyqleSolver::work_Ups_bwd
mut_batch_view< column_major > work_Ups_bwd(index_t l, index_t i)
Definition update.tpp:620

cyqlone::TricyqleSolver::pcr_L
matrix< default_order > pcr_L
Diagonal blocks of the PCR Cholesky factorizations.
Definition cyqlone.hpp:296

cyqlone::TricyqleSolver::cols_Ups_fwd
std::pair< index_t, index_t > cols_Ups_fwd(index_t l, index_t i) const
Definition update.tpp:558

cyqlone::TricyqleSolver::update_L
void update_L(index_t l, index_t i)
[Cyqlone update CR helper]
Definition update.tpp:23

cyqlone::TricyqleSolver::work_update
matrix< column_major > work_update
Workspace to store the update matrices Ξ(Υ) for the factorization update.
Definition cyqlone.hpp:332

cyqlone::TricyqleSolver::cr_Y
matrix< default_order > cr_Y
Subdiagonal blocks Y of the Cholesky factor of the Schur complement (used during CR).
Definition cyqlone.hpp:282

cyqlone::TricyqleSolver::m_update
std::vector< index_t > m_update
Update rank (number of changing constraints) per thread.
Definition cyqlone.hpp:323

cyqlone::TricyqleSolver::cols_Q_cr
std::pair< index_t, index_t > cols_Q_cr(index_t l, index_t i) const
Definition update.tpp:588

cyqlone::TricyqleSolver::update_pcr_level
void update_pcr_level(index_t m, mut_batch_view<> WYU, mut_batch_view<> WΣ)
Definition update.tpp:198

cyqlone::TricyqleSolver::work_cr
matrix< column_major > work_cr
Temporary workspace for the CR solve phase.
Definition cyqlone.hpp:286

cyqlone::TricyqleSolver::work_Ups_bwd_last
mut_batch_view< column_major > work_Ups_bwd_last()
Definition update.tpp:668

cyqlone::TricyqleSolver::work_Σ_fwd_last
mut_batch_view< column_major > work_Σ_fwd_last()
Definition update.tpp:679

cyqlone::TricyqleSolver::work_Ups_bwd_w
index_t work_Ups_bwd_w(index_t l, index_t i) const
Definition update.tpp:604

cyqlone::TricyqleSolver::set_update_rank_extra
void set_update_rank_extra(index_t m)
Definition update.tpp:547

cyqlone::TricyqleSolver::solve_λ_forward
void solve_λ_forward(index_t l, index_t iL, mut_view<> λ, view<> w, index_t stride) const
Apply the updates to block iL of the right-hand side from solve_u_forward and solve_y_forward,...
Definition cr.tpp:190

cyqlone::TricyqleSolver::work_Ups_fwd
mut_batch_view< column_major > work_Ups_fwd(index_t l, index_t i)
Definition update.tpp:612

cyqlone::TricyqleSolver::update_solve_cr
void update_solve_cr(Context &ctx, mut_view<> λ, index_t stride)
[Cyqlone update CR]
Definition update.tpp:297

cyqlone::TricyqleSolver::set_thread_update_rank
void set_thread_update_rank(Context &ctx, index_t c, index_t m)
[Cyqlone update Riccati]
Definition update.tpp:539

cyqlone::TricyqleSolver::factor_pcr
void factor_pcr()
Compute the parallel cyclic reduction factorization of the final block tridiagonal system of size v.
Definition pcr.tpp:28

cyqlone::TricyqleSolver::batch_view
batmat::matrix::View< const value_type, index_t, vl_t, vl_t, layer_stride, O > batch_view
Non-owning immutable view type for a single batch of v matrices.
Definition cyqlone.hpp:162

cyqlone::TricyqleSolver::work_update_Σ
matrix< column_major > work_update_Σ
Compressed representation of the nonzero diagonal elements of the matrix Σ.
Definition cyqlone.hpp:328

cyqlone::TricyqleSolver::solve_pcg
void solve_pcg(mut_batch_view<> λ, mut_batch_view<> work_pcg) const
Solve a linear system with the final block tridiagonal system of size v using the preconditioned conj...
Definition pcg.tpp:54

cyqlone::TricyqleSolver::v
static constexpr index_t v
Vector length.
Definition cyqlone.hpp:103

cyqlone::TricyqleSolver::m_update_u0
index_t m_update_u0
Update rank from D(0). Negative if D(0) is not handled separately.
Definition cyqlone.hpp:325

cyqlone::TricyqleSolver::work_pcg
matrix< column_major > work_pcg
Temporary workspace for CG vectors.
Definition cyqlone.hpp:313

cyqlone::TricyqleSolver::work_hyh
matrix< column_major > work_hyh
Storage for the hyperbolic Householder transformations.
Definition cyqlone.hpp:336

cyqlone::TricyqleSolver::block_size
const index_t block_size
Block size of the block-tridiagonal system.
Definition cyqlone.hpp:75

cyqlone::TricyqleSolver::params
Params params
Solver parameters for Tricyqle-specific settings.
Definition cyqlone.hpp:87

cyqlone::TricyqleSolver::solve_u_forward
void solve_u_forward(index_t l, index_t iU, mut_view<> λ, index_t stride) const
Update the right-hand side λ during the forward solve phase of CR after computing block iU of λ at le...
Definition cr.tpp:163

cyqlone::TricyqleSolver::p
const index_t p
Number of processors/threads.
Definition cyqlone.hpp:101

cyqlone::TricyqleSolver::solve_pcr
void solve_pcr(mut_batch_view<> λ, mut_batch_view<> work_pcr) const
Solve a linear system with the final block tridiagonal system of size v using the PCR factorization.
Definition pcr.tpp:181

cyqlone::TricyqleSolver::work_Ups_fwd_w
index_t work_Ups_fwd_w(index_t l, index_t i) const
Definition update.tpp:593

cyqlone::TricyqleSolver::update_Y
void update_Y(index_t l, index_t i)
Definition update.tpp:157

cyqlone::TricyqleSolver::cols_Ups_bwd
std::pair< index_t, index_t > cols_Ups_bwd(index_t l, index_t i) const
Definition update.tpp:573

cyqlone::TricyqleSolver::cr_U
matrix< default_order > cr_U
Subdiagonal blocks U of the Cholesky factor of the Schur complement (used during CR).
Definition cyqlone.hpp:277

cyqlone::TricyqleSolver::pcr_Y
matrix< default_order > pcr_Y
Subdiagonal blocks Y of the PCR Cholesky factorizations.
Definition cyqlone.hpp:301

cyqlone::TricyqleSolver::cr_L
matrix< default_order > cr_L
Diagonal blocks of the Cholesky factor of the Schur complement (used during CR).
Definition cyqlone.hpp:272

cyqlone::TricyqleSolver::work_update_pcr_Σ
matrix< column_major > work_update_pcr_Σ
Two copies of work_update_Σ for PCR updates.
Definition cyqlone.hpp:343

cyqlone::TricyqleSolver::update_U
void update_U(index_t l, index_t i)
Definition update.tpp:123

cyqlone::TricyqleSolver::clear_update_rank_extra
void clear_update_rank_extra()
Definition update.tpp:552

cyqlone::TricyqleSolver::Context
Ctx Context
Definition cyqlone.hpp:69

tracing.hpp

CYQ_TRACE_WRITE
#define CYQ_TRACE_WRITE(...)
Definition tracing.hpp:62

CYQ_TRACE_READ
#define CYQ_TRACE_READ(...)
Definition tracing.hpp:63