This page lists the implementations of the algorithms described in the Cyqlone paper [1], with discussions of the differences compared to the pseudo-code, and with line-by-line comments referencing the corresponding steps in the paper.

Algorithm 1: Factorization of a single modified Riccati block column

Factorization of the smaller OCPs on each sub-interval using a modified Riccati recursion. Optionally fused with the forward solve.

Differences compared to the pseudo-code in the paper:

Many operations are performed in-place to reduce memory usage. For example, the matrices R̂, Ŝ and Q̂ are replaced by their Cholesky factors LR, LS and LQ.
Solution is fused/interleaved with the factorization steps to improve temporal locality and reduce memory bandwidth. This is controlled by the Factor and Solve template parameters.
The addition of the penalty term DCᵀ Σ DC is fused with the rest of the operations, avoiding an explicit formation of the intermediate matrix and improving cache locality. See §5.1 “The augmented Lagrangian inner problem” for details about the penalty term.
The product V(j-1) V(j-1)ᵀ is not added to the Hessian at the end of an iteration, but rather at the beginning of the next iteration, so it can be fused with the addition of DCᵀ Σ DC and the Cholesky factorization of the sum.
Data batch indices where the problem data and the factorization are stored are reversed compared to the stage indices, matching the iteration order and simplifying the per-thread contiguous storage.

/// Algorithm 1: factorization of a single modified Riccati block column, optionally fused with
/// the forward solve. The interval that is processed is selected by
/// `riccati_thread_assignment(ctx)`, and its `n` stages are visited from jₙ down to j₁.
/// The numbered comments (e.g. ` 7|`) refer to the corresponding pseudo-code lines in the paper.
///
/// @tparam Factor  When true, the factorization steps are executed (writing `riccati_LH`,
///                 `riccati_LAB` and `riccati_V` for this interval).
/// @tparam Solve   When true, the solve steps on `ux` and `λ` are interleaved with them.
/// @param ctx  Thread context, used to determine the interval index `c`.
/// @param γ    Scalar whose reciprocal `1 / γ` is forwarded to `syrk_add_potrf`.
/// @param Σ    Per-batch penalty weights; only consumed through `compress_masks_sqrt`.
/// @param ux   Per-batch stacked vectors: the top `nu` rows are u, the bottom `nx` rows are x.
/// @param λ    Per-batch vectors; `λ.batch(dn)` accumulates the contributions of all stages of
///             this interval.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Factor, bool Solve>
// NOLINTNEXTLINE(*-cognitive-complexity) // Needs to match pseudocode structure
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::factor_riccati_solve(Context &ctx, value_type γ,
                                                                   view<> Σ, mut_view<> ux,
                                                                   mut_view<> λ) {
    using batmat::linalg::compress_masks_sqrt;
    const index_t c = riccati_thread_assignment(ctx);
    //  3|  j₁ = n(c-1)+1, jₙ = nc
    const index_t dn  = c * n;                                    // data batch index
    const index_t jn  = c * n;                                    // stage index
    const index_t nux = nu + nx, nyM = std::max(ny, ny_0 + ny_N); // max active constraints/stage
    // TODO: special case nyM for c == 0
    // Per-interval storage: LHs holds one (nu+nx)-column block per stage, B̂s/Âs hold one
    // nu-/nx-column block per stage, and VGᵀ is the workspace for [ V  Gᵀ√Σ ].
    auto LHs = riccati_LH.batch(c);
    auto B̂s = riccati_LAB.batch(c).right_cols(n * nu), Âs = riccati_LAB.batch(c).left_cols(n * nx);
    auto VGᵀ       = riccati_V.batch(c);
    index_t m_syrk = 0; // number of columns of VDCᵀ (depends on active constraints)
    if constexpr (Factor) {
        GUANAQO_TRACE("Riccati init", jn);
        //  4|  B̂(jₙ) = B(jₙ)
        // Note that Â(jₙ) is not copied explicitly, as it is not modified in-place
        copy(data_F.batch(dn).left_cols(nu), B̂s.left_cols(nu));
        // Compress the active constraint Jacobians to add them to the Hessian later
        if (nyM > 0)
            m_syrk = compress_masks_sqrt(data_Gᵀ.batch(dn), Σ.batch(dn), VGᵀ.left_cols(nyM));
    }
    // Iterate over all stages in the interval (in reverse order)
    for (index_t i = 0; i < n; ++i) {
        //  6|  for j = jₙ downto j₁
        const index_t j  = sub_wrap_ceil_N(jn, i); // stage index j ≡ jₙ - i mod N
        const index_t di = dn + i;                 // data batch index
        // Views into the storage of the current stage
        auto LH          = LHs.middle_cols(i * nux, nux);
        auto RS          = LH.left_cols(nu);
        auto R = RS.top_rows(nu), S = RS.bottom_rows(nx), Q = LH.bottom_right(nx, nx);
        auto B̂ = B̂s.middle_cols(i * nu, nu), Acl = Âs.middle_cols(i * nx, nx);
        {
            GUANAQO_TRACE("Riccati QRS", j);
            // Compute and factor R̂, update Ŝ, factor Q̂
            //
            // 13|  [ R̂(j)  Ŝ(j) ] = [ R(j)  S(j) ] + [ D(j)ᵀ ] Σ(j) [ D(j)  C(j) ] + V(j) V(j)ᵀ
            //   |  [ Ŝ(j)ᵀ Q̂(j) ]   [ S(j)ᵀ Q(j) ]   [ C(j)ᵀ ]
            //
            //  7|  [ LR(j)       ] = chol [ R̂(j)  Ŝ(j) ]
            //   |  [ LS(j) LQ(j) ]        [ Ŝ(j)ᵀ Q̂(j) ]
            if constexpr (Factor) {
                // VGᵀprev = [ B(j+1)ᵀ LQ(j+1)   D(j)ᵀ √Σ(j) ]
                //           [ A(j+1)ᵀ LQ(j+1)   C(j)ᵀ √Σ(j) ]
                // Only the first m_syrk columns were filled (at init or in the previous iteration).
                auto VGᵀprev = VGᵀ.left_cols(m_syrk);
                syrk_add_potrf(VGᵀprev, tril(data_H.batch(di)), tril(LH), 1 / γ);
            }
            if constexpr (Solve) {
                // Solve u ← LR̂⁻¹ u, x ← x - Ŝ u
                auto ui = ux.batch(di).top_rows(nu), xi = ux.batch(di).bottom_rows(nx);
                trsm(tril(R), ui);
                gemv_sub(S, ui, xi);
            }
            //  8|  LB(j) = B̂(j) LR(j)⁻ᵀ
            if constexpr (Factor) {
                trsm(B̂, tril(R).transposed());
            }
            if constexpr (Solve) {
                // Accumulate LB(j) u(j) into the λ of the last stage of the interval
                auto ui = ux.batch(di).top_rows(nu), λ_last = λ.batch(dn);
                gemv_add(B̂, ui, λ_last);
            }
            //  9|  Acl(j) = Â(j) - LB(j) LS(j)ᵀ
            if constexpr (Factor) {
                //  4|  Â(jₙ) = A(jₙ)
                // In the first iteration, Â(jₙ) is read directly from the problem data (An)
                // instead of from Acl, since it was never copied.
                auto An = data_F.batch(dn).right_cols(nx);
                i == 0 ? gemm_sub(B̂, S.transposed(), An, Acl) //
                       : gemm_sub(B̂, S.transposed(), Acl);
            }
        }
        // 10|  if j > j₁
        if (i + 1 < n) {
            [[maybe_unused]] const auto j_next = sub_wrap_ceil_N(j, 1);
            GUANAQO_TRACE("Riccati update AB", j_next);
            const auto di_next = dn + i + 1;
            auto VGᵀnext = VGᵀ.left_cols(nx + nyM), V_next = VGᵀnext.left_cols(nx),
                 Gᵀnext = VGᵀnext.right_cols(nyM);
            auto F_next = data_F.batch(di_next), B_next = F_next.left_cols(nu),
                 A_next = F_next.right_cols(nx);
            // 11|  [ B̂(j-1)  Â(j-1) ] = Acl(j) [ B(j-1)  A(j-1) ]
            if constexpr (Factor) {
                auto B̂_next = B̂s.middle_cols((i + 1) * nu, nu),
                     Â_next = Âs.middle_cols((i + 1) * nx, nx);
                gemm(Acl, B_next, B̂_next);
                gemm(Acl, A_next, Â_next);
            }
            if constexpr (Solve) {
                auto xi = ux.batch(di).bottom_rows(nx), ux_next = ux.batch(di_next),
                     λ_next = λ.batch(di_next), λ_last = λ.batch(dn);
                gemv_add(Acl, λ_next, λ_last); // λ(jn) += Â λ(j-1)
                auto w = tricyqle.work_cr.batch(c).left_cols(1);
                trmm(tril(Q).transposed(), λ_next, w);     // w = LQᵀ(j) λ(j-1)
                trmm(tril(Q), w);                          // w = LQ(j) LQᵀ(j) λ(j-1)
                sub(xi, w, w);                             // w = x(j) - LQ(j) LQᵀ(j) λ(j-1)
                gemv_add(F_next.transposed(), w, ux_next); // u(j-1) += BAᵀ(j-1) w
            }
            // 12|  V(j-1) = [ B(j-1)ᵀ ] LQ(j)
            //   |           [ A(j-1)ᵀ ]
            if constexpr (Factor) {
                trmm(F_next.transposed(), tril(Q), V_next);
                m_syrk = nx; // columns of V(j-1)
                // Compress the active constraint Jacobians to add them to the Hessian later
                if (nyM > 0)
                    m_syrk += compress_masks_sqrt(data_Gᵀ.batch(di_next), Σ.batch(di_next), Gᵀnext);
            }
        } else {
            GUANAQO_TRACE("Riccati last", j);
            // 14|  LA(j₁) = Â(j₁) LQ(j₁)⁻ᵀ
            if constexpr (Factor) {
                trsm(Acl, tril(Q).transposed());
            }
            if constexpr (Solve) {
                // x ← LQ⁻¹ x, accumulate LA(j₁) x into λ(jₙ), then x ← LQ⁻ᵀ x
                auto xi = ux.batch(di).bottom_rows(nx), λ_last = λ.batch(dn);
                trsm(tril(Q), xi);
                gemv_add(Acl, xi, λ_last);
                trsm(tril(Q).transposed(), xi);
            }
        }
    }
}

Algorithm 2: Cyqlone factorization

Factorization of the entire KKT system using the Cyqlone algorithm. Optionally fused with the forward solve.

Described by §4 “Cyqlone: Parallel factorization and solution of KKT systems with optimal control structure”

Differences compared to the pseudo-code in the paper:

The penalty terms DCᵀ Σ DC and the regularizers Γₓ = γI are added to the cost Hessians during the Riccati factorization step, as described in §5.1 “The augmented Lagrangian inner problem”.
Solution is fused/interleaved with the factorization steps to improve temporal locality and reduce memory bandwidth.
The factorization and solution are done mostly in-place (without overwriting the OCP data).
Factorization of the odd diagonal blocks M(i) is performed in the compute_schur function instead of at the first level of the CR code.
The last level is factored and solved using PCR or PCG, as described in §7.5.2 “Handling of the final scalar levels”.

High-level factorization procedure

/// Algorithm 2 (high-level procedure): runs the three phases of the Cyqlone factorization for the
/// calling thread, in the order given by the pseudo-code. The calls must stay in this order.
///
/// @tparam Factor  Forwarded to each phase: execute the factorization steps.
/// @tparam Solve   Forwarded to each phase: execute the fused solve steps.
/// @param ctx  Thread context, forwarded to each phase.
/// @param γ    Only used by the Riccati phase.
/// @param Σ    Only used by the Riccati phase.
/// @param ux   Forwarded to the Riccati and Schur complement phases.
/// @param λ    Forwarded to all three phases.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Factor, bool Solve>
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::factor_solve_impl(Context &ctx, value_type γ,
                                                                view<> Σ, mut_view<> ux,
                                                                mut_view<> λ) {
    //  2|  factor-block-column-riccati(c)    -- steps 1 and 2
    factor_riccati_solve<Factor, Solve>(ctx, γ, Σ, ux, λ);
    //  3|  compute-schur(c)                  -- step 3
    compute_schur<Factor, Solve>(ctx, ux, λ);
    //  4|  factor-schur(c)                   -- step 4
    // The interval length n is passed as the stride argument of the CR solver.
    tricyqle.template factor_solve_skip_first<Factor, Solve>(ctx, λ, n);
}

Schur complement computation

This is the compute-schur function in the paper, including the factorization of the first level of CR.

/// The compute-schur function of the paper, including the factorization of the first level of CR.
///
/// With Factor: inverts LQ(j₁) into T(c), computes the first off-diagonal coupling block (stored
/// in `tricyqle.cr_U` or `tricyqle.cr_Y`), waits for T of the next interval, and then builds
/// M(c) = T(c+1) T(c+1)ᵀ + W Wᵀ in `tricyqle.cr_L.batch(c)`. M(c) is factored immediately when
/// `p == 1` (factor written to `tricyqle.pcr_L.batch(0)`) or when `ν2p(i_fwd) == 0` (in place).
///
/// With Solve: subtracts x of the first stage of the next interval from `λ.batch(dn)`, and
/// applies a triangular solve with M when `ν2p(i_fwd) == 0 && p != 1`.
///
/// @tparam Factor  Execute the factorization steps.
/// @tparam Solve   Execute the solve steps.
/// @param ctx  Thread context; also used for the arrive/wait synchronization between threads.
/// @param ux   Per-batch stacked vectors; only the bottom `nx` rows of batch `d1_next` are read.
/// @param λ    Per-batch vectors; `λ.batch(dn)` is updated in Solve mode.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Factor, bool Solve>
// NOLINTNEXTLINE(*-cognitive-complexity) // Needs to match pseudocode structure
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::compute_schur(Context &ctx, mut_view<> ux,
                                                            mut_view<> λ) {
    const index_t c   = riccati_thread_assignment(ctx);
    const auto c_next = add_wrap_p(c, 1);
    //  7|  j₁ = n(c-1)+1,  jₙ = nc
    // dn: batch of the last stage jₙ of this interval, d1_next: batch of the first stage j₁ of
    // the next interval (its last data batch).
    const auto dn = c * n, dn_next = c_next * n, d1_next = dn_next + n - 1;
    //  8|  i˃ = c,  i˂ = c-1
    const index_t i_fwd = c, i_bwd = sub_wrap_ceil_p(c, 1);
    auto M = tril(tricyqle.cr_L.batch(c));
    // 13|  W = [ LB(jₙ) ... LB(j₁) LA(j₁) ]    -- The order here is [ LA(j₁) LB(jₙ) ... LB(j₁) ]
    auto W = riccati_LAB.batch(c).right_cols(nx + nu * n);
    if constexpr (Factor) {
        auto LH = riccati_LH.batch(c);
        auto LQ = tril(LH.bottom_right(nx, nx));
        //  9|  T(c) = LQ(j₁)⁻ᵀ
        BATMAT_ASSERT(nu >= 1); // T = LQ⁻ᵀ is upper triangular, stored one row up from LQ itself
        auto Tc = triu(LH.right_cols(nx).middle_rows(nu - 1, nx));
        {
            GUANAQO_TRACE("Invert Q", c);
            CYQ_TRACE_WRITE(T, c, 0);
            trtri(LQ, Tc.transposed());
        }
        // Signal that T(c) is available; the matching wait is below, after the computation of
        // the first coupling block.
        auto T_ready = ctx.arrive();
        auto LA1     = riccati_LAB.batch(c).middle_cols(nx * (n - 1), nx); // LA(j₁)
        // 10|  if ν2(i˂) > ν2(i˃)    K˂(i˃) = -T(c) LA(j₁)ᵀ    else    K˃(i˂) = -LA(j₁) T(c)ᵀ
        if (ν2p(i_bwd) > ν2p(i_fwd)) {
            GUANAQO_TRACE("Compute first U", i_fwd);
            CYQ_TRACE_WRITE(Kb, i_fwd, 0);
            trmm_neg(Tc, LA1.transposed(), tricyqle.cr_U.batch(i_fwd));
        } else {
            GUANAQO_TRACE("Compute first Y", i_bwd);
            CYQ_TRACE_WRITE(Kf, i_bwd, 0);
            // For i_fwd == 0, the block is only computed in the vectorized case (v > 1), using
            // the rotated/masked variant; for v == 1 nothing is written.
            if (i_fwd > 0)
                trmm_neg(LA1, Tc.transposed(), tricyqle.cr_Y.batch(i_bwd));
            else if constexpr (v > 1)
                trmm_neg(LA1, Tc.transposed(), tricyqle.cr_Y.batch(i_bwd), //
                         with_rotate_C<-1>, with_rotate_D<-1>, with_mask_D<-1>);
        }
        // 11|  -- sync --
        //      Wait for the inversion in the next interval
        ctx.wait(std::move(T_ready));
        //      Each column of the cyclic part with coupling equations is updated by two threads:
        //      one for the forward, and one for the backward coupling. Update the diagonal blocks
        //      of the coupling equations, first forward in time ...
        auto R̂ŜQ̂_next = riccati_LH.batch(c_next);
        // 12|  M(c)˂ = T(c+1) T(c+1)ᵀ
        auto Tc_next = triu(R̂ŜQ̂_next.right_cols(nx).middle_rows(nu - 1, nx));
        {
            CYQ_TRACE_READ(T, c_next, 0);
            GUANAQO_TRACE("Compute TTᵀ", c_next);
            // The rotated variant is only used when wrapping around to interval 0 with v > 1.
            if (c_next > 0 || v == 1)
                trmm(Tc_next, Tc_next.transposed(), M);
            else
                trmm(Tc_next, Tc_next.transposed(), M, with_rotate_C<-1>, with_rotate_D<-1>);
        }
        //      And finally backward in time, optionally fused with the factorization.
        if (p == 1) { // no multi-threading
            GUANAQO_TRACE("Factor M last", c);
            CYQ_TRACE_WRITE(L, c, 0);
            auto L0 = tril(tricyqle.pcr_L.batch(0));
            // 13|  M(c)˃ = WWᵀ
            // 14|  M(c) = M(c)˂ + M(c)˃
            syrk_add(W, M);
            // 16|  L(c) = chol(M(c))
            potrf(M, L0); // Final block is stored separately (for PCR/PCG later)
        } else if (ν2p(i_fwd) == 0) {
            GUANAQO_TRACE("Factor M", c);
            CYQ_TRACE_WRITE(L, c, 0);
            CYQ_TRACE_WRITE(L, c, 1);
            // 13|  M(c)˃ = WWᵀ
            // 14|  M(c) = M(c)˂ + M(c)˃
            // 16|  L(c) = chol(M(c))
            syrk_add_potrf(W, M);
        } else {
            // This block is not factored here; only the sum M(c) is formed.
            GUANAQO_TRACE("Compute WWᵀ", c);
            CYQ_TRACE_WRITE(M, c, 0);
            // 13|  M(c)˃ = WWᵀ
            // 14|  M(c) = M(c)˂ + M(c)˃
            syrk_add(W, M);
        }
    }
    if constexpr (Solve) {
        // In Factor mode, the arrive/wait pair above already provides the synchronization.
        if (!Factor)
            ctx.arrive_and_wait(); // Wait for x_next
        {
            GUANAQO_TRACE("Update λ", dn);
            auto x_next = ux.batch(d1_next).bottom_rows(nx);
            if (c_next > 0 || v == 1)
                sub(λ.batch(dn), x_next);
            else
                sub(λ.batch(dn), x_next, with_rotate<1>);
        }
        {
            // TODO: λ(dn) here has a different thread assignment than in TricyqleSolver
            GUANAQO_TRACE("Solve λ", dn);
            // Same condition as the branch above in which M was factored in place.
            if (ν2p(i_fwd) == 0 && p != 1)
                trsm(M, λ.batch(dn));
        }
    }
}

Schur complement factorization

This is the factor-schur function in the paper, but without the first level of CR, which is fused with the compute-schur function above.

/// The factor-schur function of the paper, without the first level of CR (which is fused with
/// compute-schur). Runs `lp()` levels of cyclic reduction, with two barriers per level, followed
/// by the factorization and/or solution of the last level using PCR or PCG, as selected by
/// `params.solve_method`.
///
/// Note that in Factor mode, the last level is only factored here if the solve method is PCR.
///
/// @tparam Factor  Execute the factorization steps.
/// @tparam Solve   Execute the fused solve steps.
/// @param ctx     Thread context; `ctx.index` selects the thread, and `ctx.arrive_and_wait()` is
///                used as the barrier between the steps.
/// @param λ       Per-batch vectors that are updated by the solve steps.
/// @param stride  Forwarded unchanged to the solve helper functions.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Factor, bool Solve>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_solve_skip_first(Context &ctx, mut_view<> λ,
                                                                       index_t stride) {
    // When vectorization is enabled, the number of threads p must be a power of two.
    // TODO: allow circular coupling for v=1 and non-power-of-two p, which requires wrapping of
    //       the indices in the CR code.
    BATMAT_ASSERT(is_pow_2(p) || (v == 1 && !circular));
    const index_t c = ctx.index;
    // 17|  for l = 0 ... log₂(P)-1
    for (index_t l = 0; l < lp(); ++l) { // Recursion level of cyclic reduction
        const auto c_ = cr_thread_assignment(l, c);
        // 18|  iU = c+1, iY = c+1-2^l
        const auto iU = add_wrap_ceil_p(c_, 1), iY = sub_wrap_ceil_p(c_, (1 << l) - 1);
        // 19|  -- sync --
        ctx.arrive_and_wait(); // Wait for L
        // 20|  if ν₂(iU) = l:  U(iU) = K˂(iU) L(iU)⁻ᵀ
        if (ν2p(iU) == l) {
            if constexpr (Factor)
                factor_U(l, iU);
            if constexpr (Solve)
                solve_u_forward(l, iU, λ, stride);
        }
        // 21|  elif ν₂(iY) = l:  Y(iY) = K˃(iY) L(iY)⁻ᵀ
        else if (ν2p(iY) == l) {
            if constexpr (Factor)
                factor_Y(l, iY);
            if constexpr (Solve)
                solve_y_forward(l, iY, λ, work_cr, stride);
        }
        // 22|  -- sync --
        ctx.arrive_and_wait(); // Wait for U, Y
        // 23|  if ν₂(iU) = l:  factor-L(l, iY)
        if (ν2p(iU) == l) {
            if constexpr (Factor)
                factor_L(l, iY);
            if constexpr (Solve)
                solve_λ_forward(l, iY, λ, work_cr, stride);
        }
        // 24|  elif ν₂(iY) = l:  update-K(l, iY)
        else if (ν2p(iY) == l) {
            if constexpr (Factor)
                update_K(l, iY);
        }
    }
    // Factor or solve the last level using PCR or PCG
    if constexpr (Factor) {
        if (params.solve_method == SolveMethod::PCR) {
            ctx.arrive_and_wait(); // wait for off-diagonal block
            // Either all threads take part in the parallel PCR factorization, or a single
            // thread (the one satisfying the condition below) performs it on its own.
            if (block_size >= params.parallel_factor_pcr_threshold && p > 1)
                factor_pcr_parallel(ctx);
            else if (ν2p(c + 1) + 1 == lp() || p == 1)
                factor_pcr();
        }
    }
    if constexpr (Solve) {
        if (params.solve_method == SolveMethod::PCR) {
            // In Factor mode, the barrier in the PCR factorization branch above was already
            // passed.
            if constexpr (!Factor)
                ctx.arrive_and_wait(); // wait for off-diagonal block TODO: necessary?
            if (ν2p(c + 1) + 1 == lp() || p == 1)
                solve_pcr(λ.batch(0), work_pcg.batch(0).left_cols(1));
        } else {
            ctx.arrive_and_wait(); // wait for off-diagonal block
            if (ν2p(c + 1) + 1 == lp() || p == 1)
                solve_pcg(λ.batch(0), work_pcg.batch(0));
        }
    }
}

CR helper functions

Differences compared to the pseudo-code in the paper:

The factorization is done in-place on cr_L, cr_U, and cr_Y. Subdiagonal blocks K˂ and K˃ are temporarily stored in cr_U and cr_Y respectively.
Syrk and potrf operations are fused where possible to improve performance.
Additional masking is performed for the scalar case (v == 1), corresponding to the boundary conditions K˃(p-2^l)=0 (i.e. no periodic coupling between the last and first stages). This serves two main purposes: it avoids unnecessary computations on zero blocks, and it allows for processor counts p that are not powers of two. In contrast, the vectorized case requires periodic boundary conditions, so this masking is not applied for v > 1.

// 20|  U(iU) = K˂(iU) L(iU)⁻ᵀ
//
// Converts the sub-diagonal block K˂(iU), which is stored in cr_U.batch(iU), into U(iU) by a
// triangular solve with the transpose of the lower-triangular block in cr_L.batch(iU).
// The level l does not take part in the computation itself.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_U([[maybe_unused]] index_t l, index_t iU) {
    if constexpr (v == 1) {
        // Scalar case without circular coupling: an index past the last block happens in cases
        // where p is not a power of two, and there is nothing to do for it.
        const bool past_last_block = iU >= p;
        if (past_last_block && !circular)
            return;
    }
    CYQ_TRACE_READ(Kb, iU, 0);
    CYQ_TRACE_READ(L, iU, 1);
    GUANAQO_TRACE("Trsm U", iU);
    CYQ_TRACE_WRITE(U, iU, 0);
    CYQ_TRACE_WRITE(U, iU, 1);
    auto K_bwd  = cr_U.batch(iU);       // K˂(iU) on entry, U(iU) on exit
    auto L_diag = tril(cr_L.batch(iU)); // L(iU)
    trsm(K_bwd, L_diag.transposed());
}
 
// 21|  Y(iY) = K˃(iY) L(iY)⁻ᵀ
//
// Converts the sub-diagonal block K˃(iY), which is stored in cr_Y.batch(iY), into Y(iY) by a
// triangular solve with the transpose of the lower-triangular block in cr_L.batch(iY).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_Y([[maybe_unused]] index_t l, index_t iY) {
    if constexpr (v == 1) {
        // Y(iY)=0 for scalar case: without circular coupling, there is no block to couple to
        // when iY + 2^l lies past the last block.
        const bool past_last_block = iY + (1 << l) >= p;
        if (past_last_block && !circular)
            return;
    }
    CYQ_TRACE_READ(Kf, iY, 0);
    CYQ_TRACE_READ(L, iY, 0);
    GUANAQO_TRACE("Trsm Y", iY);
    CYQ_TRACE_WRITE(Y, iY, 0);
    CYQ_TRACE_WRITE(Y, iY, 1);
    auto K_fwd  = cr_Y.batch(iY);       // K˃(iY) on entry, Y(iY) on exit
    auto L_diag = tril(cr_L.batch(iY)); // L(iY)
    trsm(K_fwd, L_diag.transposed());
}
 
// Computes the sub-diagonal coupling block for the next level of CR from U(i) and Y(i).
// Depending on which of the neighbors i - 2^l and i + 2^l has the larger ν2p value, the
// product is written to cr_U (as K˂) or to cr_Y (as K˃).
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_K(index_t l, index_t i) {
    const auto step      = 1 << l; // distance to the neighboring blocks at this level
    const index_t i_prev = sub_wrap_ceil_p(i, step);
    const index_t i_next = add_wrap_ceil_p(i, step);
    if constexpr (v == 1) {
        // Y(i)=0 for scalar case, so the product below would be zero as well
        if (i + step >= p && !circular)
            return;
    }
    CYQ_TRACE_READ(U, i, 1);
    CYQ_TRACE_READ(Y, i, 1);
    const bool write_backward = ν2p(i_prev) > ν2p(i_next);
    if (!write_backward) {
        // 31|  K˃(i˂) = -Y(i) U(i)ᵀ
        GUANAQO_TRACE("Compute Y", i_prev);
        CYQ_TRACE_WRITE(Kf, i_prev, 0);
        gemm_neg(cr_Y.batch(i), cr_U.batch(i).transposed(), cr_Y.batch(i_prev));
    } else {
        // 31|  K˂(i˃) = -U(i) Y(i)ᵀ
        GUANAQO_TRACE("Compute U", i_next);
        CYQ_TRACE_WRITE(Kb, i_next, 0);
        gemm_neg(cr_U.batch(i), cr_Y.batch(i).transposed(), cr_U.batch(i_next));
    }
}
 
/// Updates the diagonal block M(i) at CR level l by subtracting U(iU) U(iU)ᵀ and Y(iY) Y(iY)ᵀ,
/// with iU = i + 2^l and iY = i - 2^l (wrapped), and factors the result if ν₂(i) = l+1.
/// The update is performed in place on `cr_L.batch(i)`, except that the factor of block i == 0
/// is written to `pcr_L.batch(0)`.
///
/// In the scalar case (v == 1) without circular coupling, the term that is structurally zero
/// (Y for i == 0, U for iU >= p) is skipped.
///
/// @param l  Current level of cyclic reduction.
/// @param i  Index of the diagonal block to update.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_L(index_t l, index_t i) {
    const index_t offset = 1 << l;
    const index_t iU     = add_wrap_ceil_p(i, offset);
    const index_t iY     = sub_wrap_ceil_p(i, offset);
    // Final block L(0) is stored separately (for PCR/PCG later)
    auto M = tril(cr_L.batch(i)), L0 = tril(pcr_L.batch(0));
    // 28|  if ν₂(i) = l+1:  L(i) = chol(M(i)⁺)
    const bool factor_next = ν2p(i) == l + 1;
    if constexpr (v == 1) {
        if (i == 0 && !circular) { // Y(iY)=0 for M on the first thread
            CYQ_TRACE_READ(M, i, 0);
            CYQ_TRACE_READ(U, iU, 0);
            GUANAQO_TRACE("Subtract UUᵀ", i);
            if (factor_next) {
                CYQ_TRACE_WRITE(L, i, 0);
                CYQ_TRACE_WRITE(L, i, 1);
            } else {
                CYQ_TRACE_WRITE(M, i, 0);
            }
            auto U = cr_U.batch(iU);
            // 27|  M(i)⁺ = M(i) - U(iU) U(iU)ᵀ - Y(iY) Y(iY)ᵀ
            // 28|  if ν₂(i) = l+1:  L(i) = chol(M(i)⁺)
            // Since i == 0 here, the factor is written to L0 rather than in place.
            factor_next ? syrk_sub_potrf(U, M, L0) // chol(M - UUᵀ)
                        : syrk_sub(U, M);
            return;
        } else if (iU >= p && !circular) { // happens in cases where p is not a power of two
            CYQ_TRACE_READ(M, i, 0);
            CYQ_TRACE_READ(Y, iY, 0);
            GUANAQO_TRACE("Subtract YYᵀ", i);
            if (factor_next) {
                CYQ_TRACE_WRITE(L, i, 0);
                CYQ_TRACE_WRITE(L, i, 1);
            } else {
                CYQ_TRACE_WRITE(M, i, 0);
            }
            auto Y = cr_Y.batch(iY);
            // 27|  M(i)⁺ = M(i) - U(iU) U(iU)ᵀ - Y(iY) Y(iY)ᵀ
            // 28|  if ν₂(i) = l+1:  L(i) = chol(M(i)⁺)
            factor_next ? syrk_sub_potrf(Y, M) // chol(M - YYᵀ)
                        : syrk_sub(Y, M);
            return;
        }
    }
    // General case: subtract both terms, fusing the second one with the factorization if possible
    auto U = cr_U.batch(iU), Y = cr_Y.batch(iY);
    {
        CYQ_TRACE_READ(M, i, 0);
        CYQ_TRACE_READ(U, iU, 0);
        GUANAQO_TRACE("Subtract UUᵀ", i);
        CYQ_TRACE_WRITE(M, i, 0);
        // 27|  M(i)⁺ = M(i) - U(iU) U(iU)ᵀ - Y(iY) Y(iY)ᵀ
        syrk_sub(U, M);
    }
    if (factor_next && i != 0) {
        CYQ_TRACE_READ(M, i, 0);
        CYQ_TRACE_READ(Y, iY, 0);
        GUANAQO_TRACE("Factor M", i);
        CYQ_TRACE_WRITE(L, i, 0);
        CYQ_TRACE_WRITE(L, i, 1);
        // 27|  M(i)⁺ = M(i) - U(iU) U(iU)ᵀ - Y(iY) Y(iY)ᵀ
        // 28|  if ν₂(i) = l+1:  L(i) = chol(M(i)⁺)
        syrk_sub_potrf(Y, M); // chol(M - YYᵀ)
    } else {
        CYQ_TRACE_READ(M, i, 0);
        CYQ_TRACE_READ(Y, iY, 0);
        GUANAQO_TRACE("Subtract YYᵀ", i);
        CYQ_TRACE_WRITE(M, i, 0);
        // 27|  M(i)⁺ = M(i) - U(iU) U(iU)ᵀ - Y(iY) Y(iY)ᵀ
        // For i == 0, the rotated variant is used in the vectorized case, and in the scalar
        // case the term is only subtracted if the coupling is circular.
        if (i != 0)
            syrk_sub(Y, M);
        else if constexpr (v > 1)
            syrk_sub(Y, M, with_rotate_C<1>, with_rotate_D<1>);
        else if (circular)
            syrk_sub(Y, M);
    }
    // 28|  if ν₂(i) = l+1:  L(i) = chol(M(i)⁺)
    // Block 0 is factored separately from the subtraction, because its factor goes to L0.
    if (factor_next && i == 0) {
        CYQ_TRACE_READ(M, i, 0);
        GUANAQO_TRACE("Factor M", i);
        CYQ_TRACE_WRITE(L, i, 0);
        CYQ_TRACE_WRITE(L, i, 1);
        potrf(M, L0);
    }
}

Algorithm 3: Factorization update of a single modified Riccati block column

Differences compared to the pseudo-code in the paper:

Many operations are performed in-place to reduce memory usage. For example, all original Cholesky factors are replaced by the updated ones.
Solution is fused/interleaved with the factorization steps to improve temporal locality and reduce memory bandwidth.
The workspaces Υ1 and Υ2 are reused for the variables Υ and Φ in the paper. Two workspaces are required because the matrix multiplication by Φx(j) cannot be done in-place.
Only the constraints for which ΔΣ is nonzero are used during the update. This is done by compressing the relevant columns of Dᵀ and Cᵀ into Υu and Υx respectively.
A global communication step is used at the end to compute the total update rank for the entire problem, and to partition the workspace for Υ˃ and Υ˂ to prepare for the CR phase.
The update for u(0) is handled as a special case to exploit its mostly independent structure.
If the number of processors p is not a power of two, the workspace allocation of Υ˃(0) is adjusted to ensure that it does not overlap with Υ˂(p-2^l). Note that this is only necessary when u(0) is not isolated. See work_Ups_fwd_w.
In the vectorized case, Υ˃(0) and Υ˂(0) are stored in different workspaces in the last level of CR, since this is not actually the last level of the full reduction (PCR handles the rest). See work_Ups_bwd_w.

/// Algorithm 3: factorization update of a single modified Riccati block column, optionally fused
/// with the forward solve. The interval that is processed is selected by
/// `riccati_thread_assignment(ctx)`, and its `n` stages are visited from jₙ down to j₁.
/// The numbered comments (e.g. ` 7|`) refer to the corresponding pseudo-code lines in the paper.
///
/// The Cholesky factors in `riccati_LH` and `riccati_LAB` are updated in place. At the last
/// stage, the update matrices and signs for the CR phase are written to the workspaces of
/// `tricyqle`, and the update rank of this interval is communicated to the other threads.
///
/// @tparam Solve  When true, the solve steps on `ux` and `λ` are interleaved with the update.
/// @param ctx  Thread context, used to determine the interval index `c`.
/// @param ΔΣ   Per-batch changes of the penalty weights; only consumed through `compress_masks`.
/// @param ux   Per-batch stacked vectors: the top `nu` rows are u, the bottom `nx` rows are x.
/// @param λ    Per-batch vectors; `λ.batch(dn)` accumulates the contributions of all stages of
///             this interval.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Solve>
// NOLINTNEXTLINE(*-cognitive-complexity) // Needs to match pseudocode structure
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::update_riccati_solve(Context &ctx, view<> ΔΣ,
                                                                   mut_view<> ux, mut_view<> λ) {
    const index_t c = riccati_thread_assignment(ctx);
    //  3|  j₁ = n(c-1)+1, jₙ = nc
    const index_t dn  = c * n; // data batch index
    const index_t jn  = c * n; // stage index
    const index_t nux = nu + nx, nyM = std::max(ny, ny_0 + ny_N);
    auto LHs = riccati_LH.batch(c);
    auto B̂s = riccati_LAB.batch(c).right_cols(n * nu), Âs = riccati_LAB.batch(c).left_cols(n * nx);
    auto Υ1 = riccati_Υ1.batch(c), Υ2 = riccati_Υ2.batch(c);
    auto 𝑆 = work_Σ.batch(c); // \mathcal{S}_j in the paper

    // u(0) is mostly independent, since there is no coupling S(0) or A(0). Without vectorization
    // (v=1), we can handle it as a special case. This not only saves computation during the Riccati
    // update, but also introduces structural zeros that can be exploited during the CR updates.
    // Its contribution just has to be applied to LB(0) (which is done in this function), and to
    // M(0)/L(0) (which is done in update_L).
    const bool isolate_u0 = v == 1 && dn == 0;

    index_t m    = 0; // Total update rank so far
    index_t mu0  = 0; // Update rank for u(0)
    // The first iteration (i == 0) uses the Υ2 workspace; the isolated update for u(0) is stored
    // in its last ny_0 columns.
    auto Υ_first = Υ2.left_cols(nyM), Υu0_first = Υ2.right_cols(ny_0);
    if (!isolate_u0) {
        GUANAQO_TRACE("Riccati update compress", jn);
        //  4|  [ Υu(jₙ) ]   [ D(jₙ)ᵀ ]
        //   |  [ Υx(jₙ) ] = [ C(jₙ)ᵀ ],    𝑆(jₙ) = ΔΣ(jₙ)
        //   |  [ Υλ(jₙ) ]   [   0    ]
        //  6|  m(j) = rank 𝑆(j)
        // Note that we only need to consider the columns corresponding to changing constraints,
        // i.e. where ΔΣ is nonzero, which is why we compress them.
        auto Υux = Υ_first.top_rows(nu + nx); // we don't know the number of columns yet
        if (nyM > 0)
            m = compress_masks(data_Gᵀ.batch(dn), ΔΣ.batch(dn), //
                               Υux, 𝑆.top_rows(nyM));
        auto Υλ = Υ_first.bottom_left(nx, m);
        Υλ.set_constant(0);
    } else {
        // Exploit the block-diagonal structure of G₀ = [ D₀ 0 ]  ny_0
        //                                              [ 0  Cₙ]  ny_N
        auto D0ᵀ = data_Gᵀ.batch(dn).top_left(nu, ny_0),
             C0ᵀ = data_Gᵀ.batch(dn).bottom_rows(nx).middle_cols(ny_0, ny_N);
        auto Υu0 = Υu0_first.top_rows(nu), Υx = Υ_first.middle_rows(nu, nx).left_cols(ny_N);
        if (ny_0 > 0)
            mu0 = compress_masks(D0ᵀ, ΔΣ.batch(dn).top_rows(ny_0), //
                                 Υu0, 𝑆.bottom_rows(ny_0));
        if (ny_N > 0)
            m = compress_masks(C0ᵀ, ΔΣ.batch(dn).middle_rows(ny_0, ny_N), //
                               Υx, 𝑆.top_rows(ny_N));
        auto Υλ = Υ_first.bottom_left(nx, m), Υλ0 = Υu0_first.bottom_left(nx, mu0);
        Υλ.set_constant(0);
        Υλ0.set_constant(0);
    }
    // Views of the isolated update for u(0), restricted to its actual rank mu0 (empty if
    // mu0 == 0, i.e. whenever u(0) is not isolated).
    auto Υu0 = Υu0_first.top_left(nu, mu0), Υλ0 = Υu0_first.bottom_left(nx, mu0);
    auto 𝑆u0 = 𝑆.bottom_rows(ny_0).top_rows(mu0);

    // Iterate over all stages in the interval (in reverse order)
    for (index_t i = 0; i < n; ++i) {
        //  5|  for j = jₙ downto j₁
        const index_t j  = sub_wrap_ceil_N(jn, i); // stage index j ≡ jₙ - i mod N
        const index_t di = dn + i;                 // data batch index
        auto LH = LHs.middle_cols(i * nux, nux), LRS = LH.left_cols(nu);
        auto LR = tril(LRS.top_rows(nu)), LQ = tril(LH.bottom_right(nx, nx));
        auto LB = B̂s.middle_cols(i * nu, nu), Acl = Âs.middle_cols(i * nx, nx);

        index_t mj = m; // update rank at the current stage (m may grow further below)
        auto Υ     = (i & 1 ? Υ1 : Υ2).left_cols(mj); // alternate between Υ1 and Υ2 workspaces
        auto Υux = Υ.top_rows(nu + nx), Υλ = Υ.bottom_rows(nx);
        if (!isolate_u0 || i != 0) {
            GUANAQO_TRACE("Riccati update RS", j);
            if (mj > 0)
                //  7|  [ L̃R(j)    0   ]   [ LR(j)  Υu(j) ]
                //   |  [ L̃S(j)  Φx(j) ] = [ LS(j)  Υx(j) ] Q̆u(j),  blkdiag(I, 𝑆(j))-orthogonal
                //   |  [ L̃B(j)  Φλ(j) ]   [ LB(j)  Υλ(j) ]
                hyhound_diag_2(tril(LRS), Υux, //
                               LB, Υλ, 𝑆.top_rows(mj));
        } else {
            GUANAQO_TRACE("Riccati update R", j);
            if (mu0 > 0)
                // Same as above, but using LS(j) = 0 = L̃S(j), Υx(j) = 0 = Φx(j)
                hyhound_diag_2(LR, Υu0, //
                               LB, Υλ0, 𝑆u0);
        }
        // After the update above, the x and λ rows of the workspace hold Φx(j) and Φλ(j)
        auto Φx = Υ.middle_rows(nu, nx), Φλ = Υ.bottom_rows(nx);
        if constexpr (Solve) {
            // Solve u ← LR̂⁻¹ u, x ← x - Ŝ u
            auto ui = ux.batch(di).top_rows(nu), xi = ux.batch(di).bottom_rows(nx);
            trsm(LR, ui);
            auto S = LRS.bottom_rows(nx);
            gemv_sub(S, ui, xi);
            auto λ_last = λ.batch(dn);
            gemv_add(LB, ui, λ_last);
        }
        //  8|  if j > j₁
        if (i + 1 < n) {
            [[maybe_unused]] const auto j_next = sub_wrap_ceil_N(j, 1);
            const auto di_next                 = dn + i + 1;
            // The other workspace, with room for the mj propagated columns plus up to nyM new ones
            auto Υ_next                        = (i & 1 ? Υ2 : Υ1).left_cols(mj + nyM);
            auto Υux_next = Υ_next.top_rows(nu + nx), Υλ_next = Υ_next.bottom_rows(nx);
            auto F_next = data_F.batch(di_next);
            if (mj > 0) {
                GUANAQO_TRACE("Riccati update prop", j_next);
                // 10|  [ Υu(j-1) ]   [ B(j-1)ᵀ Φx(j)   D(j-1)ᵀ ]
                //   |  [ Υx(j-1) ] = [ A(j-1)ᵀ Φx(j)   C(j-1)ᵀ ]
                //   |  [ Υλ(j-1) ]   [    Φλ(j)          0     ]
                // Left block column first
                gemm(F_next.transposed(), Φx, Υux_next.left_cols(mj));
                copy(Φλ, Υλ_next.left_cols(mj));
                // TODO: we may not have to copy Φλ every time. In fact, we can already write it in
                //       the CR workspace.
            }
            {
                GUANAQO_TRACE("Riccati update compress", j_next);
                // Now the right block column, again compressing to only the changing constraints
                if (nyM > 0)
                    m += compress_masks(data_Gᵀ.batch(di_next), ΔΣ.batch(di_next),
                                        Υux_next.right_cols(nyM), 𝑆.middle_rows(mj, nyM));
                // Zero the λ rows of the m - mj columns that were just added
                Υλ_next.middle_cols(mj, m - mj).set_constant(0);
            }
            if (mj > 0) {
                GUANAQO_TRACE("Riccati update Q", j);
                //  9|  Ãcl(j) = Acl(j) + Φλ(j) 𝑆(j) Φx(j)ᵀ
                gemm_diag_add(Φλ, Φx.transposed(), Acl, 𝑆.top_rows(mj));
                // 12|  [ L̃Q(j)  0 ] = [ LQ(j)  Φx(j) ] Q̆x(j),  blkdiag(I, 𝑆(j))-orthogonal
                hyhound_diag(LQ, Φx, 𝑆.top_rows(mj));
            }
            if constexpr (Solve) {
                auto xi = ux.batch(di).bottom_rows(nx), ux_next = ux.batch(di_next),
                     λ_next = λ.batch(di_next), λ_last = λ.batch(dn);
                gemv_add(Acl, λ_next, λ_last); // λ(jn) += Â λ(j-1)
                auto w = tricyqle.work_cr.batch(c).left_cols(1);
                trmm(LQ.transposed(), λ_next, w);          // w = LQᵀ(j) λ(j-1)
                trmm(LQ, w);                               // w = LQ(j) LQᵀ(j) λ(j-1)
                sub(xi, w, w);                             // w = x(j) - LQ(j) LQᵀ(j) λ(j-1)
                gemv_add(F_next.transposed(), w, ux_next); // u(j-1) += BAᵀ(j-1) w
            }
        } else {
            const auto c_prev = sub_wrap_p(c, 1); // c-1
            // Communicate the update ranks mj to all threads and compute the column offsets in the
            // global update workspace we'll write Υ(c) and Υ(c-1) to.
            tricyqle.set_thread_update_rank(ctx, c_prev, mj);
            const index_t i_fwd = c, i_bwd = c_prev;
            const bool rotate = c == 0;
            GUANAQO_TRACE("Riccati update Q", j);
            CYQ_TRACE_WRITE(Upf, i_fwd, 0);
            CYQ_TRACE_WRITE(Upb, i_bwd, 0);
            if (mj > 0) {
                auto Tc    = LH.block(nu - 1, nu, nx, nx); // T(c) = LQ(j₁)⁻ᵀ, see compute_schur
                auto Υ_fwd = tricyqle.work_Ups_fwd(0, i_fwd).left_cols(mj),
                     Υ_bwd_prev = tricyqle.work_Ups_bwd(0, i_bwd).left_cols(mj);
                auto 𝒮cr = tricyqle.work_Σ_fwd(0, i_fwd).top_rows(mj); // \mathscr{S}_c in the paper
                // 12|  [ L̃Q(j)  0 ] = [ LQ(j)  Φx(j) ] Q̆x(j),  blkdiag(I, 𝑆(j))-orthogonal
                // Fused with:
                // 14|  [ L̃A(j₁)  Υ˃(c)   ] = [ LA(j₁)  Φλ(j₁) ] Q̆x(j₁),
                //   |  [ -T̃(c)   Υ˂(c-1) ]   [ -T(c)     0    ]
                hyhound_diag_riccati(LQ, Φx,                  //
                                     Acl, Φλ, Υ_fwd,          //
                                     Tc, /*0*/ Υ_bwd_prev,    // note the lack of a minus sign ...
                                     𝑆.top_rows(mj), rotate); //
                negate(Υ_bwd_prev);                           // which is fixed here (TODO: fuse)
                // 13|  𝒮(c) = 𝑆(j₁)
                rotate ? negate(𝑆.top_rows(mj), 𝒮cr, with_rotate<1>) //
                       : negate(𝑆.top_rows(mj), 𝒮cr);
                // We negate 𝒮(c) because in the CR update, we need blkdiag(-I, 𝒮(c))-orthogonal
                // or blkdiag(I, -𝒮(c))-orthogonal transformations.
            }
            if constexpr (Solve) {
                // x ← LQ⁻¹ x, accumulate LA(j₁) x into λ(jₙ), then x ← LQ⁻ᵀ x
                auto xi = ux.batch(di).bottom_rows(nx), λ_last = λ.batch(dn);
                trsm(LQ, xi);
                gemv_add(Acl, xi, λ_last);
                trsm(LQ.transposed(), xi);
            }
            // Only the thread that handles the first data batch sets or clears the extra update
            if (dn == 0) {
                // Add the contribution from the isolated update for u(0) as well
                if (isolate_u0) {
                    tricyqle.set_update_rank_extra(mu0);
                    copy(Υλ0, tricyqle.work_Ups_extra());
                    negate(𝑆u0, tricyqle.work_Σ_extra());
                } else {
                    tricyqle.clear_update_rank_extra();
                }
            }
        }
    }
}

Algorithm 4: Cyqlone factorization updates

Differences compared to the pseudo-code in the paper:

The update of the last level has been modified to allow for vectorization (v>1), updating the PCR factorization if necessary.
Solution is fused/interleaved with the factorization steps to improve temporal locality and reduce memory bandwidth.
A heuristic rank check is used to decide whether to update or re-factorize the last level.
The update matrices Υ˃(0) are skipped when they are zero (i.e. when the updates to u(0) are handled separately). This saves some unnecessary computation in the scalar case.

High-level update procedure

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Solve>
void CyqloneSolver<VL, T, DefaultOrder, Ctx>::update_solve_impl(Context &ctx, view<> ΔΣ,
                                                                mut_view<> ux, mut_view<> λ) {
    // Steps 2-3 of the pseudo-code, executed by every thread for its own block column:
    //  2|  Υ˃(c;0), Υ˂(c-1;0), 𝒮(c;0) = update-block-column-riccati(c)
    //  3|  update-schur(c)
    update_riccati_solve<Solve>(ctx, ΔΣ, ux, λ);
    //  5|  -- sync --   (Υ˃, Υ˂ and x_next must be available before continuing)
    ctx.arrive_and_wait();
    if constexpr (Solve) {
        // Note: the thread-to-column assignment differs from the one in compute_schur
        // (see compute_schur for the meaning of the batch indices below).
        const index_t thr      = ctx.index;
        const auto thr_next    = add_wrap_p(thr, 1);
        const auto d_first     = thr * n;
        const auto d_last_next = thr_next * n + n - 1;
        auto x_neighbor        = ux.batch(d_last_next).bottom_rows(nx);
        // λ(d_first) -= x_next. The variant with with_rotate<1> is only selected when
        // thr_next is not positive and v != 1.
        if (thr_next > 0 || v == 1)
            sub(λ.batch(d_first), x_neighbor);
        else
            sub(λ.batch(d_first), x_neighbor, with_rotate<1>);
    }
    // Remaining steps: update the block-tridiagonal Schur complement using CR.
    tricyqle.template update_solve_cr<Solve>(ctx, λ, n);
}

Update of the CR factorization

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <bool Solve>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_solve_cr(Context &ctx, mut_view<> λ,
                                                               index_t stride) {
    // Index of the calling thread (called c in the pseudo-code).
    const index_t tid = ctx.index;

    //  6|  if ν₂(c) = 0:  update-L(0, c)
    if (ν2p(tid) == 0) {
        update_L(0, tid);
        if constexpr (Solve) {
            // Fused forward solve with the freshly updated L(c); skipped when p == 1.
            if (p != 1)
                trsm(tril(cr_L.batch(tid)), λ.batch(tid * stride));
        }
    }

    //  7|  for l = 0 ... log₂(P)-1
    for (index_t l = 0; l < lp(); ++l) {
        const auto col = cr_thread_assignment(l, tid);
        //  8|  iU = c+1, iY = c+1-2^l
        const auto iU = add_wrap_ceil_p(col, 1);
        const auto iY = sub_wrap_ceil_p(col, (1 << l) - 1);

        //  9|  -- sync --   (wait for Q̆)
        ctx.arrive_and_wait();

        if (ν2p(iU) == l) {
            // 10|  if ν₂(iU) = l:  update-U(l, iU)
            update_U(l, iU);
            if constexpr (Solve)
                solve_u_forward(l, iU, λ, stride);
        } else if (ν2p(iY) == l) {
            // 11|  elif ν₂(iY) = l:  update-Y(l, iY)
            update_Y(l, iY);
            if constexpr (Solve)
                solve_y_forward(l, iY, λ, work_cr, stride);
        }

        // 12|  -- sync --   (wait for Υ˃, Υ˂)
        ctx.arrive_and_wait();

        // 13|  if ν₂(iY) = l+1:  update-L(l+1, iY)
        if (ν2p(iY) == l + 1)
            update_L(l + 1, iY);
        // The thread that applied U in this level also finishes the right-hand side of row iY.
        if (ν2p(iU) == l) {
            if constexpr (Solve)
                solve_λ_forward(l, iY, λ, work_cr, stride);
        }
    }

    if constexpr (Solve) {
        ctx.arrive_and_wait();
        // TODO: synchronize here if switching to parallel PCR factor in update_L
        // Exactly the threads satisfying this predicate solve the last level (PCR or PCG).
        if (ν2p(tid + 1) + 1 == lp() || p == 1) {
            if (params.solve_method == SolveMethod::PCR)
                solve_pcr(λ.batch(0), work_pcg.batch(0).left_cols(1));
            else
                solve_pcg(λ.batch(0), work_pcg.batch(0));
        }
    }
}

CR factorization update helper functions

Most of the space here is taken up by the updates of the last level, which needs to handle some special cases depending on the final PCR or PCG solver, and depending on whether we perform updates or re-factorization.

The special cases guarded by if constexpr (v == 1) (the scalar case) add some visual overhead, and can safely be ignored.

// Update of a diagonal block of the CR factorization.
//
// For l < lp(), the Cholesky factor tril(cr_L.batch(i)) is updated in-place by a single
// hyhound_diag call, using the workspaces returned by work_Q_cr(l, i) and work_Σ_Q(l, i).
// For l >= lp() (the last level), the blocks cr_L.batch(0) (M0), pcr_L.batch(0) (L0) and
// cr_Y.batch(0) (Y0) are updated or recomputed, and the PCR factorization is either updated
// (update_pcr) or recomputed (factor_pcr), depending on the solve method and on a rank heuristic.
//
// l: CR level.
// i: block column index; in the last-level path it is only used for tracing.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_L(index_t l, index_t i) {
    if (l < lp()) {
        CYQ_TRACE_READ(Upf, i, 0);
        CYQ_TRACE_READ(Upb, i, 0);
        GUANAQO_TRACE("Update L", i);
        CYQ_TRACE_WRITE(Q, i, 0);
        CYQ_TRACE_WRITE(Q, i, 1);
        auto L   = tril(cr_L.batch(i));
        auto UpQ = work_Q_cr(l, i);
        auto Σ   = work_Σ_Q(l, i);
        auto WQ  = work_hyh.batch(i);
        // 16|  [ L̃(i) | 0 ] = [ L(i) | Υ˃(i)  Υ˂(i) ] Q̆(i),  blkdiag(-I, 𝒮(i;l+1))-orthogonal
        hyhound_diag(L, UpQ, Σ, WQ);
        return;
    }
 
    // Last level
    auto M0 = tril(cr_L.batch(0)), L0 = tril(pcr_L.batch(0));
    auto Y0   = cr_Y.batch(0);
    auto Ypen = cr_Y.batch(p / 2), Upen = cr_U.batch(p / 2); // Subdiag blocks of penultimate level
 
    auto Υ0_bwd = work_Ups_bwd_last(), Υ0_fwd = work_Ups_fwd_last();
    auto Σ_bwd = work_Σ_bwd_last(), Σ_fwd = work_Σ_fwd_last();
    // The forward and backward updates have the same rank unless m_update_u0 >= 0.
    BATMAT_ASSERT(Σ_bwd.rows() == Σ_fwd.rows() || m_update_u0 >= 0);
 
    // For p=2, v=4, the update of the last level looks like:
    //
    // [ Υ˂(0)                Υ˃(0) | L(0)                   ]
    // [ Υ˃(2)  Υ˂(2)               | Y(0)  L(2)             ]
    // [        Υ˃(4)  Υ˂(4)        |       Y(2)  L(4)       ]
    // [               Υ˃(6)  Υ˂(6) |             Y(4)  L(6) ]
    //
    // where the blocks are stored as follows:
    //  Υ0_bwd = [ Υ˂(0)  Υ˂(2)  Υ˂(4)  Υ˂(6) ]
    //  Υ0_fwd = [ Υ˃(2)  Υ˃(4)  Υ˃(6)  Υ˃(0) ]
    //  L0     = [ L(0)   L(2)   L(4)   L(6) ]
    //  Y0     = [ Y(0)   Y(2)   Y(4)   -    ]
    //
    // Note that Υ˂ and Υ˃ are aligned by column, not by row. To apply the updates (row-wise),
    // we therefore need to rotate Υ0_fwd by one block to the right first.
 
    // Check the rank to decide whether to update or recompute
    // (the thresholds are fractions of block_size, taken from params).
    const index_t nj      = std::max(Σ_fwd.rows(), Σ_bwd.rows());
    auto pcr_update_thres = params.pcr_max_update_fraction * static_cast<double>(block_size);
    auto y0_update_thres  = params.cr_max_update_fraction_Y0 * static_cast<double>(block_size);
    bool update           = static_cast<double>(nj) < pcr_update_thres;
    bool update_y         = static_cast<double>(nj) < y0_update_thres;
    // With PCR, exactly one of the following happens when v > 1: a low-rank update of the PCR
    // factorization, or a full re-factorization. For v == 1 with a low rank, neither flag is set
    // and only the hyhound_diag updates of L0 below are performed.
    bool do_update_pcr    = params.solve_method == SolveMethod::PCR && update && v > 1;
    bool do_refactor_pcr  = params.solve_method == SolveMethod::PCR && !update;
 
    CYQ_TRACE_READ(Upf, 0, 0);
    CYQ_TRACE_READ(Upb, 0, 0);
    // Perform the PCR update
    if (do_update_pcr)
        update_pcr(Υ0_fwd, Υ0_bwd, Σ_bwd);
 
    { // Update or recompute the matrices Y(0), M(0) and L(0) in the last CR level
        GUANAQO_TRACE("Update L", i);
        // Update or recompute the subdiagonal block Y of the last CR level.
        // If there's only a single thread, we always update because there is no previous CR level
        // to recompute from (we would need to recompute the Riccati products, which is slow).
        // Otherwise, we only update if the rank is sufficiently low.
        if constexpr (v > 1) {
            if (update_y || p == 1)
                gemm_diag_add(Υ0_fwd, Υ0_bwd.transposed(), Y0, Σ_fwd);
            else
                gemm_neg(Ypen, Upen.transposed(), Y0);
        }
        // If at some point in the future we need to refactor PCR, we may need Y(0). So we just
        // always update it here. Alternatively, we could recompute it when needed, but that would
        // complicate the bookkeeping. Besides, we need Y(0) for the PCG case anyway.
 
        // Make sure the diagonal block M of the last CR level is up to date (it is needed for PCR).
        // This is done in two steps, the backward and the forward updates, the latter of which
        // requires a rotation first.
        if (params.solve_method == SolveMethod::PCR)
            syrk_diag_add(Υ0_bwd, M0, Σ_bwd);
        // When using PCG, we need the Cholesky factors L(0) of M(0) for the preconditioner, so
        // update them here. Like with the update of M(0), we do this in two steps.
        // (This also runs for PCR whenever update_pcr was not called above.)
        if (!do_update_pcr)
            hyhound_diag(L0, Υ0_bwd, Σ_bwd);
        // Rotate and repeat for the forward update.
        // Note: the rotations are done in-place, so Σ_fwd and Υ0_fwd remain rotated afterwards.
        batmat::linalg::copy(Σ_fwd, Σ_fwd, with_rotate<-1>);
        batmat::linalg::copy(Υ0_fwd, Υ0_fwd, with_rotate<-1>);
        if (params.solve_method == SolveMethod::PCR)
            syrk_diag_add(Υ0_fwd, M0, Σ_fwd);
        if (!do_update_pcr)
            hyhound_diag(L0, Υ0_fwd, Σ_fwd);
        // TODO: we should actually merge these two hyhound_diag calls to make sure that the
        //       intermediate matrix after the backward update does not become indefinite
        //       (although this shouldn't be an issue for QPALM, at least not in exact arithmetic).
        //       We already have the code for this in update_pcr_level.
    }
 
    // Finally, recompute the PCR factorization if we did not do an update.
    if (do_refactor_pcr)
        factor_pcr(); // TODO: use parallel variant (when doing so, synchronize in update_solve_cr)
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_U(index_t l, index_t i) {
    // Column whose backward update matrix Υ˂ is consumed and propagated by this call.
    const index_t i_bwd = sub_wrap_ceil_p(i, 1 << l);
    CYQ_TRACE_READ(Upb, i_bwd, 0);
    CYQ_TRACE_READ(Q, i, 1);
    GUANAQO_TRACE("Update U", i);
    CYQ_TRACE_WRITE(Upb, i_bwd, 0);
    auto bwd_cur  = work_Ups_bwd(l, i_bwd);
    auto bwd_next = work_Ups_bwd(l + 1, i_bwd);
    if constexpr (v == 1) {
        // Scalar case only: i >= p occurs when p is not a power of two.
        if (i >= p) {
            // No matrix Q̆(i) exists for this column, so the backward update matrix is simply
            // carried over to the next level (unless both levels share the same storage).
            if (bwd_cur.data() != bwd_next.data())
                copy(bwd_cur, bwd_next);
            // With an odd number of threads, update_Y is never called for this column i,
            // which means the forward update matrix has to be carried over here too.
            index_t i_fwd = add_wrap_ceil_p(i, 1 << l);
            if (i_fwd >= p)
                i_fwd = 0;
            if (i_fwd == 0 && m_update_u0 >= 0)
                return; // Υ˃(0) = 0
            auto fwd_cur  = work_Ups_fwd(l, i_fwd);
            auto fwd_next = work_Ups_fwd(l + 1, i_fwd);
            if (fwd_cur.data() != fwd_next.data())
                copy(fwd_cur, fwd_next);
            return;
        }
    }
    auto reflectors = work_Q_cr(l, i);
    auto signs      = work_Σ_Q(l, i);
    auto workspace  = work_hyh.batch(i);
    auto Ui         = cr_U.batch(i);
    // 18|  [ Ũ(i) | Υ˂(i-2^l;l+1) ] = [ U(i) | Υ˂(i-2^l;l)  0 ] Q̆(i)
    hyhound_diag_apply(Ui, bwd_cur, bwd_next, //
                       reflectors, signs, workspace, 0);
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_Y(index_t l, index_t i) {
    // Column whose forward update matrix Υ˃ is consumed and propagated by this call.
    index_t i_fwd = add_wrap_ceil_p(i, 1 << l);
    // Tracing uses the index before it is wrapped to zero below.
    CYQ_TRACE_READ(Upf, i_fwd, 0);
    CYQ_TRACE_READ(Q, i, 0);
    GUANAQO_TRACE("Update Y", i);
    CYQ_TRACE_WRITE(Upf, i_fwd, 0);
    if (i_fwd >= p)
        i_fwd = 0;
    // Nothing to do when the forward update of column 0 vanishes: Υ˃(0) = 0
    if (i_fwd == 0 && m_update_u0 >= 0)
        return;
    auto reflectors = work_Q_cr(l, i);
    auto signs      = work_Σ_Q(l, i);
    auto workspace  = work_hyh.batch(i);
    auto Yi         = cr_Y.batch(i);
    auto fwd_cur    = work_Ups_fwd(l, i_fwd);
    auto fwd_next   = work_Ups_fwd(l + 1, i_fwd);
    // 20|  [ Ỹ(i) | Υ˃(i+2^l;l+1) ] = [ Y(i) | 0  Υ˃(i+2^l;l) ] Q̆(i)
    // The last argument is the difference in column count between the next-level and the
    // current-level update matrices.
    hyhound_diag_apply(Yi, fwd_cur, fwd_next, //
                       reflectors, signs, workspace, fwd_next.cols() - fwd_cur.cols());
}

Algorithm 5: CR: Solution of a symmetric block-tridiagonal system using cyclic reduction

Differences compared to the pseudo-code in the paper:

We use an iterative approach to factor all levels, instead of recursion.
The right-hand side vector λ is updated in-place.
The vector λ contains all stages of the original problem, not just the stages that are handled by CR. Therefore, it is indexed using the data batch index di = n bi, not the cyclic reduction batch index bi.
Y(k-2^l) b̃(k-2^l) is stored in a temporary workspace to allow it to be evaluated concurrently with U(k+2^l) b̃(k+2^l), as they both update b(k)⁺. Similarly for the backward solve, where U(k)ᵀ x(k-2^l) is stored in a temporary workspace to avoid races on x(k).
The last level is not handled here, because it is solved using PCG or PCR.
The forward solve is fused with the factorization above.

Serial reverse solve

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_reverse_serial(mut_view<> λ, mut_view<> work,
                                                                    index_t stride) const {
    // Walk through the CR levels from the last one down to level 0, emulating each of the
    // p threads of the parallel variant one after the other.
    for (index_t l = lp(); l-- > 0;) {
        // First pass: finish λ for the rows eliminated in level l+1. This is skipped for the
        // topmost level, because λ(0) was already computed during the forward solve.
        for (index_t thr = 0; thr < p; ++thr) {
            const index_t col = cr_thread_assignment(l, thr);
            const index_t i_y = sub_wrap_ceil_p(col, (1 << l) - 1);
            if (l < lp() - 1 && ν2p(i_y) == l + 1)
                solve_λ_backward(i_y, λ, work, stride);
        }
        // Second pass: apply the transposed off-diagonal blocks U or Y of level l.
        for (index_t thr = 0; thr < p; ++thr) {
            const index_t col = cr_thread_assignment(l, thr);
            const index_t i_u = add_wrap_ceil_p(col, 1);
            const index_t i_y = sub_wrap_ceil_p(col, (1 << l) - 1);
            if (ν2p(i_u) == l) {
                solve_u_backward(l, i_u, λ, work, stride);
            } else if (ν2p(i_y) == l) {
                solve_y_backward(l, i_y, λ, stride);
            }
        }
    }
    // Finally, finish λ for the rows eliminated in level 0 (none if p == 1).
    for (index_t thr = 0; thr < p; ++thr) {
        if (ν2p(thr) == 0 && p != 1)
            solve_λ_backward(thr, λ, work, stride);
    }
}

Parallel reverse solve

// Parallel backward solve of the CR factorization, called by every thread with its own ctx.
// Performs the same solve_λ_backward / solve_u_backward / solve_y_backward calls as
// solve_reverse_serial, but each thread only handles the columns assigned to it through
// cr_thread_assignment, with barriers between the phases.
//
// The barriers are split into ctx.arrive() and ctx.wait(token): threads that have no work in
// the upcoming phase issue prefetches between the two calls (presumably to overlap memory
// traffic with the time spent waiting — TODO confirm).
//
// λ:      right-hand side, overwritten in-place.
// work:   workspace passed on to solve_λ_backward and solve_u_backward.
// stride: factor converting a CR block index into a batch index of λ.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_reverse_parallel(Context &ctx, mut_view<> λ,
                                                                      mut_view<> work,
                                                                      index_t stride) const {
    const index_t c = ctx.index;
    // Levels are visited from the last one down to level 0.
    for (index_t l = lp(); l-- > 0;) {
        const auto c_     = cr_thread_assignment(l, c);
        const index_t i_u = add_wrap_ceil_p(c_, 1), i_y = sub_wrap_ceil_p(c_, (1 << l) - 1);
        if (l < lp() - 1) {              // λ(0) was already computed during forward solve
            auto wait_uy = ctx.arrive(); // wait for Uᵀλ, Yᵀλ
            if (ν2p(i_y) == l + 1) {
                ctx.wait(std::move(wait_uy));
                solve_λ_backward(i_y, λ, work, stride);
            } else if (ν2p(i_u) == l) {
                // This thread applies U in the next phase: prefetch it before waiting.
                prefetch_U(l, i_u);
                ctx.wait(std::move(wait_uy));
            } else {
                // This thread may apply Y in the next phase: prefetch it before waiting.
                if (ν2p(i_y) == l)
                    prefetch_Y(l, i_y);
                ctx.wait(std::move(wait_uy));
            }
        }
        auto wait_λ = ctx.arrive(); // wait for λ
        if (ν2p(i_u) == l) {
            ctx.wait(std::move(wait_λ));
            solve_u_backward(l, i_u, λ, work, stride);
        } else if (ν2p(i_y) == l) {
            ctx.wait(std::move(wait_λ));
            solve_y_backward(l, i_y, λ, stride);
        } else {
            // Idle in this phase: prefetch the blocks of the next (lower) level for which this
            // thread will call solve_λ_backward, then wait.
            if (l > 0) {
                const auto l_next = l - 1, c_next = cr_thread_assignment(l_next, c);
                const index_t i_u_next = add_wrap_ceil_p(c_next, 1),
                              i_y_next = sub_wrap_ceil_p(c_next, (1 << l_next) - 1);
                if (ν2p(i_y_next) == l_next + 1) {
                    prefetch_U(l_next, i_u_next);
                    prefetch_L(i_y_next);
                }
            }
            ctx.wait(std::move(wait_λ));
        }
    }
    ctx.arrive_and_wait(); // wait for Uᵀλ, Yᵀλ
    // Finish λ for the rows eliminated in level 0 (none if p == 1).
    if (ν2p(c) == 0 && p != 1)
        solve_λ_backward(c, λ, work, stride);
}

CR solve helper functions

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_u_forward(index_t l, index_t iU, mut_view<> λ,
                                                               index_t stride) const {
    if constexpr (v == 1) {
        // Scalar case where p is not a power of two: nothing to do for iU >= p
        // (unless the system is circular).
        if (!circular && iU >= p)
            return;
    }
    // Row being updated: k = iU - 2^l (i.e. iU = k+2^l)
    const index_t iL  = sub_wrap_ceil_p(iU, 1 << l);
    const index_t diL = iL * stride;
    const index_t diU = iU * stride;
    // 16|  b(0)⁺ = b(0) - U(2^l) b̃(2^l)
    // 21|  b(k)⁺ = b(k) - Y(k-2^l) b̃(k-2^l) - U(k+2^l) b̃(k+2^l)
    // Only the U term is handled here: λ(diL) -= U(iU) λ(diU)
    GUANAQO_TRACE("Subtract Ub", iL);
    gemv_sub(cr_U.batch(iU), λ.batch(diU), λ.batch(diL));
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_y_forward(index_t l, index_t iY, mut_view<> λ,
                                                               mut_view<> w, index_t stride) const {
    if constexpr (v == 1) {
        // Scalar case: Y(iY)=0 when iY + 2^l falls outside the range (unless circular).
        if (!circular && iY + (1 << l) >= p)
            return;
    }
    // Row whose right-hand side receives the contribution: k = iY + 2^l (i.e. iY = k-2^l)
    const index_t iL  = add_wrap_ceil_p(iY, 1 << l);
    const index_t diY = iY * stride;
    // 21|  b(k)⁺ = b(k) - Y(k-2^l) b̃(k-2^l) - U(k+2^l) b̃(k+2^l)
    // Only the product is formed here, w(iL) = Y(iY) λ(diY); it is subtracted from λ later
    // in solve_λ_forward.
    GUANAQO_TRACE("Subtract Yb", iL);
    gemv(cr_Y.batch(iY), λ.batch(diY), w.batch(iL));
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_λ_forward(index_t l, index_t iL, mut_view<> λ,
                                                               view<> w, index_t stride) const {
    const index_t diL = iL * stride;
    const index_t iY  = sub_wrap_ceil_p(iL, 1 << l);
    // 21|  b(k)⁺ = b(k) - Y(k-2^l) b̃(k-2^l) - U(k+2^l) b̃(k+2^l)
    // Subtract the Y contribution that solve_y_forward left in the workspace, unless it was
    // skipped there. The condition is equivalent to iL >= (1 << l), kept for clarity.
    if (v > 1 || iY + (1 << l) < p || circular) {
        // λ(diL) -= w(iL), using with_rotate<-1> for row 0
        GUANAQO_TRACE("Subtract work b", iL);
        if (iL == 0)
            sub(λ.batch(diL), w.batch(iL), with_rotate<-1>);
        else
            sub(λ.batch(diL), w.batch(iL));
    }
    // 14|  b̃(k)⁺ = L(k)⁻¹ b(k)⁺    -- for the next level
    // Row 0 belongs to the last level, which is not solved here.
    if (ν2p(iL) == l + 1 && iL != 0) {
        GUANAQO_TRACE("Solve b", iL);
        // λ(diL) = L(iL)⁻¹ λ(diL)
        trsm(tril(cr_L.batch(iL)), λ.batch(diL));
    }
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_u_backward(index_t l, index_t iU, mut_view<> λ,
                                                                mut_view<> w,
                                                                index_t stride) const {
    if constexpr (v == 1) {
        // Scalar case where p is not a power of two: nothing to do for iU >= p
        // (unless the system is circular).
        if (!circular && iU >= p)
            return;
    }
    // Row providing the already computed solution: k = iU - 2^l (i.e. iU = k+2^l)
    const index_t iL  = sub_wrap_ceil_p(iU, 1 << l);
    const index_t diL = iL * stride;
    // 25|  x(k) = L(k)⁻ᵀ (b̃(k) - Y(k)ᵀ x(k+2^l) - U(k)ᵀ x(k-2^l))
    // Only the product is formed here, w(iU) = U(iU)ᵀ λ(diL); it is subtracted from λ later
    // in solve_λ_backward.
    GUANAQO_TRACE("Subtract Uᵀb", iL);
    gemv(cr_U.batch(iU).transposed(), λ.batch(diL), w.batch(iU));
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_y_backward(index_t l, index_t iY, mut_view<> λ,
                                                                index_t stride) const {
    if constexpr (v == 1) {
        // Scalar case: Y(iY)=0 when iY + 2^l falls outside the range (unless circular).
        if (!circular && iY + (1 << l) >= p)
            return;
    }
    // Row providing the already computed solution: k = iY + 2^l (i.e. iY = k-2^l)
    const index_t iL  = add_wrap_ceil_p(iY, 1 << l);
    const index_t diY = iY * stride;
    const index_t diL = iL * stride;
    auto Yblk         = cr_Y.batch(iY);
    // 25|  x(k) = L(k)⁻ᵀ (b̃(k) - Y(k)ᵀ x(k+2^l) - U(k)ᵀ x(k-2^l))
    GUANAQO_TRACE("Subtract Yᵀb", iL);
    // λ(diY) -= Y(iY)ᵀ λ(diL); the variant with with_rotate_B<1> is only selected when
    // v != 1 and iL is not positive.
    if (v == 1 || iL > 0)
        gemv_sub(Yblk.transposed(), λ.batch(diL), λ.batch(diY));
    else
        gemv_sub(Yblk.transposed(), λ.batch(diL), λ.batch(diY), with_rotate_B<1>);
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_λ_backward(index_t iL, mut_view<> λ, view<> w,
                                                                index_t stride) const {
    // Batch index of row k = iL in λ
    const index_t diL = iL * stride;
    // 25|  x(k) = L(k)⁻ᵀ (b̃(k) - Y(k)ᵀ x(k+2^l) - U(k)ᵀ x(k-2^l))
    {
        // Subtract the product stored in the workspace: λ(diL) -= w(iL)
        GUANAQO_TRACE("Subtract work b", iL);
        sub(λ.batch(diL), w.batch(iL));
    }
    // Back-substitution with the transposed Cholesky factor: λ(diL) = L(iL)⁻ᵀ λ(diL)
    GUANAQO_TRACE("Solve b", iL);
    // Row 0 is never handled by this function.
    BATMAT_ASSUME(iL != 0);
    trsm(tril(cr_L.batch(iL)).transposed(), λ.batch(diL));
}

Algorithm 6: PCR: Solution of a symmetric block-tridiagonal system using parallel cyclic reduction

Differences compared to the pseudo-code in the paper:

The solution step is separated from the factorization step. For the factorization, we use the periodic version below.
The solution is done in-place on the input λ.
We use an iterative approach to factor all levels, instead of recursion.

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_pcr(mut_batch_view<> λ,
                                                         mut_batch_view<> work_pcr) const {
    // Run the reduction levels 0, 1, ..., lv()-1 in order. The level is a compile-time
    // constant, so the loop is unrolled through a fold expression over an index sequence.
    const auto reduce_all_levels = [&]<index_t... Ls>(std::integer_sequence<index_t, Ls...>) {
        (this->template solve_pcr_level<Ls>(λ, work_pcr), ...);
    };
    reduce_all_levels(std::make_integer_sequence<index_t, lv()>{});
    GUANAQO_TRACE("Solve PCR", lv());
    //  5|  x(k) = L(k)⁻ᵀ L(k)⁻¹ b(k)
    // Forward and backward substitution with the Cholesky factor of the final level, in-place.
    auto L_final = pcr_L.batch(lv());
    trsm(tril(L_final), λ);
    trsm(triu(L_final.transposed()), λ);
}
 
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <index_t Level>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::solve_pcr_level(mut_batch_view<> λ,
                                                               mut_batch_view<> work_pcr) const {
    GUANAQO_TRACE("Solve PCR", Level);
    // Rotation distance of this level: 2^l
    static constexpr auto hop = 1 << Level;
    auto L = pcr_L.batch(Level);
    auto Y = pcr_Y.batch(Level);
    auto U = pcr_U.batch(Level);
    //  8|  b̃(k) = L(k)⁻¹ b(k)    -- stored in the workspace, λ itself is left untouched
    trsm(tril(L), λ, work_pcr);
    // 11|  b(k)⁺ = b(k) - Y(k-2^l) b̃(k-2^l) - U(k+2^l) b̃(k+2^l)
    // The Y term is omitted in the last level when merge_last_level_pcr is set.
    if constexpr (Level + 1 < lv() || !merge_last_level_pcr)
        gemv_sub(Y, work_pcr, λ, with_rotate_C<+hop>, with_rotate_D<+hop>);
    gemv_sub(U, work_pcr, λ, with_rotate_C<-hop>, with_rotate_D<-hop>);
}

Algorithm 7: Periodic PCR factorization of a block-tridiagonal matrix

Differences compared to the pseudo-code in the paper:

The factorization is done in-place on pcr_L, and the intermediate matrices K˂ and K˃ are stored in pcr_U and pcr_Y, before solving them in-place.
Triangular solves of the subdiagonal blocks are optionally parallelized.

Serial PCR factorization

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_pcr() {
    // Factor the levels 0, 1, ..., lv()-1 in order. The level is a compile-time constant,
    // so the loop is unrolled through a fold expression over an index sequence.
    const auto factor_all_levels = [this]<index_t... Ls>(std::integer_sequence<index_t, Ls...>) {
        (this->template factor_pcr_level<Ls>(), ...);
    };
    factor_all_levels(std::make_integer_sequence<index_t, lv()>{});
}
 
// Serial factorization of a single PCR level.
//
// Reads the diagonal blocks M (cr_L.batch(0) for level 0, pcr_M.batch(0) otherwise), the
// current Cholesky factor pcr_L.batch(Level) and the sub-diagonal blocks K (cr_Y.batch(0) for
// level 0, pcr_Y.batch(Level) otherwise). Writes U and Y of this level, the diagonal blocks of
// the next level (pcr_M.batch(0)), their Cholesky factor (pcr_L.batch(Level + 1)) and, except
// in the last level, the sub-diagonal blocks of the next level (pcr_Y.batch(Level + 1)).
// Note that for Level > 0, K and Y refer to the same storage, so U must be computed from K
// before Y is solved in-place.
//
// The level is a template parameter to allow for compile-time vector rotations.
// The number of levels is small, so this should not bloat the code too much.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <index_t Level>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_pcr_level() {
    GUANAQO_TRACE("Factor PCR", Level);
    auto M      = Level == 0 ? cr_L.batch(0) : pcr_M.batch(0);
    auto K      = Level == 0 ? cr_Y.batch(0) : pcr_Y.batch(Level);
    auto M_next = pcr_M.batch(0);
    auto L = pcr_L.batch(Level), Y = pcr_Y.batch(Level), U = pcr_U.batch(Level);
    static constexpr auto r = 1 << Level; // 2^l
 
    //  9|  K̊(k) = K(k) + K(k+2^l)ᵀ
    if constexpr (Level + 1 == lv() && merge_last_level_pcr) {
        // In the last level, we only have a single sub-diagonal block, which is computed as
        // K(k) = -Y(k+2^l) U(k+2^l)ᵀ - U(k-2^l) Y(k-2^l)ᵀ. Since 2^l = -2^l mod v, we only need to
        // compute one term, and then add its transpose, K(k) ← K(k) + K(k+2^l)ᵀ. Because the right
        // half of the batches in K are zero in the absence of coupling between the first and
        // last blocks, we can perform the transposition in-place.
        if (!circular) {
            GUANAQO_TRACE("Merge last PCR level", Level, K.depth() / 2 * K.rows() * K.cols());
            using namespace batmat::datapar;
            using simd_half = deduced_simd<T, v / 2>;
            // Copy the transpose of the first v/2 lanes of each entry into the last v/2 lanes.
            for (index_t j = 0; j < K.cols(); ++j)
                for (index_t i = 0; i < K.rows(); ++i)
                    aligned_store(aligned_load<simd_half>(&K(0, j, i)), &K(v / 2, i, j));
        } else {
            GUANAQO_TRACE("Merge last PCR level", Level, 2 * K.depth() * K.rows() * K.cols());
            // In case of circular coupling, we cannot exploit the complementarity of the batches,
            // so we cannot perform the transposition in-place. Instead, we transpose it into U
            // first (U is not used here, so we can overwrite it), and then add it to K.
            // TODO: is there a better way?
            batmat::linalg::copy(K.transposed(), U, with_rotate<-r>);
            linalg::add(K, U);
        }
    }
 
    //  4|  U(k) = K(k-2^l)ᵀ L(k)⁻ᵀ
    // 10|  U(k) = K̊(k-2^l)ᵀ L(k)⁻ᵀ
    trsm(K.transposed(), triu(L.transposed()), U, with_rotate_A<-r>);
    //  5|  Y(k) = K(k) L(k)⁻ᵀ
    if constexpr (Level + 1 < lv() || !merge_last_level_pcr)
        trsm(K, triu(L.transposed()), Y);
    //  8|  M(k)⁺ = M(k) - Y(k-2^l) Y(k-2^l)ᵀ - U(k+2^l) U(k+2^l)ᵀ
    // 11|  M(k)⁺ = M(k) - U(k+2^l) U(k+2^l)ᵀ
    //      -- implemented as M(k-2^l)⁺ = M(k-2^l) - U(k) U(k)ᵀ
    syrk_sub(U, tril(M), tril(M_next), with_rotate_C<-r>, with_rotate_D<-r>);
    //      -- followed by    M(k+2^l)⁺ -= Y(k) Y(k)ᵀ    (except in the last level)
    if constexpr (Level + 1 < lv() || !merge_last_level_pcr)
        syrk_sub(Y, tril(M_next), with_rotate_C<+r>, with_rotate_D<+r>);
    //  2|  L(k)⁺ = chol(M(k)⁺)    -- for the next level
    // 12|  L(k)⁺ = chol(M(k)⁺)    -- for the last level
    potrf(tril(M_next), tril(pcr_L.batch(Level + 1)));
    if constexpr (Level + 1 < lv()) {
        auto K_next = pcr_Y.batch(Level + 1);
        //  7|  K(k)⁺ = -Y(k+2^l) U(k+2^l)ᵀ    -- implemented as K(k-2^l)⁺ = -Y(k) U(k)ᵀ
        gemm_neg(Y, U.transposed(), K_next, {}, with_rotate_C<-r>, with_rotate_D<-r>);
        // TODO: we could store K_next in U instead of Y, so the last level would not need extra
        //       storage. But this is more complex, as we need to transpose it here, so we can
        //       perform the trsm in the next level in-place (which is not possible if the input
        //       and output are transposed).
    }
}

Parallel PCR factorization

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_pcr_parallel(Context &ctx) {
    // Factor the levels 0, 1, ..., lv()-1 in order, every thread passing its own context.
    // The level is a compile-time constant, so the loop is unrolled through a fold expression
    // over an index sequence.
    const auto factor_all_levels =
        [this, &ctx]<index_t... Ls>(std::integer_sequence<index_t, Ls...>) {
            (this->template factor_pcr_level_parallel<Ls>(ctx), ...);
        };
    factor_all_levels(std::make_integer_sequence<index_t, lv()>{});
}
 
// Parallel factorization of a single PCR level, called by every thread with its own ctx.
// The work is split between two threads, "primary" and "secondary" (selected below from
// ctx.index); all other threads only take part in the barriers.
//  - primary:   solves for U, updates the diagonal blocks and computes their Cholesky factor.
//  - secondary: merges K in the last level, solves for Y and forms K of the next level.
//
// Unlike in factor_pcr_level, the sub-diagonal blocks K of levels > 0 are kept in
// pcr_L.batch(Level + 1) rather than in pcr_Y.batch(Level), so K and Y do not share storage
// (presumably so that the U and Y solves can run concurrently — TODO confirm). The lower
// triangle of that batch is overwritten by the potrf call below, after the last read of K.
//
// NOTE(review): the merge step and the Y solve below are not guarded by merge_last_level_pcr,
// whereas the corresponding steps in factor_pcr_level are — confirm that this variant is only
// used when merge_last_level_pcr is true.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <index_t Level>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::factor_pcr_level_parallel(Context &ctx) {
    auto M      = Level == 0 ? cr_L.batch(0) : pcr_M.batch(0);
    auto K      = Level == 0 ? cr_Y.batch(0) : pcr_L.batch(Level + 1);
    auto M_next = pcr_M.batch(0);
    auto L = pcr_L.batch(Level), Y = pcr_Y.batch(Level), U = pcr_U.batch(Level);
    static constexpr auto r = 1 << Level; // 2^l
 
    // Use the same thread assignment as CR
    BATMAT_ASSUME(ctx.num_thr >= 2);
    const bool primary   = ν2p(ctx.index + 1) + 1 == lp(),
               secondary = ν2p(ctx.index + 1 + p / 2) + 1 == lp();
 
    if (secondary && Level + 1 == lv()) {
        GUANAQO_TRACE("Merge last PCR level", Level, K.depth() / 2 * K.rows() * K.cols());
        // In the last level, we only have a single sub-diagonal block, which is computed as
        // K(k) = -Y(k+2^l) U(k+2^l)ᵀ - U(k-2^l) Y(k-2^l)ᵀ. Since 2^l = -2^l mod v, we only need to
        // compute one term, and then add its transpose, K(k) ← K(k) + K(k+2^l)ᵀ. Because the right
        // half of the batches in K are zero in the absence of coupling between the first and
        // last blocks, we can perform the transposition in-place.
        if (!circular) {
            using namespace batmat::datapar;
            using simd_half = deduced_simd<T, v / 2>;
            // Copy the transpose of the first v/2 lanes of each entry into the last v/2 lanes.
            for (index_t j = 0; j < K.cols(); ++j)
                for (index_t i = 0; i < K.rows(); ++i)
                    aligned_store(aligned_load<simd_half>(&K(0, j, i)), &K(v / 2, i, j));
        } else {
            GUANAQO_TRACE("Merge last PCR level", Level, 2 * K.depth() * K.rows() * K.cols());
            // In case of circular coupling, we cannot exploit the complementarity of the batches,
            // so we cannot perform the transposition in-place. Instead, we transpose it into U
            // first (U is not used here, so we can overwrite it), and then add it to K.
            // TODO: is there a better way?
            batmat::linalg::copy(K.transposed(), U, with_rotate<-r>);
            linalg::add(K, U);
        }
    }
 
    ctx.arrive_and_wait(); // wait for L and K
 
    if (primary) {
        GUANAQO_TRACE("Factor PCR U", Level);
        //  8|  U(k) = K(k-2^l)ᵀ L(k)⁻ᵀ
        trsm(K.transposed(), triu(L.transposed()), U, with_rotate_A<-r>);
    } else if (secondary && Level + 1 < lv()) {
        GUANAQO_TRACE("Factor PCR Y", Level);
        //  7|  Y(k) = K(k) L(k)⁻ᵀ
        trsm(K, triu(L.transposed()), Y);
    }
 
    // In the last level, only the primary thread continues, so no barrier is needed there.
    if (Level + 1 < lv())
        ctx.arrive_and_wait(); // wait for U and Y
 
    if (primary) {
        GUANAQO_TRACE("Factor PCR L", Level);
        // 10|  M(k)⁺ = M(k) - Y(k-2^l) Y(k-2^l)ᵀ - U(k+2^l) U(k+2^l)ᵀ
        //      -- implemented as M(k-2^l)⁺ = M(k-2^l) - U(k) U(k)ᵀ
        syrk_sub(U, tril(M), tril(M_next), with_rotate_C<-r>, with_rotate_D<-r>);
        //      -- followed by    M(k+2^l)⁺ -= Y(k) Y(k)ᵀ
        if constexpr (Level + 1 < lv() || !merge_last_level_pcr)
            syrk_sub(Y, tril(M_next), with_rotate_C<+r>, with_rotate_D<+r>);
        //  3|  L(k)⁺ = chol(M(k)⁺)    -- for the next level
        potrf(tril(M_next), tril(pcr_L.batch(Level + 1)));
    } else if (secondary && Level + 1 < lv()) {
        GUANAQO_TRACE("Factor PCR K", Level);
        // K of the next level is stored where the next level expects it (see K above).
        auto K_next = pcr_L.batch(Level + 2);
        // 11|  K(k)⁺ = -Y(k+2^l) U(k+2^l)ᵀ    -- implemented as K(k-2^l)⁺ = -Y(k) U(k)ᵀ
        gemm_neg(Y, U.transposed(), K_next, {}, with_rotate_C<-r>, with_rotate_D<-r>);
    }
}

Algorithm 8: Periodic PCR factorization updates by a block-bidiagonal matrix

Differences compared to the pseudo-code in the paper:

Updates are performed in-place on pcr_L, pcr_U and pcr_Y.
Intermediate update matrices are left rotated in memory to minimize the number of rotations required.

template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_pcr(batch_view<> fwd, batch_view<> bwd,
                                                          batch_view<> Σbwd) {
    // Number of update columns; the forward and backward update matrices must agree.
    const index_t m = fwd.cols();
    BATMAT_ASSUME(m == bwd.cols());
    // Workspace layout: the Y part occupies the left half and the U part the right half of the
    // joint workspace. Both start in the middle and grow outwards in later levels.
    const index_t half_cols = VL * m / 2;
    auto ws_YU = work_update_pcr_UY.left_cols(VL * m).batch(0);
    auto ws_Y  = ws_YU.left_cols(half_cols);
    auto ws_U  = ws_YU.right_cols(half_cols);
    auto ws_Σ  = work_update_pcr_Σ.top_rows(VL * m).batch(0);
    // Initialize the workspaces with the level-0 update matrices: the backward one as is,
    // the forward one using with_rotate<-1>.
    batmat::linalg::copy(bwd, ws_U.left_cols(m));
    batmat::linalg::copy(fwd, ws_Y.right_cols(m), with_rotate<-1>);
    batmat::linalg::copy(Σbwd, ws_Σ.bottom_rows(m));
    // Update the levels 0, 1, ..., lv()-1 in order (the level is a compile-time constant).
    const auto update_all_levels = [&]<index_t... Ls>(std::integer_sequence<index_t, Ls...>) {
        (this->template update_pcr_level<Ls>(m, ws_YU, ws_Σ), ...);
    };
    update_all_levels(std::make_integer_sequence<index_t, TricyqleSolver::lv()>{});
}
 
/// Apply level @p Level of the PCR factorization update, modifying the factors of that level
/// in-place.
///
/// - Levels with `Level + 1 < lv()`: pcr_L.batch(Level), pcr_Y.batch(Level) and
///   pcr_U.batch(Level) are passed to hyhound_diag_cyclic together with the update matrices; the
///   resulting update matrices for the next level are left in @p WYU.
/// - Last level: pcr_L.batch(Level) and pcr_U.batch(Level) are passed to hyhound_diag_2, followed
///   by a hyhound_diag call on the final diagonal block pcr_L.batch(Level + 1).
///
/// @tparam Level  PCR level l processed by this call.
/// @param m       Number of columns of the level-0 update matrices (update_pcr passes fwd.cols()).
///                Level l operates on blocks of `m << l` columns.
/// @param WYU     Workspace holding the Y update matrices in its left half (right-aligned) and the
///                U update matrices in its right half (left-aligned). It is modified in-place and
///                left in a rotated state for the next level.
/// @param WΣ      Workspace of which the bottom `2 * (m << l)` rows are used at this level. It is
///                modified in-place.
template <index_t VL, class T, StorageOrder DefaultOrder, class Ctx>
template <index_t Level>
void TricyqleSolver<VL, T, DefaultOrder, Ctx>::update_pcr_level(index_t m, mut_batch_view<> WYU,
                                                                mut_batch_view<> WΣ) {
    constexpr index_t l = Level;
    // The algorithm requires the update matrices that are not reduced in the current level to be
    // offset by 2^l. We could do this by first rotating them by 2^l, applying the Householder
    // transformations, and then rotating them back. However, this would be inefficient, so instead
    // we leave the workspace rotated by 2^l from the previous level, and adjust the rotations in
    // the next level.
    // rot = 2^l is the offset of this level, prev_rot = 2^(l-1) the offset of the previous level
    // (prev_rot is zero at level 0, in which case there is no earlier rotation to account for).
    constexpr index_t rot = 1 << l, prev_rot = rot >> 1;
    // Width of one block of update columns at this level: m·2^l.
    const index_t ml = m << l;
    GUANAQO_TRACE("Update PCR", l);
    auto Σ = WΣ.bottom_rows(2 * ml);
    // The bottom ml rows of Σ are exactly the rows that made up Σ at the previous level
    // (2·m·2^(l-1) = ml). They are first rotated in-place by +prev_rot ...
    if constexpr (prev_rot != 0)
        batmat::linalg::copy(Σ.bottom_rows(ml), Σ.bottom_rows(ml), with_rotate<+prev_rot>);
    // ... and a copy rotated by -rot is then written to the top ml rows, so Σ has one row per
    // column of the 2·ml-column update matrices below.
    batmat::linalg::copy(Σ.bottom_rows(ml), Σ.top_rows(ml), with_rotate<-rot>);
    if constexpr (l + 1 < lv()) {
        // Not the last level: update L, Y and U, and produce the update matrices for level l+1.
        //          S(-1)    S(0)
        //  WL = [ Υ˃(0)  | Υ˂(0)  ]
        //  WY = [   0    | Υ˃(+1) ]
        //  WU = [ Υ˂(-1) |   0    ]
        auto WL  = work_update_pcr_L.left_cols(2 * ml).batch(0);
        // 2·ml-wide windows around the middle of WYU: WU0 is the start of the U half, W0Y is the
        // end of the Y half. WU and WY are the ml columns of each window that are adjacent to the
        // middle of WYU.
        auto WU0 = WYU.right_cols(VL * m / 2).left_cols(2 * ml);
        auto W0Y = WYU.left_cols(VL * m / 2).right_cols(2 * ml);
        auto WY  = W0Y.right_cols(ml);
        auto WU  = WU0.left_cols(ml);
        // undo workspace rotation
        batmat::linalg::copy(WY, WL.left_cols(ml), with_rotate<-prev_rot>);
        batmat::linalg::copy(WU, WL.right_cols(ml), with_rotate<+prev_rot>);
        // rotate element k-2^l to position k (but the workspace is already at -prev_rot)
        batmat::linalg::copy(WU, WU, with_rotate<-rot + prev_rot>);
        // rotate element k+2^l to position k (but the workspace is already at +prev_rot)
        batmat::linalg::copy(WY, WY, with_rotate<+rot - prev_rot>);
        // [ L̃(k;l) |       0       ]   [ L(k;l) | Υ˃(k;l)      Υ˂(k;l)     ]
        // [ Ũ(k;l) | Υ˂(k-2^l;l+1) ] = [ U(k;l) | Υ˂(k-2^l;l)     0        ] Q̆(k;l)
        // [ Ỹ(k;l) | Υ˃(k+2^l;l+1) ] = [ Y(k;l) |    0         Υ˃(k+2^l;l) ]
        // NOTE(review): the ml-wide views WY/WU are presumably the inputs and the 2·ml-wide views
        // W0Y/WU0 the outputs for the next level (matching the layout above) -- confirm against
        // the hyhound_diag_cyclic signature.
        hyhound_diag_cyclic(tril(pcr_L.batch(l)), WL, //
                            pcr_Y.batch(l), WY, W0Y,  //
                            pcr_U.batch(l), WU, WU0, Σ);
    } else {
        // Last level: there is no Y block left to update, only L and U of this level and the final
        // diagonal block L of level l+1.
        // WL aliases WYU, so the two copies below rotate the workspace in-place.
        auto WL = WYU;
        auto WU = work_update_pcr_L.left_cols(2 * ml).batch(0);
        // undo workspace rotation
        batmat::linalg::copy(WYU.left_cols(ml), WL.left_cols(ml), with_rotate<-prev_rot>);
        batmat::linalg::copy(WYU.right_cols(ml), WL.right_cols(ml), with_rotate<+prev_rot>);
        //           S(-1)    S(0)
        //  WL =  [ Υ˃(0)  | Υ˂(0)  ]
        //  WYU = [ Υ˃(+1) | Υ˂(-1) ]
        // rotate element k±2^l to position k
        // (the two halves are swapped while copying, and the same rotation +rot is used for both;
        // NOTE(review): presumably rotations by +2^l and -2^l coincide at the last level -- confirm)
        batmat::linalg::copy(WL.left_cols(ml), WU.right_cols(ml), with_rotate<rot>);
        batmat::linalg::copy(WL.right_cols(ml), WU.left_cols(ml), with_rotate<rot>);
        // [ L̃(k;l) |       0       ]   [ L(k;l) | Υ˃(k;l)      Υ˂(k;l)     ]
        // [ Ũ(k;l) | Υ˂(k-2^l;l+1) ] = [ U(k;l) | Υ˂(k-2^l;l)  Υ˃(k+2^l;l) ] Q̆(k;l)
        hyhound_diag_2(tril(pcr_L.batch(l)), WL, pcr_U.batch(l), WU, Σ);
        // WU now holds the update matrices for the final diagonal block; rotate them and the
        // matching Σ by the same amount before applying them.
        batmat::linalg::copy(WU, WU, with_rotate<rot>); // undo rotation
        batmat::linalg::copy(Σ, Σ, with_rotate<+rot>);
        // Final diagonal block
        // [ L̃(k;l+1) |   0   ] = [ L(k;l+1) | Υ˃(k;l+1)  Υ˂(k;l+1) ] Q̆(k;l+1)
        hyhound_diag(tril(pcr_L.batch(l + 1)), WU, Σ);
    }
}