cyqlone develop
Fast, parallel and vectorized solver for linear systems with optimal control structure.
Loading...
Searching...
No Matches
solve-block-tridiagonal.cpp
Example demonstrating how to solve a block tridiagonal system in parallel

The system has the form

\[\begin{pmatrix} M_0 & \tp{K_0} & 0 & 0 & \cdots & K_{N-1} \\ K_0 & M_1 & \tp{K_1} & 0 & \cdots & 0 \\ 0 & K_1 & M_2 & \tp{K_2} & \cdots & 0 \\ 0 & 0 & K_2 & M_3 & \cdots & 0 \\ \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \\ \tp{K_{N-1}} & 0 & 0 & 0 & \cdots & M_{N-1} \end{pmatrix} \begin{pmatrix} x^0 \\ x^1 \\ x^2 \\ x^3 \\ \vdots \\ x^{N-1} \end{pmatrix} = \begin{pmatrix} b^0 \\ b^1 \\ b^2 \\ b^3 \\ \vdots \\ b^{N-1} \end{pmatrix} \]

We use cyqlone::TricyqleSolver to solve the system in parallel. Because this solver vectorizes using batched linear algebra operations and partitions the matrices and vectors across multiple threads, each thread needs to pack its local portion of the system into batches of matrices and vectors. In this example, we prepare the whole system up front in the batches of matrices M and K; in practice, it is often more efficient to prepare the system in parallel, writing the local blocks directly into the solver's workspace.

The solution produced by this example is stored in a .mat file, which can be loaded and verified using the Python script solve-block-tridiagonal.py.

#include <cyqlone/cyqlone.hpp>
#include <cyqlone/packing.hpp>
#include <guanaqo/print.hpp>
#include <algorithm>
#include <filesystem>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>

#if CYQLONE_WITH_MATIO
#include <cyqlone/matio.hpp>
#endif
14
using cyqlone::index_t; // Integer type for indices
using cyqlone::real_t;  // Floating-point type for scalars
// Array of equally sized matrices (one per block of the system).
using matrices = batmat::matrix::Matrix<real_t, index_t>;

// Vector length to use. Together with the number of threads p, it fixes the
// system size: num_blocks = p * v (each thread handles batches of v blocks).
constexpr index_t v = 4;
// Block tridiagonal solver to use.
using Solver = cyqlone::TricyqleSolver<v, real_t>;

// One block tridiagonal system, stored as arrays of blocks.
struct TridiagSystem {
    matrices M, K, b; // Diagonal blocks M, subdiagonal blocks K and right-hand side b
};

// Generate a random SPD block tridiagonal system with given block size and number of blocks.
// If circular is true, the corner block K(num_blocks-1) coupling the first and last block
// rows is nonzero as well.
TridiagSystem init_random_system(index_t block_size, index_t num_blocks, bool circular = false);
28
29// Example of solving a block tridiagonal system using the TricyqleSolver.
30int main(int argc, char *argv[]) try {
31 // Problem dimensions and parameters
32 index_t p = 8; // Number of processors/threads
33 index_t block_size = 5; // Size of each block of the block tridiagonal system
34 bool circular = false; // Is K(num_blocks-1) nonzero?
35 // Parse arguments
36 if (argc > 1)
37 p = std::stoi(argv[1]);
38 if (argc > 2)
39 block_size = std::stoi(argv[2]);
40 if (argc > 3)
41 circular = std::stoi(argv[3]) != 0;
42 // The number of blocks in the block tridiagonal system always equals p * v. If your system is
43 // larger, first reduce it to this size (in parallel). If it is smaller, manually add padding,
44 // or reduce p and/or v.
46 const index_t num_blocks = p * v;
47
48 // Diagonal blocks M, subdiagonal blocks K and right-hand side b of a block tridiagonal system
49 auto [M, K, b] = init_random_system(block_size, num_blocks, circular);
50
51 // Create a solver for the block tridiagonal system
52 Solver solver{.block_size = block_size, .circular = circular, .p = p};
53 solver.params.solve_method = cyqlone::SolveMethod::PCR; // Use the PCR solver instead of PCG.
54
55 // Create a compact copy of the right-hand side and allocate storage for the solution
56 Solver::matrix b_solve{{.depth = num_blocks, .rows = block_size, .cols = 1}};
57 matrices x{{.depth = num_blocks, .rows = block_size, .cols = 1}};
58
59 // Call the solver in parallel, passing a lambda that is executed by each thread.
60 auto pctx = solver.create_parallel_context();
61 pctx->run([&](Solver::Context &ctx) {
62 // Copy the data to the solver's internal data structures
64 auto pack_M = [&](index_t i, auto Ms) { return pack(M.middle_batches(i, v, p), Ms); };
65 auto pack_K = [&](index_t i, auto Ks) { return pack(K.middle_batches(i, v, p), Ks); };
66 auto pack_b = [&](index_t i, auto bs) { return pack(b.middle_batches(i, v, p), bs); };
67 solver.init_diag(ctx, pack_M);
68 solver.init_subdiag(ctx, pack_K);
69 solver.init_rhs(ctx, b_solve, pack_b);
70 // Perform the factorization and the forward solve (fused), followed by the backward solve
71 solver.factor_solve(ctx, b_solve);
72 solver.solve_reverse(ctx, b_solve);
73 // Copy the solution back to the original layout
75 auto unpack_x = [&](index_t i, auto xs) { return unpack(xs, x.middle_batches(i, v, p)); };
76 solver.get_solution(ctx, b_solve, unpack_x);
77 }); // blocks until all threads have joined
78
79#if CYQLONE_WITH_MATIO
80 // Export the original system and the solution as a .mat file
81 std::filesystem::path filename = "block_tridiagonal_system.mat";
82 auto matfile = cyqlone::create_mat(filename);
83 cyqlone::add_to_mat(matfile.get(), "M", M);
84 cyqlone::add_to_mat(matfile.get(), "K", K);
85 cyqlone::add_to_mat(matfile.get(), "b", b);
86 cyqlone::add_to_mat(matfile.get(), "x", x);
87 std::cout << "Saved system and solution to " << filename << "\n";
88#else
89 // Print the original system and the solution in a format that's easy to check using Python
90 std::cout << "import numpy as np\n";
91 std::cout << "v = " << v << "\n";
92 std::cout << "n = " << block_size << "\n";
93 std::cout << "N = " << num_blocks << "\n";
94 std::cout << "M = np.array([\n";
95 for (index_t i = 0; i < num_blocks; ++i)
96 guanaqo::print_python(std::cout, M(i), ",\n", false);
97 std::cout << "])\nK = np.array([\n";
98 for (index_t i = 0; i < num_blocks; ++i)
99 guanaqo::print_python(std::cout, K(i), ",\n", false);
100 std::cout << "])\nb = np.array([\n";
101 for (index_t i = 0; i < num_blocks; ++i)
102 guanaqo::print_python(std::cout, b(i), ",\n");
103 std::cout << "])\nx = np.array([\n";
104 for (index_t i = 0; i < num_blocks; ++i)
105 guanaqo::print_python(std::cout, x(i), ",\n");
106 std::cout << "])\n";
107#endif
108} catch (std::exception &e) {
109 std::cerr << "Error: " << e.what() << "\n";
110 return 1;
111}
112
113TridiagSystem init_random_system(index_t block_size, index_t num_blocks, bool circular) {
115 std::mt19937 rng(12345);
116 std::normal_distribution<real_t> dist(0.0, 1.0);
117 // Generate a random lower block bidiagonal matrix with diagonal blocks A(i) and subdiagonal
118 // blocks B(i).
119 matrices A{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
120 matrices B{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
121 std::ranges::generate(A, [&] { return dist(rng); });
122 std::ranges::generate(B, [&] { return dist(rng); });
123 // Allocate storage for a block tridiagonal system
124 matrices M{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
125 matrices K{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
126 matrices b{{.depth = num_blocks, .rows = block_size, .cols = 1}, uninitialized};
127 std::ranges::generate(b, [&] { return dist(rng); });
128 if (!circular)
129 B(num_blocks - 1).set_constant(0);
130 // Diagonal blocks M(i) = A(i) A(i)ᵀ + B(i-1) B(i-1)ᵀ and subdiagonal blocks K(i) = B(i) A(i)ᵀ
131 for (index_t i = 0; i < num_blocks; ++i) {
132 index_t i_prev = (i - 1 + num_blocks) % num_blocks;
133 guanaqo::blas::xsyrk_LN<real_t>(1, A(i), 0, M(i)); // M(i) = A(i) A(i)ᵀ
134 guanaqo::blas::xsyrk_LN<real_t>(1, B(i_prev), 1, M(i)); // M(i) += B(i-1) B(i-1)ᵀ
135 guanaqo::blas::xgemm_NT<real_t>(1, B(i), A(i), 0, K(i)); // K(i) = B(i) A(i)ᵀ
136 M(i).add_to_diagonal(1e-4); // Ensure positive definiteness
137 }
138 return TridiagSystem{.M = std::move(M), .K = std::move(K), .b = std::move(b)};
139}
#define BATMAT_ASSERT(x)
The main header for the Cyqlone and Tricyqle linear solvers.
int main()
std::ostream & print_python(std::ostream &os, std::span< T, E > x, std::string_view end="\n", bool squeeze=true)
void xsyrk_LN(T alpha, std::type_identity_t< MatrixView< const T, I > > A, T beta, MatrixView< T, I > C)
void xgemm_NT(T alpha, std::type_identity_t< MatrixView< const T, I, UnitStride< I >, O > > A, std::type_identity_t< MatrixView< const T, I, UnitStride< I >, O > > B, T beta, MatrixView< T, I, UnitStride< I >, O > C)
std::ptrdiff_t index_t
@ PCR
Parallel Cyclic Reduction (direct).
void unpack(VA &&A, VB &&B)
Copy a compact batch of matrices A to multiple scalar matrices B.
Definition packing.hpp:147
void pack(VA &&A, VB &&B)
Copy multiple scalar matrices A to a compact batch of matrices B.
Definition packing.hpp:157
struct batmat::matrix::uninitialized_t uninitialized
MatFilePtr create_mat(const std::filesystem::path &filename)
Create and open a new .mat file for writing.
Definition matio.cpp:97
void add_to_mat(mat_t *mat, const std::string &varname, float value)
Add a value to an open .mat file.
Definition matio.cpp:122
Functions for exporting and loading matrices and OCP data to and from .mat files.
constexpr bool is_pow_2(index_t n)
Definition cyqlone.hpp:32
batmat::matrix::Matrix< real_t, index_t > matrices
constexpr index_t v
TridiagSystem init_random_system(index_t block_size, index_t num_blocks, bool circular=false)
std::function< void(benchmark::State &, SpringMassParams)> run
auto middle_batches(index_type b, index_type n, index_type stride=1)
void set_constant(value_type t)
Solver for block-tridiagonal systems using cyclic reduction (CR), parallel cyclic reduction (PCR),...
Definition cyqlone.hpp:66
batmat::matrix::Matrix< value_type, index_t, vl_t, index_t, O, align_t > matrix
Definition cyqlone.hpp:152