cyqlone develop
Fast, parallel and vectorized solver for linear systems with optimal control structure.
Loading...
Searching...
No Matches
solve-block-tridiagonal.cpp
Go to the documentation of this file.
1#include <cyqlone/cyqlone.hpp>
2#include <cyqlone/packing.hpp>
3#include <batmat/matrix/matrix.hpp>
4#include <guanaqo/blas/hl-blas-interface.hpp>
5#include <guanaqo/print.hpp>
6#include <algorithm>
7#include <iostream>
8#include <random>
9#include <string>
10
11#if CYQLONE_WITH_MATIO
12#include <cyqlone/matio.hpp>
13#endif
14
15using cyqlone::index_t; // Integer type for indices
16using cyqlone::real_t; // Floating-point type for scalars
18
19constexpr index_t v = 4; // Vector length to use
20using Solver = cyqlone::TricyqleSolver<v, real_t>; // Block tridiagonal solver to use
21
23 matrices M, K, b; // Diagonal blocks M, subdiagonal blocks K and right-hand side b
24};
25
26// Generate a random SPD block tridiagonal system with given block size and number of blocks.
27TridiagSystem init_random_system(index_t block_size, index_t num_blocks, bool circular = false);
28
29// Example of solving a block tridiagonal system using the TricyqleSolver.
30int main(int argc, char *argv[]) try {
31 // Problem dimensions and parameters
32 index_t p = 8; // Number of processors/threads
33 index_t block_size = 5; // Size of each block of the block tridiagonal system
34 bool circular = false; // Is K(num_blocks-1) nonzero?
35 // Parse arguments
36 if (argc > 1)
37 p = std::stoi(argv[1]);
38 if (argc > 2)
39 block_size = std::stoi(argv[2]);
40 if (argc > 3)
41 circular = std::stoi(argv[3]) != 0;
42 // The number of blocks in the block tridiagonal system always equals p * v. If your system is
43 // larger, first reduce it to this size (in parallel). If it is smaller, manually add padding,
44 // or reduce p and/or v.
46 const index_t num_blocks = p * v;
47
48 // Diagonal blocks M, subdiagonal blocks K and right-hand side b of a block tridiagonal system
49 auto [M, K, b] = init_random_system(block_size, num_blocks, circular);
50
51 // Create a solver for the block tridiagonal system
52 Solver solver{.block_size = block_size, .circular = circular, .p = p};
53 solver.params.solve_method = cyqlone::SolveMethod::PCR; // Use the PCR solver instead of PCG.
54
55 // Create a compact copy of the right-hand side and allocate storage for the solution
56 Solver::matrix b_solve{{.depth = num_blocks, .rows = block_size, .cols = 1}};
57 matrices x{{.depth = num_blocks, .rows = block_size, .cols = 1}};
58
59 // Call the solver in parallel, passing a lambda that is executed by each thread.
60 auto pctx = solver.create_parallel_context();
61 pctx->run([&](Solver::Context &ctx) {
62 // Copy the data to the solver's internal data structures
64 auto pack_M = [&](index_t i, auto Ms) { return pack(M.middle_batches(i, v, p), Ms); };
65 auto pack_K = [&](index_t i, auto Ks) { return pack(K.middle_batches(i, v, p), Ks); };
66 auto pack_b = [&](index_t i, auto bs) { return pack(b.middle_batches(i, v, p), bs); };
67 solver.init_diag(ctx, pack_M);
68 solver.init_subdiag(ctx, pack_K);
69 solver.init_rhs(ctx, b_solve, pack_b);
70 // Perform the factorization and the forward solve (fused), followed by the backward solve
71 solver.factor_solve(ctx, b_solve);
72 solver.solve_reverse(ctx, b_solve);
73 // Copy the solution back to the original layout
75 auto unpack_x = [&](index_t i, auto xs) { return unpack(xs, x.middle_batches(i, v, p)); };
76 solver.get_solution(ctx, b_solve, unpack_x);
77 }); // blocks until all threads have joined
78
79#if CYQLONE_WITH_MATIO
80 // Export the original system and the solution as a .mat file
81 std::filesystem::path filename = "block_tridiagonal_system.mat";
82 auto matfile = cyqlone::create_mat(filename);
83 cyqlone::add_to_mat(matfile.get(), "M", M);
84 cyqlone::add_to_mat(matfile.get(), "K", K);
85 cyqlone::add_to_mat(matfile.get(), "b", b);
86 cyqlone::add_to_mat(matfile.get(), "x", x);
87 std::cout << "Saved system and solution to " << filename << "\n";
88#else
89 // Print the original system and the solution in a format that's easy to check using Python
90 std::cout << "import numpy as np\n";
91 std::cout << "v = " << v << "\n";
92 std::cout << "n = " << block_size << "\n";
93 std::cout << "N = " << num_blocks << "\n";
94 std::cout << "M = np.array([\n";
95 for (index_t i = 0; i < num_blocks; ++i)
96 guanaqo::print_python(std::cout, M(i), ",\n", false);
97 std::cout << "])\nK = np.array([\n";
98 for (index_t i = 0; i < num_blocks; ++i)
99 guanaqo::print_python(std::cout, K(i), ",\n", false);
100 std::cout << "])\nb = np.array([\n";
101 for (index_t i = 0; i < num_blocks; ++i)
102 guanaqo::print_python(std::cout, b(i), ",\n");
103 std::cout << "])\nx = np.array([\n";
104 for (index_t i = 0; i < num_blocks; ++i)
105 guanaqo::print_python(std::cout, x(i), ",\n");
106 std::cout << "])\n";
107#endif
108} catch (std::exception &e) {
109 std::cerr << "Error: " << e.what() << "\n";
110 return 1;
111}
112
113TridiagSystem init_random_system(index_t block_size, index_t num_blocks, bool circular) {
115 std::mt19937 rng(12345);
116 std::normal_distribution<real_t> dist(0.0, 1.0);
117 // Generate a random lower block bidiagonal matrix with diagonal blocks A(i) and subdiagonal
118 // blocks B(i).
119 matrices A{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
120 matrices B{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
121 std::ranges::generate(A, [&] { return dist(rng); });
122 std::ranges::generate(B, [&] { return dist(rng); });
123 // Allocate storage for a block tridiagonal system
124 matrices M{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
125 matrices K{{.depth = num_blocks, .rows = block_size, .cols = block_size}, uninitialized};
126 matrices b{{.depth = num_blocks, .rows = block_size, .cols = 1}, uninitialized};
127 std::ranges::generate(b, [&] { return dist(rng); });
128 if (!circular)
129 B(num_blocks - 1).set_constant(0);
130 // Diagonal blocks M(i) = A(i) A(i)ᵀ + B(i-1) B(i-1)ᵀ and subdiagonal blocks K(i) = B(i) A(i)ᵀ
131 for (index_t i = 0; i < num_blocks; ++i) {
132 index_t i_prev = (i - 1 + num_blocks) % num_blocks;
133 guanaqo::blas::xsyrk_LN<real_t>(1, A(i), 0, M(i)); // M(i) = A(i) A(i)ᵀ
134 guanaqo::blas::xsyrk_LN<real_t>(1, B(i_prev), 1, M(i)); // M(i) += B(i-1) B(i-1)ᵀ
135 guanaqo::blas::xgemm_NT<real_t>(1, B(i), A(i), 0, K(i)); // K(i) = B(i) A(i)ᵀ
136 M(i).add_to_diagonal(1e-4); // Ensure positive definiteness
137 }
138 return TridiagSystem{.M = std::move(M), .K = std::move(K), .b = std::move(b)};
139}
#define BATMAT_ASSERT(x)
The main header for the Cyqlone and Tricyqle linear solvers.
int main()
std::ostream & print_python(std::ostream &os, std::span< T, E > x, std::string_view end="\n", bool squeeze=true)
void xsyrk_LN(T alpha, std::type_identity_t< MatrixView< const T, I > > A, T beta, MatrixView< T, I > C)
void xgemm_NT(T alpha, std::type_identity_t< MatrixView< const T, I, UnitStride< I >, O > > A, std::type_identity_t< MatrixView< const T, I, UnitStride< I >, O > > B, T beta, MatrixView< T, I, UnitStride< I >, O > C)
@ PCR
Parallel Cyclic Reduction (direct).
void unpack(VA &&A, VB &&B)
Copy a compact batch of matrices A to multiple scalar matrices B.
Definition packing.hpp:147
void pack(VA &&A, VB &&B)
Copy multiple scalar matrices A to a compact batch of matrices B.
Definition packing.hpp:157
struct batmat::matrix::uninitialized_t uninitialized
MatFilePtr create_mat(const std::filesystem::path &filename)
Create and open a new .mat file for writing.
Definition matio.cpp:97
void add_to_mat(mat_t *mat, const std::string &varname, float value)
Add a value to an open .mat file.
Definition matio.cpp:122
Functions for exporting and loading matrices and OCP data to and from .mat files.
constexpr bool is_pow_2(index_t n)
Definition cyqlone.hpp:32
batmat::matrix::Matrix< real_t, index_t > matrices
constexpr index_t v
TridiagSystem init_random_system(index_t block_size, index_t num_blocks, bool circular=false)
std::function< void(benchmark::State &, SpringMassParams)> run
auto middle_batches(index_type b, index_type n, index_type stride=1)
void set_constant(value_type t)
Solver for block-tridiagonal systems using cyclic reduction (CR), parallel cyclic reduction (PCR),...
Definition cyqlone.hpp:66
batmat::matrix::Matrix< value_type, index_t, vl_t, index_t, O, align_t > matrix
Definition cyqlone.hpp:152