1#include <benchmark/benchmark.h>
11#include <batmat/dtypes.hpp>
12#include <batmat/lut.hpp>
13#include <batmat/openmp.h>
14#include <guanaqo/demangled-typename.hpp>
15#include <guanaqo/pcm/counters.hpp>
16#include <guanaqo/perfetto/trace.hpp>
17#include <guanaqo/string-util.hpp>
18#include <guanaqo/stringify.h>
19#include <batmat-version.h>
20#include <cyqlone-version.h>
31namespace fs = std::filesystem;
35#elif GUANAQO_WITH_OPENBLAS
36#include <openblas_config.h>
41#if defined(__ARM_NEON)
42constexpr auto v_native = std::is_same_v<batmat::real_t, double> ? 2 : 4;
44constexpr auto v_native = std::is_same_v<batmat::real_t, double> ? 4 : 8;
49 return benchmark::Counter{
static_cast<double>(x), benchmark::Counter::kDefaults};
52 return benchmark::Counter{
static_cast<double>(x), benchmark::Counter::kAvgIterations};
83 std::vector<int>
horizon{32, 64, 96, 128, 192, 256};
101using cyqlone::index_t;
102using cyqlone::real_t;
109 std::seed_seq seed{params.
seed, params.
seed >> 32};
110 std::mt19937 rng{seed};
111 using dist_t = std::uniform_real_distribution<real_t>;
112 auto dist = params.
p_max > 1 ? dist_t{-3, 3} : dist_t{-0.6, -0.1};
113 for (index_t r = 0; r < static_cast<index_t>(params.
masses.size()); ++r)
114 problem.ocp.b(0)(r, 0) += dist(rng);
118#if GUANAQO_WITH_TRACING
119std::map<std::string, fs::path> traces;
121void trace_run(
auto &&fun,
const auto &name) {
122 std::filesystem::path out_dir{
"traces"};
123 out_dir /= *cyqlone_commit_hash ? cyqlone_commit_hash :
"unknown";
124 std::filesystem::path out_file = out_dir / name;
125 if (
auto [_, ins] = traces.try_emplace(name, out_file); !ins)
127#if GUANAQO_WITH_PERFETTO
128#if GUANAQO_WITH_PCM_TRACING
134#if GUANAQO_WITH_PCM_TRACING
146 std::filesystem::create_directories(out_dir);
147 std::ofstream csv{out_file.replace_extension(
".csv")};
150 for (
const auto &log : logs)
153 cyqlone::write_chrome_trace(out_file.replace_extension(
".json.gz"), logs);
158 for (
const auto &[_, pth] : traces)
167template <index_t VL, qp::StorageOrder Order>
168void run_benchmark(benchmark::State &state, [[maybe_unused]]
const std::string ¶m_name,
170 qp::Settings settings,
bool warm =
false,
bool trace =
false)
try {
173 return state.SkipWithMessage(
"Number of processors must be at least 1.");
181 double time_solve = 0, time_active_set_change = 0, time_line_search = 0,
182 time_recompute_outer = 0, time_recompute_inner = 0, time_mat_vec = 0;
185 const auto nx = problem.ocp.dim.nx, nu = problem.ocp.dim.nu;
186 qpalm.warm_start_solution();
187 auto x = qpalm.get_solution();
188 std::copy_n(x.data() + nu, nx, problem.ocp.b(0).data);
189 ocp.update(problem.ocp);
190 update_cyqpalm_backend(*qpalm.backend, ocp);
192 for (
auto _ : state) {
194 state.SetIterationTime(
seconds(qpalm.stats->timings.total.wall_time).count());
195 time_solve +=
seconds(qpalm.stats->timings.solve.wall_time).count();
196 time_active_set_change +=
seconds(qpalm.stats->timings.active_set_change.wall_time).count();
197 time_line_search +=
seconds(qpalm.stats->timings.line_search.wall_time).count();
198 time_recompute_outer +=
seconds(qpalm.stats->timings.recompute_outer.wall_time).count();
199 time_recompute_inner +=
seconds(qpalm.stats->timings.recompute_inner.wall_time).count();
200 time_mat_vec +=
seconds(qpalm.stats->timings.mat_vec_A.wall_time).count();
201 time_mat_vec +=
seconds(qpalm.stats->timings.mat_vec_AT.wall_time).count();
202 time_mat_vec +=
seconds(qpalm.stats->timings.mat_vec_M.wall_time).count();
203 time_mat_vec +=
seconds(qpalm.stats->timings.mat_vec_MT.wall_time).count();
204 time_mat_vec +=
seconds(qpalm.stats->timings.mat_vec_Q.wall_time).count();
208 auto sol = ocp.reconstruct_solution(problem.ocp, qpalm.get_solution(),
209 qpalm.get_inequality_multipliers(),
210 qpalm.get_equality_multipliers());
211 auto kkt_error = problem.ocp.compute_kkt_error(sol);
213 state.counters[
"status"] =
counter(status);
215 state.counters[
"iter"] =
counter(qpalm.stats->inner_iter);
216 state.counters[
"outer_iter"] =
counter(qpalm.stats->outer_iter);
217 state.counters[
"num_factor"] =
counter(qpalm.stats_backend->num_factor);
218 state.counters[
"num_upd"] =
counter(qpalm.stats_backend->num_updates);
219 state.counters[
"rank_upd"] =
counter(qpalm.stats_backend->rank_updates);
220 state.counters[
"t_solve"] =
counter_avg(time_solve);
221 state.counters[
"t_active_set_change"] =
counter_avg(time_active_set_change);
222 state.counters[
"t_line_search"] =
counter_avg(time_line_search);
223 state.counters[
"t_recompute_outer"] =
counter_avg(time_recompute_outer);
224 state.counters[
"t_recompute_inner"] =
counter_avg(time_recompute_inner);
225 state.counters[
"t_mat_vec"] =
counter_avg(time_mat_vec);
226 state.counters[
"res_dual"] =
counter(kkt_error.stationarity);
227 state.counters[
"res_eq"] =
counter(kkt_error.equality_residual);
228 state.counters[
"res_ineq"] =
counter(kkt_error.inequality_residual);
229 state.counters[
"compl"] =
counter(kkt_error.complementarity);
230}
catch (
const std::exception &e) {
231 state.SkipWithError(e.what());
242void run_benchmark_hpipm(benchmark::State &state,
const SpringMassParams ¶ms,
245 auto qp_data = build_hpipm_qp(problem.ocp);
248 solve_hpipm(*qp_data, *solver_data);
249 std::optional<cyqlone::LinearOCPStorage::Solution> prev_sol;
251 prev_sol = get_solution_hpipm(*solver_data);
252 update_x0_hpipm(problem.ocp, *qp_data, prev_sol->solution);
254 std::ranges::fill(prev_sol->solution, real_t{0});
256 shift_solution_hpipm(*qp_data, prev_sol->solution);
258 for (
auto _ : state) {
260 warm_start_hpipm(*solver_data, prev_sol->solution);
261 auto time_solve = solve_hpipm(*qp_data, *solver_data);
262 state.SetIterationTime(
seconds(time_solve).count());
264 auto sol = get_solution_hpipm(*solver_data);
265 auto kkt_error = problem.ocp.compute_kkt_error(sol);
266 auto stats = get_stats_hpipm(*solver_data);
267 state.counters[
"status"] =
counter(stats.status);
268 state.counters[
"success"] =
counter(stats.status == 0);
269 state.counters[
"iter"] =
counter(stats.iter);
270 state.counters[
"max_res_stat"] =
counter(stats.max_res_stat);
271 state.counters[
"max_res_eq"] =
counter(stats.max_res_eq);
272 state.counters[
"max_res_ineq"] =
counter(stats.max_res_ineq);
273 state.counters[
"max_res_comp"] =
counter(stats.max_res_comp);
274 state.counters[
"res_dual"] =
counter(kkt_error.stationarity);
275 state.counters[
"res_eq"] =
counter(kkt_error.equality_residual);
276 state.counters[
"res_ineq"] =
counter(kkt_error.inequality_residual);
277 state.counters[
"compl"] =
counter(kkt_error.complementarity);
288 for (
auto M : opts.
masses)
294 .name = format(
"spring-mass(M={},N={},seed={}) wb2008", M, N, seed),
300 .name = format(
"spring-mass(M={},N={},seed={}) wb2008w", M, N, seed),
306 .name = format(
"spring-mass(M={},N={},seed={}) d2012", M, N, seed),
312 .name = format(
"spring-mass(M={},N={},seed={}) act", M, N, seed),
316 default:
throw std::runtime_error(
"Unknown problem type");
326 if (fs::is_directory(filename))
327 filename /= params.name +
".mat";
330 std::cout <<
"Exported problem " << params.name <<
" to " << filename << std::endl;
342template <index_t VL, qp::StorageOrder O>
344 const auto cyqlone_solver = [trace{opts.
trace}](std::string_view name,
347 const auto param_name =
348 std::format(
"cyqlone(p={},v={},{},{})", backend.processors, VL,
order(O), name);
358 .max_update_count = 20,
369 .dual_tolerance = 1e-8,
370 .initial_penalty_y = 20,
382 co_yield cyqlone_solver(
"zero", backend, settings);
384 co_yield cyqlone_solver(
"zero,upd=0", backend_no_upd, settings);
388 co_yield cyqlone_solver(
"shift", backend, settings_warm);
390 co_yield cyqlone_solver(
"shift,upd=0", backend_no_upd, settings_warm);
394 co_yield cyqlone_solver(
"copy", backend, settings_warm);
396 co_yield cyqlone_solver(
"copy,upd=0", backend_no_upd, settings_warm);
405 co_yield {
"hpipm(zero)", [=](benchmark::State &state,
const SpringMassParams ¶ms) {
409 co_yield {
"hpipm(shift)", [=](benchmark::State &state,
const SpringMassParams ¶ms) {
413 co_yield {
"hpipm(copy)", [=](benchmark::State &state,
const SpringMassParams ¶ms) {
417 throw std::invalid_argument(
"HPIPM support not enabled in this build");
422using std::ranges::elements_of;
424template <qp::StorageOrder Order>
427 static constexpr auto to_string = [](
auto vl) {
return std::to_string(vl); };
432 auto dispatch = std::ranges::find(lut,
v, &
decltype(lut)::value_type::first);
433 if (dispatch == lut.end())
434 throw std::invalid_argument(
"Unsupported vector length. Supported lengths: " +
436 co_yield elements_of(dispatch->second(opts));
449 size_t max_problem_name_len = 0;
450 size_t max_solver_name_len = 0;
452 max_problem_name_len = std::max(max_problem_name_len, problem.name.size());
454 max_solver_name_len = std::max(max_solver_name_len, solver.name.size());
455 benchmark::RegisterBenchmark(std::format(
"{}@{}", problem.name, solver.name),
456 [params = problem.params, run = std::move(solver.run)](
457 benchmark::State &state) { run(state, params); })
458 ->MeasureProcessCPUTime()
460 ->Unit(benchmark::kMillisecond)
461 ->ComputeStatistics(
"max", [](
auto &
v) {
return *std::ranges::max_element(
v); })
462 ->ComputeStatistics(
"min", [](
auto &
v) {
return *std::ranges::min_element(
v); })
466 return std::make_pair(max_problem_name_len, max_solver_name_len);
470 size_t solver_name_width,
475 if (std::getenv(
"NO_COLOR"))
477 else if (std::getenv(
"CLICOLOR_FORCE"))
480 opts.
use_color = isatty(fileno(stdout)) == 1;
481#if BATMAT_WITH_OPENMP
484 app.usage(std::string(program) +
" [options] -- [benchmark options]");
485 app.footer(std::format(
"Benchmark options are passed to the Google Benchmark framework. "
486 "Use {} -- --help or see "
487 "https://google.github.io/benchmark/user_guide.html for details.",
490 app.add_flag(
"--cold,!--no-cold", opts.
cold,
"Benchmark with cold starting");
491 app.add_flag(
"--warm-shift,!--no-warm-shift", opts.
warm_shift,
492 "Benchmark with warm starting (shift)");
493 app.add_flag(
"--warm-copy,!--no-warm-copy", opts.
warm_copy,
494 "Benchmark with warm starting (copy)");
495 app.add_option(
"--horizon,-N", opts.
horizon,
"Specify a horizon length to benchmark");
496 app.add_option(
"--masses,-M", opts.
masses,
"Specify the number of masses to benchmark");
498 "Number of random problem instances to generate for each (M,N) pair");
499 app.add_option(
"--seed,-s", opts.
seed,
"Random seed for problem instance generation");
500 app.add_flag(
"--no-updates", opts.
no_updates,
"Compare to the Cyqlone backend without updates");
501 app.add_option(
"--parallelism,-p", opts.
parallelism,
502 "The number of threads to use in the Cyqlone backend");
504 "The vector length to use in the Cyqlone backend");
505 app.add_flag(
"--rm,!--no-rm", opts.
rm,
"Use row-major (default) storage");
506 app.add_flag(
"--cm,!--no-cm", opts.
cm,
"Use column-major storage");
507 app.add_flag(
"--pcr,!--pcg", opts.
pcr,
"Use parallel cyclic reduction in the Cyqlone backend");
508 app.add_flag(
"--hpipm,!--no-hpipm", opts.
hpipm,
"Use HPIPM solver for comparison");
509 app.add_option(
"--problem", opts.
problem_type,
"Problem type to benchmark")
512 "Maximum update rank fraction when using PCR");
514 "Maximum update rank fraction when using CR");
516 "Changing constraints factor for the Cyqlone backend");
518 "Parallel CR solve threshold for the Cyqlone backend");
520 "Parallel PCR factorization threshold for the Cyqlone backend");
522 "Export a single problem instance to a .mat file");
523 app.add_flag(
"--trace,!--no-trace", opts.
trace,
"Enable tracing of CyQPALM solver runs");
524 app.add_flag(
"--custom-reporter,!--no-custom-reporter", opts.
custom_reporter,
525 "Use custom benchmark reporter");
526 app.add_flag(
"--print-extra,!--no-print-extra", opts.
print_extra,
527 "Print additional counters in the benchmark report");
528 app.add_flag(
"--color,!--no-color", opts.
use_color,
"Enable/disable colored output");
532 if (!bm_args.empty() && bm_args.front() ==
"--")
533 bm_args.erase(bm_args.begin());
534 std::vector<char *> bm_argv(bm_args.size() + 2);
535 std::ranges::transform(bm_args, bm_argv.begin() + 1, [](
auto &s) { return s.data(); });
536 bm_argv[0] = program;
537 auto bm_argc =
static_cast<int>(bm_args.size()) + 1;
538 benchmark::Initialize(&bm_argc, bm_argv.data());
539 if (benchmark::ReportUnrecognizedArguments(bm_argc, bm_argv.data()))
545#if BATMAT_WITH_OPENMP
546 auto places = std::getenv(
"OMP_PLACES");
547 benchmark::AddCustomContext(
"OMP_NUM_THREADS", std::to_string(omp_get_max_threads()));
548 benchmark::AddCustomContext(
"OMP_PLACES", places ? places :
"");
550 benchmark::AddCustomContext(
"batmat_build_time", batmat_build_time);
551 benchmark::AddCustomContext(
"batmat_commit_hash", batmat_commit_hash);
552 benchmark::AddCustomContext(
"cyqlone_build_time", cyqlone_build_time);
553 benchmark::AddCustomContext(
"cyqlone_commit_hash", cyqlone_commit_hash);
558 mkl_get_version(&Version);
559 benchmark::AddCustomContext(
"blas_library",
"MKL");
560 benchmark::AddCustomContext(
"blas_version",
561 std::format(
"{}.{}.{} ({}) for {}: {}", Version.MajorVersion,
562 Version.MinorVersion, Version.UpdateVersion,
563 Version.Build, Version.Platform, Version.Processor));
564#elif GUANAQO_WITH_OPENBLAS
565 benchmark::AddCustomContext(
"blas_library",
"OpenBLAS");
566 benchmark::AddCustomContext(
"blas_version", OPENBLAS_VERSION);
568#if defined(__INTEL_LLVM_COMPILER)
569 benchmark::AddCustomContext(
"compiler",
"intel-llvm");
570 benchmark::AddCustomContext(
"compiler_version", __VERSION__);
571#elif defined(__clang__)
572 benchmark::AddCustomContext(
"compiler",
"clang");
573 benchmark::AddCustomContext(
"compiler_version", __VERSION__);
574#elif defined(__GNUC__)
575 benchmark::AddCustomContext(
"compiler",
"gcc");
576 benchmark::AddCustomContext(
"compiler_version", __VERSION__);
577#elif defined(_MSC_VER)
578 benchmark::AddCustomContext(
"compiler",
"msvc");
579 benchmark::AddCustomContext(
"compiler_version",
GUANAQO_STRINGIFY(_MSC_FULL_VER));
581#if defined(__AVX512F__)
582 benchmark::AddCustomContext(
"arch",
"avx512f");
583#elif defined(__AVX2__)
584 benchmark::AddCustomContext(
"arch",
"avx2");
585#elif defined(__AVX__)
586 benchmark::AddCustomContext(
"arch",
"avx");
587#elif defined(__SSE3__)
588 benchmark::AddCustomContext(
"arch",
"sse3");
589#elif defined(__ARM_NEON)
590 benchmark::AddCustomContext(
"arch",
"neon");
592#if BATMAT_WITH_GSI_HPC_SIMD
593 benchmark::AddCustomContext(
"simd_library",
"GSI-HPC/simd");
595 benchmark::AddCustomContext(
"simd_library",
"libstdc++ <experimental/simd>");
599int main(
int argc,
char **argv)
try {
600#if GUANAQO_WITH_PERFETTO
603 char *
const program = argv[0];
604 CLI::App app{
"CyQPALM and Cyqlone Spring-Mass Benchmarks"};
608 app.parse(argc, argv);
609 }
catch (
const CLI::ParseError &e) {
621 benchmark::RunSpecifiedBenchmarks(reporter.get());
623 benchmark::Shutdown();
624}
catch (
const std::exception &e) {
625 std::cerr <<
"Error: " << e.what() << std::endl;
std::generator< Solver > get_hpipm_solvers(const Options &opts)
void export_problem(const Options &opts)
std::generator< Solver > get_cyqlone_solvers_vl(const Options &opts)
std::generator< Solver > get_solvers(const Options &opts)
int initialize_google_benchmark(char *program, auto bm_args)
std::unique_ptr< benchmark::BenchmarkReporter > make_custom_reporter(size_t problem_name_width, size_t solver_name_width, bool print_extra, bool with_color)
auto register_benchmarks(const Options &opts)
double changing_constr_factor
void register_options(const char *program, CLI::App &app, Options &opts)
double cr_max_update_fraction
std::vector< int > masses
std::vector< int > vector_length
double pcr_max_update_fraction
std::vector< int > parallelism
qp::problems::SpringMassProblem create_problem(const SpringMassParams ¶ms)
void print_traces(std::ostream &)
void run_benchmark(benchmark::State &state, const std::string ¶m_name, const SpringMassParams ¶ms, qp::CyQPALMBackendSettings backend_settings, qp::Settings settings, bool warm=false, bool trace=false)
int parallel_solve_cr_threshold
int parallel_factor_pcr_threshold
std::generator< Problem > get_spring_mass_params(const Options &opts)
std::generator< Solver > get_cyqlone_solvers(const Options &opts)
void trace_run(auto &&, const auto &)
std::vector< int > horizon
std::chrono::duration< double > seconds
std::string export_problem
const std::map< std::string, ProblemType > problem_type_map
std::string_view order(qp::StorageOrder o)
std::string demangled_typename(const std::type_info &t)
consteval auto make_lut(F f)
#define GUANAQO_STRINGIFY(s)
std::string join(std::ranges::input_range auto strings, join_opt opt={})
SolveMethod solve_method
Algorithm to use for solving the final reduced block tridiagonal system.
@ PCR
Parallel Cyclic Reduction (direct).
WarmStartingStrategy strategy
cyqlone::TricyqleParams< real_t > tricyqle_params
unique_CyQPALMBackend< VL, DefaultOrder > make_cyqpalm_backend(const CyqloneStorage< real_t > &ocp, CyqloneData data, const CyQPALMBackendSettings &settings)
SolverStatus
Exit status of a numerical solver.
@ Converged
Converged and reached given tolerance.
MatFilePtr create_mat(const std::filesystem::path &filename)
Create and open a new .mat file for writing.
void add_to_mat(mat_t *mat, const std::string &varname, float value)
Add a value to an open .mat file.
TraceLogger & get_trace_logger()
void stop_tracing(std::unique_ptr<::perfetto::TracingSession > tracing_session, const fs::path &output_path)
std::unique_ptr<::perfetto::TracingSession > start_tracing(uint32_t memory_kb=128 *1024)
void initialize_tracing()
Functions for exporting and loading matrices and OCP data to and from .mat files.
constexpr std::array vl_for_real_t
constexpr index_t vl_at_most
std::integral_constant< index_t, I > index_constant
SpringMassProblem spring_mass(SpringMassParams p)
std::function< void(benchmark::State &, SpringMassParams)> run
static SpringMassParams wang_boyd_2008_width(index_t n_masses, index_t N_horiz=30, uint64_t seed=0, double steady_state_spring_length=0.1)
static SpringMassParams active_state_constr(index_t n_masses=18, index_t N_horiz=256, uint64_t seed=0)
static SpringMassParams domahidi_2012(index_t n_masses, index_t N_horiz, uint64_t seed=0)
static SpringMassParams wang_boyd_2008(index_t n_masses, index_t N_horiz=30, uint64_t seed=0)
static CyqloneStorage build(const LinearOCPStorage &ocp, index_t ny_0=-1)
real_t initial_inner_tolerance
std::vector< real_t > masses
uint64_t seed
random seed for actuator placement
real_t p_max
maximum displacement of each mass
static std::ostream & write_column_headings(std::ostream &os)
std::span< const Log > get_logs() const