32#ifndef SPARSEMATCUDA_HPP
33#define SPARSEMATCUDA_HPP
37#include <cusolverSp.h>
43 cusolverSpHandle_t handle =
nullptr;
44 cudaStream_t stream =
nullptr;
45 cusparseMatDescr_t descr =
nullptr;
47 void* d_val_idx =
nullptr;
48 void* d_col_idx =
nullptr;
49 void* d_row_ptr =
nullptr;
52 cusolverSpCreate(&handle);
53 cudaStreamCreate(&stream);
54 cusolverSpSetStream(handle, stream);
55 cusparseCreateMatDescr(&descr);
56 cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
57 cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
58 cudaMalloc(&d_row_ptr,
sizeof(
int) * (this->n_rows + 1));
61 void release()
const {
62 if(handle) cusolverSpDestroy(handle);
63 if(stream) cudaStreamDestroy(stream);
64 if(descr) cusparseDestroyMatDescr(descr);
65 if(d_row_ptr) cudaFree(d_row_ptr);
68 template<sp_d ET>
void device_alloc(csr_form<ET, int>&& csr_mat) {
69 const size_t n_val =
sizeof(ET) * csr_mat.n_elem;
70 const size_t n_col =
sizeof(int) * csr_mat.n_elem;
72 cudaMalloc(&d_val_idx, n_val);
73 cudaMalloc(&d_col_idx, n_col);
75 cudaMemcpyAsync(d_val_idx, csr_mat.val_mem(), n_val, cudaMemcpyHostToDevice, stream);
76 cudaMemcpyAsync(d_col_idx, csr_mat.col_mem(), n_col, cudaMemcpyHostToDevice, stream);
77 cudaMemcpyAsync(d_row_ptr, csr_mat.row_mem(),
sizeof(
int) * (csr_mat.n_rows + 1llu), cudaMemcpyHostToDevice, stream);
80 void device_dealloc()
const {
81 if(d_val_idx) cudaFree(d_val_idx);
82 if(d_col_idx) cudaFree(d_col_idx);
88 int direct_solve(Mat<T>&,
const Mat<T>&)
override;
91 SparseMatCUDA(
const uword in_row,
const uword in_col,
const uword in_elem = 0)
92 :
SparseMat<
T>(in_row, in_col, in_elem) { acquire(); }
106 unique_ptr<MetaMat<T>>
make_copy()
override {
return std::make_unique<SparseMatCUDA>(*
this); }
110 if(!this->factored) {
114 std::is_same_v<T, float> ||
Precision::MIXED == this->setting.precision ? device_alloc(csr_form<float, int>(this->triplet_mat)) : device_alloc(
csr_form<double, int>(this->triplet_mat));
116 this->factored =
true;
119 const size_t n_rhs = (std::is_same_v<T, float> ||
Precision::MIXED == this->setting.precision ?
sizeof(float) : sizeof(double)) * B.n_elem;
124 cudaMalloc(&d_b, n_rhs);
125 cudaMalloc(&d_x, n_rhs);
130 if constexpr(std::is_same_v<T, float>) {
131 cudaMemcpyAsync(d_b, B.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
133 for(
auto I = 0llu; I < B.n_elem; I += B.n_rows) code += cusolverSpScsrlsvqr(handle,
int(this->n_rows),
int(this->triplet_mat.n_elem), descr, (
float*)d_val_idx, (
int*)d_row_ptr, (
int*)d_col_idx, (
float*)d_b + I,
float(this->setting.tolerance), 3, (
float*)d_x + I, &singularity);
135 X.set_size(arma::size(B));
137 cudaMemcpyAsync(X.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
139 cudaDeviceSynchronize();
142 cudaMemcpyAsync(d_b, B.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
144 for(
auto I = 0llu; I < B.n_elem; I += B.n_rows) code += cusolverSpDcsrlsvqr(handle,
int(this->n_rows),
int(this->triplet_mat.n_elem), descr, (
double*)d_val_idx, (
int*)d_row_ptr, (
int*)d_col_idx, (
double*)d_b + I, this->setting.tolerance, 3, (
double*)d_x + I, &singularity);
146 X.set_size(arma::size(B));
148 cudaMemcpyAsync(X.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
150 cudaDeviceSynchronize();
153 X = arma::zeros(arma::size(B));
155 mat full_residual = B;
157 auto multiplier =
norm(full_residual);
160 while(counter++ < this->setting.iterative_refinement) {
161 if(multiplier < this->setting.tolerance)
break;
163 auto residual = conv_to<fmat>::from(full_residual / multiplier);
165 cudaMemcpyAsync(d_b, residual.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
168 for(
auto I = 0llu; I < B.n_elem; I += B.n_rows) code += cusolverSpScsrlsvqr(handle,
int(this->n_rows),
int(this->triplet_mat.n_elem), descr, (
float*)d_val_idx, (
int*)d_row_ptr, (
int*)d_col_idx, (
float*)d_b + I,
float(this->setting.tolerance), 3, (
float*)d_x + I, &singularity);
171 cudaMemcpyAsync(residual.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
173 cudaDeviceSynchronize();
175 const mat incre = multiplier * conv_to<mat>::from(residual);
179 suanpan_debug(
"Mixed precision algorithm multiplier: {:.5E}.\n", multiplier = arma::norm(full_residual -= this->
operator*(incre)));
183 if(d_b) cudaFree(d_b);
184 if(d_x) cudaFree(d_x);
A SparseMatCUDA class that holds matrices.
A SparseMat class that holds matrices.
Definition SparseMat.hpp:34
unique_ptr< Material > make_copy(const shared_ptr< Material > &)
Definition Material.cpp:370
double norm(const vec &)
Definition tensor.cpp:370
#define suanpan_debug(...)
Definition suanPan.h:307
constexpr auto SUANPAN_SUCCESS
Definition suanPan.h:172
constexpr auto SUANPAN_FAIL
Definition suanPan.h:173