cuda-samples/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *  Test three linear solvers, including Cholesky, LU and QR.
 *  The user has to prepare a sparse matrix of "matrix market format" (with
 extension .mtx).
 *  For example, the user can download matrices in Florida Sparse Matrix
 Collection.
 *  (http://www.cise.ufl.edu/research/sparse/matrices/)
 *
 *  The user needs to choose a solver by the switch -R<solver> and
 *  to provide the path of the matrix by the switch -F<file>, then
 *  the program solves
 *          A*x = b
 *  and reports relative error
 *          |b-A*x|/(|A|*|x|+|b|)
 *
 *  How does it work?
 *     The example solves A*x = b by the following steps
 *  step 1: B = A(Q,Q)
 *     Q is the ordering to minimize zero fill-in.
 *     The user can choose symrcm or symamd.
 *  step 2: solve B*z = Q*b
 *  step 3: x = inv(Q)*z
 *
 *  Above three steps can be combined by the formula
 *        (Q*A*Q')*(Q*x) = (Q*b)
 *
 *  The elapsed time is also reported so the user can compare efficiency of
 different solvers.
 *
 *  How to use
        /cuSolverSp_LinearSolver            // Default: Cholesky, symrcm &
 file=lap2D_5pt_n100.mtx
 *     ./cuSolverSp_LinearSolver -R=chol  -file=<file>   // cholesky
 factorization
 *     ./cuSolverSp_LinearSolver -R=lu -P=symrcm -file=<file>     // symrcm + LU
 with partial pivoting
 *     ./cuSolverSp_LinearSolver -R=qr -P=symamd -file=<file>     // symamd + QR
 factorization
 *
 *
 *  Remark: the absolute error on solution x is meaningless without knowing
 condition number of A.
 *     The relative error on residual should be close to machine zero,
 i.e. 1.e-15.
 */

#include <assert.h>
#include <ctype.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "cusolverSp.h"
#include "cusparse.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"

template <typename T_ELEM>
int loadMMSparseMatrix(char    *filename,
                       char     elem_type,
                       bool     csrFormat,
                       int     *m,
                       int     *n,
                       int     *nnz,
                       T_ELEM **aVal,
                       int    **aRowInd,
                       int    **aColInd,
                       int      extendSymMatrix);

void UsageSP(void)
{
    printf("<options>\n");
    printf("-h          : display this help\n");
    printf("-R=<name>   : choose a linear solver\n");
    printf("              chol (cholesky factorization), this is default\n");
    printf("              qr   (QR factorization)\n");
    printf("              lu   (LU factorization)\n");
    printf("-P=<name>    : choose a reordering\n");
    printf("              symrcm (Reverse Cuthill-McKee)\n");
    printf("              symamd (Approximate Minimum Degree)\n");
    printf("              metis  (nested dissection)\n");
    printf("-file=<filename> : filename containing a matrix in MM format\n");
    printf("-device=<device_id> : <device_id> if want to run on specific GPU\n");

    exit(0);
}

void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts)
{
    memset(&opts, 0, sizeof(opts));

    if (checkCmdLineFlag(argc, (const char **)argv, "-h")) {
        UsageSP();
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "R")) {
        char *solverType = NULL;
        getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType);

        if (solverType) {
            if ((STRCASECMP(solverType, "chol") != 0) && (STRCASECMP(solverType, "lu") != 0)
                && (STRCASECMP(solverType, "qr") != 0)) {
                printf("\nIncorrect argument passed to -R option\n");
                UsageSP();
            }
            else {
                opts.testFunc = solverType;
            }
        }
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "P")) {
        char *reorderType = NULL;
        getCmdLineArgumentString(argc, (const char **)argv, "P", &reorderType);

        if (reorderType) {
            if ((STRCASECMP(reorderType, "symrcm") != 0) && (STRCASECMP(reorderType, "symamd") != 0)
                && (STRCASECMP(reorderType, "metis") != 0)) {
                printf("\nIncorrect argument passed to -P option\n");
                UsageSP();
            }
            else {
                opts.reorder = reorderType;
            }
        }
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        char *fileName = 0;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);

        if (fileName) {
            opts.sparse_mat_filename = fileName;
        }
        else {
            printf("\nIncorrect filename passed to -file \n ");
            UsageSP();
        }
    }
}

int main(int argc, char *argv[])
{
    struct testOpts    opts;
    cusolverSpHandle_t handle         = NULL;
    cusparseHandle_t   cusparseHandle = NULL; /* used in residual evaluation */
    cudaStream_t       stream         = NULL;
    cusparseMatDescr_t descrA         = NULL;

    int rowsA = 0; /* number of rows of A */
    int colsA = 0; /* number of columns of A */
    int nnzA  = 0; /* number of nonzeros of A */
    int baseA = 0; /* base index in CSR format */

    /* CSR(A) from I/O */
    int    *h_csrRowPtrA = NULL;
    int    *h_csrColIndA = NULL;
    double *h_csrValA    = NULL;

    double *h_z  = NULL; /* z = B \ (Q*b) */
    double *h_x  = NULL; /* x = A \ b */
    double *h_b  = NULL; /* b = ones(n,1) */
    double *h_Qb = NULL; /* Q*b */
    double *h_r  = NULL; /* r = b - A*x */

    int *h_Q = NULL; /* <int> n */
                     /* reorder to reduce zero fill-in */
                     /* Q = symrcm(A) or Q = symamd(A) */
    /* B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */
    int    *h_csrRowPtrB = NULL; /* <int> n+1 */
    int    *h_csrColIndB = NULL; /* <int> nnzA */
    double *h_csrValB    = NULL; /* <double> nnzA */
    int    *h_mapBfromA  = NULL; /* <int> nnzA */

    size_t size_perm  = 0;
    void  *buffer_cpu = NULL; /* working space for permutation: B = Q*A*Q^T */

    /* device copy of A: used in residual evaluation */
    int    *d_csrRowPtrA = NULL;
    int    *d_csrColIndA = NULL;
    double *d_csrValA    = NULL;

    /* device copy of B: used in B*z = Q*b */
    int    *d_csrRowPtrB = NULL;
    int    *d_csrColIndB = NULL;
    double *d_csrValB    = NULL;

    int    *d_Q  = NULL; /* device copy of h_Q */
    double *d_z  = NULL; /* z = B \ Q*b */
    double *d_x  = NULL; /* x = A \ b */
    double *d_b  = NULL; /* a copy of h_b */
    double *d_Qb = NULL; /* a copy of h_Qb */
    double *d_r  = NULL; /* r = b - A*x */

    double    tol         = 1.e-12;
    const int reorder     = 0; /* no reordering */
    int       singularity = 0; /* -1 if A is invertible under tol. */

    /* the constants are used in residual evaluation, r = b - A*x */
    const double minus_one = -1.0;
    const double one       = 1.0;

    double b_inf  = 0.0;
    double x_inf  = 0.0;
    double r_inf  = 0.0;
    double A_inf  = 0.0;
    int    errors = 0;
    int    issym  = 0;

    double start, stop;
    double time_solve_cpu;
    double time_solve_gpu;

    parseCommandLineArguments(argc, argv, opts);

    if (NULL == opts.testFunc) {
        opts.testFunc = "chol"; /* By default running Cholesky as NO solver selected with -R
                                   option. */
    }

    findCudaDevice(argc, (const char **)argv);

    if (opts.sparse_mat_filename == NULL) {
        opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]);
        if (opts.sparse_mat_filename != NULL)
            printf("Using default input file [%s]\n", opts.sparse_mat_filename);
        else
            printf("Could not find lap2D_5pt_n100.mtx\n");
    }
    else {
        printf("Using input file [%s]\n", opts.sparse_mat_filename);
    }

    printf("step 1: read matrix market format\n");

    if (opts.sparse_mat_filename == NULL) {
        fprintf(stderr, "Error: input matrix is not provided\n");
        return EXIT_FAILURE;
    }

    if (loadMMSparseMatrix<double>(opts.sparse_mat_filename,
                                   'd',
                                   true,
                                   &rowsA,
                                   &colsA,
                                   &nnzA,
                                   &h_csrValA,
                                   &h_csrRowPtrA,
                                   &h_csrColIndA,
                                   true)) {
        exit(EXIT_FAILURE);
    }
    baseA = h_csrRowPtrA[0]; // baseA = {0,1}
    printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);

    if (rowsA != colsA) {
        fprintf(stderr, "Error: only support square matrix\n");
        return 1;
    }

    checkCudaErrors(cusolverSpCreate(&handle));
    checkCudaErrors(cusparseCreate(&cusparseHandle));

    checkCudaErrors(cudaStreamCreate(&stream));
    /* bind stream to cusparse and cusolver*/
    checkCudaErrors(cusolverSpSetStream(handle, stream));
    checkCudaErrors(cusparseSetStream(cusparseHandle, stream));

    /* configure matrix descriptor*/
    checkCudaErrors(cusparseCreateMatDescr(&descrA));
    checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    if (baseA) {
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
    }
    else {
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));
    }

    h_z  = (double *)malloc(sizeof(double) * colsA);
    h_x  = (double *)malloc(sizeof(double) * colsA);
    h_b  = (double *)malloc(sizeof(double) * rowsA);
    h_Qb = (double *)malloc(sizeof(double) * rowsA);
    h_r  = (double *)malloc(sizeof(double) * rowsA);

    h_Q          = (int *)malloc(sizeof(int) * colsA);
    h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1));
    h_csrColIndB = (int *)malloc(sizeof(int) * nnzA);
    h_csrValB    = (double *)malloc(sizeof(double) * nnzA);
    h_mapBfromA  = (int *)malloc(sizeof(int) * nnzA);

    assert(NULL != h_z);
    assert(NULL != h_x);
    assert(NULL != h_b);
    assert(NULL != h_Qb);
    assert(NULL != h_r);
    assert(NULL != h_Q);
    assert(NULL != h_csrRowPtrB);
    assert(NULL != h_csrColIndB);
    assert(NULL != h_csrValB);
    assert(NULL != h_mapBfromA);

    checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1)));
    checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrB, sizeof(int) * (rowsA + 1)));
    checkCudaErrors(cudaMalloc((void **)&d_csrColIndB, sizeof(int) * nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_csrValB, sizeof(double) * nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_Q, sizeof(int) * colsA));
    checkCudaErrors(cudaMalloc((void **)&d_z, sizeof(double) * colsA));
    checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA));
    checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_Qb, sizeof(double) * rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA));

    /* verify if A has symmetric pattern or not */
    checkCudaErrors(
        cusolverSpXcsrissymHost(handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrRowPtrA + 1, h_csrColIndA, &issym));

    if (0 == strcmp(opts.testFunc, "chol")) {
        if (!issym) {
            printf("Error: A has no symmetric pattern, please use LU or QR \n");
            exit(EXIT_FAILURE);
        }
    }

    printf("step 2: reorder the matrix A to minimize zero fill-in\n");
    printf("        if the user choose a reordering by -P=symrcm, -P=symamd or "
           "-P=metis\n");

    if (NULL != opts.reorder) {
        if (0 == strcmp(opts.reorder, "symrcm")) {
            printf("step 2.1: Q = symrcm(A) \n");
            checkCudaErrors(cusolverSpXcsrsymrcmHost(handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Q));
        }
        else if (0 == strcmp(opts.reorder, "symamd")) {
            printf("step 2.1: Q = symamd(A) \n");
            checkCudaErrors(cusolverSpXcsrsymamdHost(handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Q));
        }
        else if (0 == strcmp(opts.reorder, "metis")) {
            printf("step 2.1: Q = metis(A) \n");
            checkCudaErrors(cusolverSpXcsrmetisndHost(handle,
                                                      rowsA,
                                                      nnzA,
                                                      descrA,
                                                      h_csrRowPtrA,
                                                      h_csrColIndA,
                                                      NULL, /* default setting. */
                                                      h_Q));
        }
        else {
            fprintf(stderr, "Error: %s is unknown reordering\n", opts.reorder);
            return 1;
        }
    }
    else {
        printf("step 2.1: no reordering is chosen, Q = 0:n-1 \n");
        for (int j = 0; j < rowsA; j++) {
            h_Q[j] = j;
        }
    }

    printf("step 2.2: B = A(Q,Q) \n");

    memcpy(h_csrRowPtrB, h_csrRowPtrA, sizeof(int) * (rowsA + 1));
    memcpy(h_csrColIndB, h_csrColIndA, sizeof(int) * nnzA);

    checkCudaErrors(cusolverSpXcsrperm_bufferSizeHost(
        handle, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, h_Q, h_Q, &size_perm));

    if (buffer_cpu) {
        free(buffer_cpu);
    }
    buffer_cpu = (void *)malloc(sizeof(char) * size_perm);
    assert(NULL != buffer_cpu);

    /* h_mapBfromA = Identity */
    for (int j = 0; j < nnzA; j++) {
        h_mapBfromA[j] = j;
    }
    checkCudaErrors(cusolverSpXcsrpermHost(
        handle, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, h_Q, h_Q, h_mapBfromA, buffer_cpu));

    /* B = A( mapBfromA ) */
    for (int j = 0; j < nnzA; j++) {
        h_csrValB[j] = h_csrValA[h_mapBfromA[j]];
    }

    printf("step 3: b(j) = 1 + j/n \n");
    for (int row = 0; row < rowsA; row++) {
        h_b[row] = 1.0 + ((double)row) / ((double)rowsA);
    }

    /* h_Qb = b(Q) */
    for (int row = 0; row < rowsA; row++) {
        h_Qb[row] = h_b[h_Q[row]];
    }

    printf("step 4: prepare data on device\n");
    checkCudaErrors(
        cudaMemcpyAsync(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(
        cudaMemcpyAsync(d_csrRowPtrB, h_csrRowPtrB, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_csrColIndB, h_csrColIndB, sizeof(int) * nnzA, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_csrValB, h_csrValB, sizeof(double) * nnzA, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_Qb, h_Qb, sizeof(double) * rowsA, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_Q, h_Q, sizeof(int) * rowsA, cudaMemcpyHostToDevice, stream));

    printf("step 5: solve A*x = b on CPU \n");
    start = second();

    /* solve B*z = Q*b */
    if (0 == strcmp(opts.testFunc, "chol")) {
        checkCudaErrors(cusolverSpDcsrlsvcholHost(
            handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, h_Qb, tol, reorder, h_z, &singularity));
    }
    else if (0 == strcmp(opts.testFunc, "lu")) {
        checkCudaErrors(cusolverSpDcsrlsvluHost(
            handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, h_Qb, tol, reorder, h_z, &singularity));
    }
    else if (0 == strcmp(opts.testFunc, "qr")) {
        checkCudaErrors(cusolverSpDcsrlsvqrHost(
            handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, h_Qb, tol, reorder, h_z, &singularity));
    }
    else {
        fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
        return 1;
    }

    /* Q*x = z */
    for (int row = 0; row < rowsA; row++) {
        h_x[h_Q[row]] = h_z[row];
    }

    if (0 <= singularity) {
        printf("WARNING: the matrix is singular at row %d under tol (%E)\n", singularity, tol);
    }

    stop           = second();
    time_solve_cpu = stop - start;

    printf("step 6: evaluate residual r = b - A*x (result on CPU)\n");
    checkCudaErrors(cudaMemcpyAsync(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice, stream));

    /* Wrap raw data into cuSPARSE generic API objects */
    cusparseSpMatDescr_t matA = NULL;
    if (baseA) {
        checkCudaErrors(cusparseCreateCsr(&matA,
                                          rowsA,
                                          colsA,
                                          nnzA,
                                          d_csrRowPtrA,
                                          d_csrColIndA,
                                          d_csrValA,
                                          CUSPARSE_INDEX_32I,
                                          CUSPARSE_INDEX_32I,
                                          CUSPARSE_INDEX_BASE_ONE,
                                          CUDA_R_64F));
    }
    else {
        checkCudaErrors(cusparseCreateCsr(&matA,
                                          rowsA,
                                          colsA,
                                          nnzA,
                                          d_csrRowPtrA,
                                          d_csrColIndA,
                                          d_csrValA,
                                          CUSPARSE_INDEX_32I,
                                          CUSPARSE_INDEX_32I,
                                          CUSPARSE_INDEX_BASE_ZERO,
                                          CUDA_R_64F));
    }

    cusparseDnVecDescr_t vecx = NULL;
    checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F));
    cusparseDnVecDescr_t vecAx = NULL;
    checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F));

    /* Allocate workspace for cuSPARSE */
    size_t bufferSize = 0;
    checkCudaErrors(cusparseSpMV_bufferSize(cusparseHandle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &minus_one,
                                            matA,
                                            vecx,
                                            &one,
                                            vecAx,
                                            CUDA_R_64F,
                                            CUSPARSE_SPMV_ALG_DEFAULT,
                                            &bufferSize));
    void *buffer = NULL;
    checkCudaErrors(cudaMalloc(&buffer, bufferSize));

    checkCudaErrors(cusparseSpMV(cusparseHandle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &minus_one,
                                 matA,
                                 vecx,
                                 &one,
                                 vecAx,
                                 CUDA_R_64F,
                                 CUSPARSE_SPMV_ALG_DEFAULT,
                                 buffer));

    checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost, stream));
    /* wait until h_r is ready */
    checkCudaErrors(cudaDeviceSynchronize());

    b_inf = vec_norminf(rowsA, h_b);
    x_inf = vec_norminf(colsA, h_x);
    r_inf = vec_norminf(rowsA, h_r);
    A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA);

    printf("(CPU) |b - A*x| = %E \n", r_inf);
    printf("(CPU) |A| = %E \n", A_inf);
    printf("(CPU) |x| = %E \n", x_inf);
    printf("(CPU) |b| = %E \n", b_inf);
    printf("(CPU) |b - A*x|/(|A|*|x| + |b|) = %E \n", r_inf / (A_inf * x_inf + b_inf));

    printf("step 7: solve A*x = b on GPU\n");
    start = second();

    /* solve B*z = Q*b */
    if (0 == strcmp(opts.testFunc, "chol")) {
        checkCudaErrors(cusolverSpDcsrlsvchol(
            handle, rowsA, nnzA, descrA, d_csrValB, d_csrRowPtrB, d_csrColIndB, d_Qb, tol, reorder, d_z, &singularity));
    }
    else if (0 == strcmp(opts.testFunc, "lu")) {
        printf("WARNING: no LU available on GPU \n");
    }
    else if (0 == strcmp(opts.testFunc, "qr")) {
        checkCudaErrors(cusolverSpDcsrlsvqr(
            handle, rowsA, nnzA, descrA, d_csrValB, d_csrRowPtrB, d_csrColIndB, d_Qb, tol, reorder, d_z, &singularity));
    }
    else {
        fprintf(stderr, "Error: %s is unknow function\n", opts.testFunc);
        return 1;
    }
    checkCudaErrors(cudaDeviceSynchronize());
    if (0 <= singularity) {
        printf("WARNING: the matrix is singular at row %d under tol (%E)\n", singularity, tol);
    }
    /* Q*x = z */
    cusparseSpVecDescr_t vecz = NULL;
    checkCudaErrors(
        cusparseCreateSpVec(&vecz, colsA, rowsA, d_Q, d_z, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
    checkCudaErrors(cusparseScatter(cusparseHandle, vecz, vecx));
    checkCudaErrors(cusparseDestroySpVec(vecz));


    checkCudaErrors(cudaDeviceSynchronize());

    stop           = second();
    time_solve_gpu = stop - start;

    printf("step 8: evaluate residual r = b - A*x (result on GPU)\n");
    checkCudaErrors(cudaMemcpyAsync(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice, stream));

    checkCudaErrors(cusparseSpMV(cusparseHandle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &minus_one,
                                 matA,
                                 vecx,
                                 &one,
                                 vecAx,
                                 CUDA_R_64F,
                                 CUSPARSE_SPMV_ALG_DEFAULT,
                                 buffer));

    checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost, stream));
    /* wait until h_x and h_r are ready */
    checkCudaErrors(cudaDeviceSynchronize());

    b_inf = vec_norminf(rowsA, h_b);
    x_inf = vec_norminf(colsA, h_x);
    r_inf = vec_norminf(rowsA, h_r);

    if (0 != strcmp(opts.testFunc, "lu")) {
        // only cholesky and qr have GPU version
        printf("(GPU) |b - A*x| = %E \n", r_inf);
        printf("(GPU) |A| = %E \n", A_inf);
        printf("(GPU) |x| = %E \n", x_inf);
        printf("(GPU) |b| = %E \n", b_inf);
        printf("(GPU) |b - A*x|/(|A|*|x| + |b|) = %E \n", r_inf / (A_inf * x_inf + b_inf));
    }

    fprintf(stdout, "timing %s: CPU = %10.6f sec , GPU = %10.6f sec\n", opts.testFunc, time_solve_cpu, time_solve_gpu);

    if (0 != strcmp(opts.testFunc, "lu")) {
        printf("show last 10 elements of solution vector (GPU) \n");
        printf("consistent result for different reordering and solver \n");
        for (int j = rowsA - 10; j < rowsA; j++) {
            printf("x[%d] = %E\n", j, h_x[j]);
        }
    }

    if (handle) {
        checkCudaErrors(cusolverSpDestroy(handle));
    }
    if (cusparseHandle) {
        checkCudaErrors(cusparseDestroy(cusparseHandle));
    }
    if (stream) {
        checkCudaErrors(cudaStreamDestroy(stream));
    }
    if (descrA) {
        checkCudaErrors(cusparseDestroyMatDescr(descrA));
    }
    if (matA) {
        checkCudaErrors(cusparseDestroySpMat(matA));
    }
    if (vecx) {
        checkCudaErrors(cusparseDestroyDnVec(vecx));
    }
    if (vecAx) {
        checkCudaErrors(cusparseDestroyDnVec(vecAx));
    }

    if (h_csrValA) {
        free(h_csrValA);
    }
    if (h_csrRowPtrA) {
        free(h_csrRowPtrA);
    }
    if (h_csrColIndA) {
        free(h_csrColIndA);
    }
    if (h_z) {
        free(h_z);
    }
    if (h_x) {
        free(h_x);
    }
    if (h_b) {
        free(h_b);
    }
    if (h_Qb) {
        free(h_Qb);
    }
    if (h_r) {
        free(h_r);
    }

    if (h_Q) {
        free(h_Q);
    }

    if (h_csrRowPtrB) {
        free(h_csrRowPtrB);
    }
    if (h_csrColIndB) {
        free(h_csrColIndB);
    }
    if (h_csrValB) {
        free(h_csrValB);
    }
    if (h_mapBfromA) {
        free(h_mapBfromA);
    }

    if (buffer_cpu) {
        free(buffer_cpu);
    }

    if (buffer) {
        checkCudaErrors(cudaFree(buffer));
    }
    if (d_csrValA) {
        checkCudaErrors(cudaFree(d_csrValA));
    }
    if (d_csrRowPtrA) {
        checkCudaErrors(cudaFree(d_csrRowPtrA));
    }
    if (d_csrColIndA) {
        checkCudaErrors(cudaFree(d_csrColIndA));
    }
    if (d_csrValB) {
        checkCudaErrors(cudaFree(d_csrValB));
    }
    if (d_csrRowPtrB) {
        checkCudaErrors(cudaFree(d_csrRowPtrB));
    }
    if (d_csrColIndB) {
        checkCudaErrors(cudaFree(d_csrColIndB));
    }
    if (d_Q) {
        checkCudaErrors(cudaFree(d_Q));
    }
    if (d_z) {
        checkCudaErrors(cudaFree(d_z));
    }
    if (d_x) {
        checkCudaErrors(cudaFree(d_x));
    }
    if (d_b) {
        checkCudaErrors(cudaFree(d_b));
    }
    if (d_Qb) {
        checkCudaErrors(cudaFree(d_Qb));
    }
    if (d_r) {
        checkCudaErrors(cudaFree(d_r));
    }

    return 0;
}