cuda-samples/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.cuh

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>

#include "Mandelbrot_kernel.h"
#include "helper_cuda.h"

// The dimensions of the thread block
#define BLOCKDIM_X 16
#define BLOCKDIM_Y 16

#define ABS(n) ((n) < 0 ? -(n) : (n))

// Double single functions based on DSFUN90 package:
// http://crd.lbl.gov/~dhbailey/mpdist/index.html

// This function sets the DS number A equal to the double precision floating
// point number B.
inline void dsdeq(float &a0, float &a1, double b)
{
    a0 = (float)b;
    a1 = (float)(b - a0);
} // dsdcp

// This function sets the DS number A equal to the single precision floating
// point number B.
__device__ inline void dsfeq(float &a0, float &a1, float b)
{
    a0 = b;
    a1 = 0.0f;
} // dsfeq

// This function computes c = a + b.
__device__ inline void dsadd(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
{
    // Compute dsa + dsb using Knuth's trick.
    float t1 = a0 + b0;
    float e  = t1 - a0;
    float t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;

    // The result is t1 + t2, after normalization.
    c0 = e = t1 + t2;
    c1     = t2 - (e - t1);
} // dsadd

// This function computes c = a - b.
__device__ inline void dssub(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
{
    // Compute dsa - dsb using Knuth's trick.
    float t1 = a0 - b0;
    float e  = t1 - a0;
    float t2 = ((-b0 - e) + (a0 - (t1 - e))) + a1 - b1;

    // The result is t1 + t2, after normalization.
    c0 = e = t1 + t2;
    c1     = t2 - (e - t1);
} // dssub

#if 1

// This function multiplies DS numbers A and B to yield the DS product C.
__device__ inline void dsmul(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
{
    // This splits dsa(1) and dsb(1) into high-order and low-order words.
    float cona = a0 * 8193.0f;
    float conb = b0 * 8193.0f;
    float sa1  = cona - (cona - a0);
    float sb1  = conb - (conb - b0);
    float sa2  = a0 - sa1;
    float sb2  = b0 - sb1;

    // Multilply a0 * b0 using Dekker's method.
    float c11 = a0 * b0;
    float c21 = (((sa1 * sb1 - c11) + sa1 * sb2) + sa2 * sb1) + sa2 * sb2;

    // Compute a0 * b1 + a1 * b0 (only high-order word is needed).
    float c2 = a0 * b1 + a1 * b0;

    // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product.
    float t1 = c11 + c2;
    float e  = t1 - c11;
    float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + a1 * b1;

    // The result is t1 + t2, after normalization.
    c0 = e = t1 + t2;
    c1     = t2 - (e - t1);
} // dsmul

#else

// Modified double-single mul function by Norbert Juffa, NVIDIA
// uses __fmul_rn() and __fadd_rn() intrinsics which prevent FMAD merging

/* Based on: Guillaume Da Gra<EFBFBD>a, David Defour. Implementation of Float-Float
 * Operators on Graphics Hardware. RNC'7 pp. 23-32, 2006.
 */

// This function multiplies DS numbers A and B to yield the DS product C.
__device__ inline void dsmul(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
{
    // This splits dsa(1) and dsb(1) into high-order and low-order words.
    float cona = a0 * 8193.0f;
    float conb = b0 * 8193.0f;
    float sa1  = cona - (cona - a0);
    float sb1  = conb - (conb - b0);
    float sa2  = a0 - sa1;
    float sb2  = b0 - sb1;

    // Multilply a0 * b0 using Dekker's method.
    float c11 = __fmul_rn(a0, b0);
    float c21 = (((sa1 * sb1 - c11) + sa1 * sb2) + sa2 * sb1) + sa2 * sb2;

    // Compute a0 * b1 + a1 * b0 (only high-order word is needed).
    float c2 = __fmul_rn(a0, b1) + __fmul_rn(a1, b0);

    // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product.
    float t1 = c11 + c2;
    float e  = t1 - c11;
    float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + __fmul_rn(a1, b1);

    // The result is t1 + t2, after normalization.
    c0 = e = t1 + t2;
    c1     = t2 - (e - t1);
} // dsmul

#endif

// The core Mandelbrot CUDA GPU calculation function
#if 1
// Unrolled version
template <class T>
__device__ inline int
CalcMandelbrot(const T xPos, const T yPos, const T xJParam, const T yJParam, const int crunch, const bool isJulia)
{
    T   x, y, xx, yy;
    int i = crunch;

    T xC, yC;

    if (isJulia) {
        xC = xJParam;
        yC = yJParam;
        y  = yPos;
        x  = xPos;
        yy = y * y;
        xx = x * x;
    }
    else {
        xC = xPos;
        yC = yPos;
        y  = 0;
        x  = 0;
        yy = 0;
        xx = 0;
    }

    do {
        // Iteration 1
        if (xx + yy > T(4.0))
            return i - 1;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 2
        if (xx + yy > T(4.0))
            return i - 2;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 3
        if (xx + yy > T(4.0))
            return i - 3;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 4
        if (xx + yy > T(4.0))
            return i - 4;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 5
        if (xx + yy > T(4.0))
            return i - 5;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 6
        if (xx + yy > T(4.0))
            return i - 6;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 7
        if (xx + yy > T(4.0))
            return i - 7;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 8
        if (xx + yy > T(4.0))
            return i - 8;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 9
        if (xx + yy > T(4.0))
            return i - 9;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 10
        if (xx + yy > T(4.0))
            return i - 10;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 11
        if (xx + yy > T(4.0))
            return i - 11;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 12
        if (xx + yy > T(4.0))
            return i - 12;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 13
        if (xx + yy > T(4.0))
            return i - 13;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 14
        if (xx + yy > T(4.0))
            return i - 14;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 15
        if (xx + yy > T(4.0))
            return i - 15;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 16
        if (xx + yy > T(4.0))
            return i - 16;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 17
        if (xx + yy > T(4.0))
            return i - 17;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 18
        if (xx + yy > T(4.0))
            return i - 18;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 19
        if (xx + yy > T(4.0))
            return i - 19;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;

        // Iteration 20
        i -= 20;

        if ((i <= 0) || (xx + yy > T(4.0)))
            return i;

        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;
    } while (1);
} // CalcMandelbrot

#else

template <class T>
__device__ inline int
CalcMandelbrot(const T xPos, const T yPos, const T xJParam, const T yJParam, const int crunch, const isJulia)
{
    T x, y, xx, yy, xC, yC;

    if (isJulia) {
        xC = xJParam;
        yC = yJParam;
        y  = yPos;
        x  = xPos;
        yy = y * y;
        xx = x * x;
    }
    else {
        xC = xPos;
        yC = yPos;
        y  = 0;
        x  = 0;
        yy = 0;
        xx = 0;
    }

    int i = crunch;

    while (--i && (xx + yy < T(4.0))) {
        y  = x * y * T(2.0) + yC;
        x  = xx - yy + xC;
        yy = y * y;
        xx = x * x;
    }

    return i; // i > 0 ? crunch - i : 0;
} // CalcMandelbrot

#endif

// The core Mandelbrot calculation function in double-single precision
__device__ inline int CalcMandelbrotDS(const float xPos0,
                                       const float xPos1,
                                       const float yPos0,
                                       const float yPos1,
                                       const float xJParam,
                                       const float yJParam,
                                       const int   crunch,
                                       const bool  isJulia)
{
    float xx0, xx1;
    float yy0, yy1;
    float sum0, sum1;
    int   i = crunch;

    float x0, x1, y0, y1;
    float xC0, xC1, yC0, yC1;

    if (isJulia) {
        xC0 = xJParam;
        xC1 = 0;
        yC0 = yJParam;
        yC1 = 0;
        y0  = yPos0; // y = yPos;
        y1  = yPos1;
        x0  = xPos0; // x = xPos;
        x1  = xPos1;
        dsmul(yy0, yy1, y0, y1, y0, y1); // yy = y * y;
        dsmul(xx0, xx1, x0, x1, x0, x1); // xx = x * x;
    }
    else {
        xC0 = xPos0;
        xC1 = xPos1;
        yC0 = yPos0;
        yC1 = yPos1;
        y0  = 0; // y = 0 ;
        y1  = 0;
        x0  = 0; // x = 0 ;
        x1  = 0;
        yy0 = 0; // yy = 0 ;
        yy1 = 0;
        xx0 = 0; // xx = 0 ;
        xx1 = 0;
    }

    dsadd(sum0, sum1, xx0, xx1, yy0, yy1); // sum = xx + yy;

    while (--i && (sum0 + sum1 < 4.0f)) {
        dsmul(y0, y1, x0, x1, y0, y1); // y = x * y * 2.0f + yC;  // yC is yPos for
                                       // Mandelbrot and it is yJParam for Julia
        dsadd(y0, y1, y0, y1, y0, y1);
        dsadd(y0, y1, y0, y1, yC0, yC1);

        dssub(x0, x1, xx0, xx1, yy0, yy1); //  x = xx - yy + xC;  // xC is xPos for
                                           //  Mandelbrot and it is xJParam for
                                           //  Julia
        dsadd(x0, x1, x0, x1, xC0, xC1);

        dsmul(yy0, yy1, y0, y1, y0, y1);       // yy = y * y;
        dsmul(xx0, xx1, x0, x1, x0, x1);       // xx = x * x;
        dsadd(sum0, sum1, xx0, xx1, yy0, yy1); // sum = xx + yy;
    }

    return i;
} // CalcMandelbrotDS

// Determine if two pixel colors are within tolerance
__device__ inline int CheckColors(const uchar4 &color0, const uchar4 &color1)
{
    int x = color1.x - color0.x;
    int y = color1.y - color0.y;
    int z = color1.z - color0.z;
    return (ABS(x) > 10) || (ABS(y) > 10) || (ABS(z) > 10);
} // CheckColors

// Increase the grid size by 1 if the image width or height does not divide
// evenly
// by the thread block dimensions
inline int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); } // iDivUp