CuPBoP/examples/dwt2d/dwt_cuda/common.h

///  
/// @file    common.h
/// @author  Martin Jirman (207962@mail.muni.cz)
/// @brief   Common stuff for all CUDA dwt functions.
/// @date    2011-01-20 14:19
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
/// 
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
/// 
///     * Redistributions of source code must retain the above copyright
///       notice, this list of conditions and the following disclaimer.
///     * Redistributions in binary form must reproduce the above copyright
///       notice, this list of conditions and the following disclaimer in the
///       documentation and/or other materials provided with the distribution.
/// 
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///


#ifndef DWT_COMMON_H
#define	DWT_COMMON_H


#include <cstdio>
#include <algorithm>
#include <vector>


// compile time minimum macro
#define CTMIN(a,b) (((a) < (b)) ? (a) : (b))


// performance testing macros
#if defined(GPU_DWT_TESTING)
  #define PERF_BEGIN  \
  { \
    dwt_cuda::CudaDWTTester PERF_TESTER; \
    for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \
    { \
      PERF_TESTER.beginTestIteration();

  #define PERF_END(PERF_NAME, PERF_W, PERF_H)  \
      PERF_TESTER.endTestIteration(); \
    } \
    PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
  }
#else // GPU_DWT_TESTING
  #define PERF_BEGIN
  #define PERF_END(PERF_NAME, PERF_W, PERF_H)
#endif // GPU_DWT_TESTING


namespace dwt_cuda {
  
  
  /// Divide and round up.
  template <typename T>
  __device__ __host__ inline T divRndUp(const T & n, const T & d) {
    return (n / d) + ((n % d) ? 1 : 0);
  }
  
  
  // 9/7 forward DWT lifting schema coefficients
  const float f97Predict1 = -1.586134342;   ///< forward 9/7 predict 1
  const float f97Update1 = -0.05298011854;  ///< forward 9/7 update 1
  const float f97Predict2 = 0.8829110762;   ///< forward 9/7 predict 2
  const float f97Update2 = 0.4435068522;    ///< forward 9/7 update 2


  // 9/7 reverse DWT lifting schema coefficients
  const float r97update2 = -f97Update2;    ///< undo 9/7 update 2
  const float r97predict2 = -f97Predict2;  ///< undo 9/7 predict 2
  const float r97update1 = -f97Update1;    ///< undo 9/7 update 1
  const float r97Predict1 = -f97Predict1;  ///< undo 9/7 predict 1
  
  // FDWT 9/7 scaling coefficients
  const float scale97Mul = 1.23017410491400f;
  const float scale97Div = 1.0 / scale97Mul;
  
  
  // 5/3 forward DWT lifting schema coefficients
  const float forward53Predict = -0.5f;   /// forward 5/3 predict
  const float forward53Update = 0.25f;    /// forward 5/3 update
  
  // 5/3 forward DWT lifting schema coefficients
  const float reverse53Update = -forward53Update;    /// undo 5/3 update
  const float reverse53Predict = -forward53Predict;  /// undo 5/3 predict
  
  
  /// Functor which adds scaled sum of neighbors to given central pixel.
  struct AddScaledSum {
    const float scale;  // scale of neighbors
    __device__ AddScaledSum(const float scale) : scale(scale) {}
    __device__ void operator()(const float p, float & c, const float n) const {

      // if(threadIdx.x == 0) {

      //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
      
      // }
      
      c += scale * (p + n);
    }
  };
  
  
  /// Returns index ranging from 0 to num threads, such that first half
  /// of threads get even indices and others get odd indices. Each thread
  /// gets different index.
  /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
  ///                              parityIdx:   0  2  4  6  1  3  5  7
  /// @tparam THREADS  total count of participating threads
  /// @return parity-separated index of thread
  template <int THREADS>
  __device__ inline int parityIdx() {
    return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
  }
  
          
  /// size of shared memory
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
  const int SHM_SIZE = 48 * 1024;
  #else
  const int SHM_SIZE = 16 * 1024;
  #endif
  
  
  /// Perrformance and return code tester.
  class CudaDWTTester {
  private:
    static bool testRunning;    ///< true if any test is currently running
    cudaEvent_t beginEvent;     ///< begin CUDA event
    cudaEvent_t endEvent;       ///< end CUDA event
    std::vector<float> times;   ///< collected times
    const bool disabled;        ///< true if this object is disabled
  public:
    /// Checks CUDA related error.
    /// @param status   return code to be checked
    /// @param message  message to be shown if there was an error
    /// @return true if there was no error, false otherwise
    static bool check(const cudaError_t & status, const char * message) {
      #if defined(GPU_DWT_TESTING)
      if((!testRunning) && status != cudaSuccess) {
        const char * errorString = cudaGetErrorString(status);
        fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
        fflush(stderr);
        return false;
      }
      #endif // GPU_DWT_TESTING
      return true;
    }

    /// Checks last kernel call for errors.
    /// @param message  description of the kernel call
    /// @return true if there was no error, false otherwise
    static bool checkLastKernelCall(const char * message) {
      #if defined(GPU_DWT_TESTING)
      return testRunning ? true : check(cudaThreadSynchronize(), message);
      #else // GPU_DWT_TESTING
      return true;
      #endif // GPU_DWT_TESTING
    }
    
    /// Initializes DWT tester for time measurement
    CudaDWTTester() : disabled(testRunning) {}
    
    /// Gets rpefered number of iterations
    int getNumIterations() {
      return disabled ? 1 : 31;
    }
    
    /// Starts one test iteration.
    void beginTestIteration() {
      if(!disabled) {
        cudaEventCreate(&beginEvent);
        cudaEventCreate(&endEvent);
        cudaEventRecord(beginEvent, 0);
        testRunning = true;
      }
    }
    
    /// Ends on etest iteration.
    void endTestIteration() {
      if(!disabled) {
        float time;
        testRunning = false;
        cudaEventRecord(endEvent, 0);
        cudaEventSynchronize(endEvent);
        cudaEventElapsedTime(&time, beginEvent, endEvent);
        cudaEventDestroy(beginEvent);
        cudaEventDestroy(endEvent);
        times.push_back(time);
      }
    }
    
    /// Shows brief info about all iterations.
    /// @param name   name of processing method
    /// @param sizeX  width of processed image
    /// @param sizeY  height of processed image
    void showPerformance(const char * name, const int sizeX, const int sizeY) {
      if(!disabled) {
        // compute mean and median
        std::sort(times.begin(), times.end());
        double sum = 0;
        for(int i = times.size(); i--; ) {
          sum += times[i];
        }
        const double median = (times[times.size() / 2]
                             + times[(times.size() - 1) / 2]) * 0.5f;
        printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  "
               "(%d x %d)\n", name, (sum / times.size()), median, 
               times[times.size() - 1], sizeX, sizeY);
      }
    }
  };
  
  
  /// Simple cudaMemcpy wrapped in performance tester.
  /// @param dest  destination bufer
  /// @param src   source buffer
  /// @param sx    width of copied image
  /// @param sy    height of copied image
  template <typename T>
  inline void memCopy(T * const dest, const T * const src,
                      const size_t sx, const size_t sy) {
    cudaError_t status;
    PERF_BEGIN
    status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice);
    PERF_END("        memcpy", sx, sy)
    CudaDWTTester::check(status, "memcpy device > device");
  }
  
  
} // end of namespace dwt_cuda


#endif	// DWT_COMMON_CUDA_H
add dwt2d example and fixes 2022-05-22 03:55:49 +08:00			`///`
			`/// @file common.h`
			`/// @author Martin Jirman (207962@mail.muni.cz)`
			`/// @brief Common stuff for all CUDA dwt functions.`
			`/// @date 2011-01-20 14:19`
			`///`
			`/// Copyright (c) 2011 Martin Jirman`
			`/// All rights reserved.`
			`///`
			`/// Redistribution and use in source and binary forms, with or without`
			`/// modification, are permitted provided that the following conditions are met:`
			`///`
			`/// * Redistributions of source code must retain the above copyright`
			`/// notice, this list of conditions and the following disclaimer.`
			`/// * Redistributions in binary form must reproduce the above copyright`
			`/// notice, this list of conditions and the following disclaimer in the`
			`/// documentation and/or other materials provided with the distribution.`
			`///`
			`/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE`
			`/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`/// POSSIBILITY OF SUCH DAMAGE.`
			`///`


			`#ifndef DWT_COMMON_H`
			`#define DWT_COMMON_H`


			`#include <cstdio>`
			`#include <algorithm>`
			`#include <vector>`



			`// compile time minimum macro`
			`#define CTMIN(a,b) (((a) < (b)) ? (a) : (b))`



			`// performance testing macros`
			`#if defined(GPU_DWT_TESTING)`
			`#define PERF_BEGIN \`
			`{ \`
			`dwt_cuda::CudaDWTTester PERF_TESTER; \`
			`for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \`
			`{ \`
			`PERF_TESTER.beginTestIteration();`

			`#define PERF_END(PERF_NAME, PERF_W, PERF_H) \`
			`PERF_TESTER.endTestIteration(); \`
			`} \`
			`PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \`
			`}`
			`#else // GPU_DWT_TESTING`
			`#define PERF_BEGIN`
			`#define PERF_END(PERF_NAME, PERF_W, PERF_H)`
			`#endif // GPU_DWT_TESTING`



			`namespace dwt_cuda {`


			`/// Divide and round up.`
			`template <typename T>`
			`__device__ __host__ inline T divRndUp(const T & n, const T & d) {`
			`return (n / d) + ((n % d) ? 1 : 0);`
			`}`


			`// 9/7 forward DWT lifting schema coefficients`
			`const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1`
			`const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1`
			`const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2`
			`const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2`


			`// 9/7 reverse DWT lifting schema coefficients`
			`const float r97update2 = -f97Update2; ///< undo 9/7 update 2`
			`const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2`
			`const float r97update1 = -f97Update1; ///< undo 9/7 update 1`
			`const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1`

			`// FDWT 9/7 scaling coefficients`
			`const float scale97Mul = 1.23017410491400f;`
			`const float scale97Div = 1.0 / scale97Mul;`


			`// 5/3 forward DWT lifting schema coefficients`
			`const float forward53Predict = -0.5f; /// forward 5/3 predict`
			`const float forward53Update = 0.25f; /// forward 5/3 update`

			`// 5/3 forward DWT lifting schema coefficients`
			`const float reverse53Update = -forward53Update; /// undo 5/3 update`
			`const float reverse53Predict = -forward53Predict; /// undo 5/3 predict`



			`/// Functor which adds scaled sum of neighbors to given central pixel.`
			`struct AddScaledSum {`
			`const float scale; // scale of neighbors`
			`__device__ AddScaledSum(const float scale) : scale(scale) {}`
			`__device__ void operator()(const float p, float & c, const float n) const {`

			`// if(threadIdx.x == 0) {`

			`// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );`

			`// }`

			`c += scale * (p + n);`
			`}`
			`};`



			`/// Returns index ranging from 0 to num threads, such that first half`
			`/// of threads get even indices and others get odd indices. Each thread`
			`/// gets different index.`
			`/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7`
			`/// parityIdx: 0 2 4 6 1 3 5 7`
			`/// @tparam THREADS total count of participating threads`
			`/// @return parity-separated index of thread`
			`template <int THREADS>`
			`__device__ inline int parityIdx() {`
			`return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));`
			`}`



			`/// size of shared memory`
			`#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)`
			`const int SHM_SIZE = 48 * 1024;`
			`#else`
			`const int SHM_SIZE = 16 * 1024;`
			`#endif`



			`/// Perrformance and return code tester.`
			`class CudaDWTTester {`
			`private:`
			`static bool testRunning; ///< true if any test is currently running`
			`cudaEvent_t beginEvent; ///< begin CUDA event`
			`cudaEvent_t endEvent; ///< end CUDA event`
			`std::vector<float> times; ///< collected times`
			`const bool disabled; ///< true if this object is disabled`
			`public:`
			`/// Checks CUDA related error.`
			`/// @param status return code to be checked`
			`/// @param message message to be shown if there was an error`
			`/// @return true if there was no error, false otherwise`
			`static bool check(const cudaError_t & status, const char * message) {`
			`#if defined(GPU_DWT_TESTING)`
			`if((!testRunning) && status != cudaSuccess) {`
			`const char * errorString = cudaGetErrorString(status);`
			`fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);`
			`fflush(stderr);`
			`return false;`
			`}`
			`#endif // GPU_DWT_TESTING`
			`return true;`
			`}`

			`/// Checks last kernel call for errors.`
			`/// @param message description of the kernel call`
			`/// @return true if there was no error, false otherwise`
			`static bool checkLastKernelCall(const char * message) {`
			`#if defined(GPU_DWT_TESTING)`
			`return testRunning ? true : check(cudaThreadSynchronize(), message);`
			`#else // GPU_DWT_TESTING`
			`return true;`
			`#endif // GPU_DWT_TESTING`
			`}`

			`/// Initializes DWT tester for time measurement`
			`CudaDWTTester() : disabled(testRunning) {}`

			`/// Gets rpefered number of iterations`
			`int getNumIterations() {`
			`return disabled ? 1 : 31;`
			`}`

			`/// Starts one test iteration.`
			`void beginTestIteration() {`
			`if(!disabled) {`
			`cudaEventCreate(&beginEvent);`
			`cudaEventCreate(&endEvent);`
			`cudaEventRecord(beginEvent, 0);`
			`testRunning = true;`
			`}`
			`}`

			`/// Ends on etest iteration.`
			`void endTestIteration() {`
			`if(!disabled) {`
			`float time;`
			`testRunning = false;`
			`cudaEventRecord(endEvent, 0);`
			`cudaEventSynchronize(endEvent);`
			`cudaEventElapsedTime(&time, beginEvent, endEvent);`
			`cudaEventDestroy(beginEvent);`
			`cudaEventDestroy(endEvent);`
			`times.push_back(time);`
			`}`
			`}`

			`/// Shows brief info about all iterations.`
			`/// @param name name of processing method`
			`/// @param sizeX width of processed image`
			`/// @param sizeY height of processed image`
			`void showPerformance(const char * name, const int sizeX, const int sizeY) {`
			`if(!disabled) {`
			`// compute mean and median`
			`std::sort(times.begin(), times.end());`
			`double sum = 0;`
			`for(int i = times.size(); i--; ) {`
			`sum += times[i];`
			`}`
			`const double median = (times[times.size() / 2]`
			`+ times[(times.size() - 1) / 2]) * 0.5f;`
			`printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "`
			`"(%d x %d)\n", name, (sum / times.size()), median,`
			`times[times.size() - 1], sizeX, sizeY);`
			`}`
			`}`
			`};`



			`/// Simple cudaMemcpy wrapped in performance tester.`
			`/// @param dest destination bufer`
			`/// @param src source buffer`
			`/// @param sx width of copied image`
			`/// @param sy height of copied image`
			`template <typename T>`
			`inline void memCopy(T * const dest, const T * const src,`
			`const size_t sx, const size_t sy) {`
			`cudaError_t status;`
			`PERF_BEGIN`
			`status = cudaMemcpy(dest, src, sxsysizeof(T), cudaMemcpyDeviceToDevice);`
			`PERF_END(" memcpy", sx, sy)`
			`CudaDWTTester::check(status, "memcpy device > device");`
			`}`



			`} // end of namespace dwt_cuda`



			`#endif // DWT_COMMON_CUDA_H`