commit 197abc867d
				|  | @ -27,10 +27,10 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS | |||
|    export CuPBoP_PATH=`pwd` | ||||
|    export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH | ||||
|    ``` | ||||
| If you are using boson, you can pre-installed llvm 10.0.0  | ||||
| 
 | ||||
|  LLVM_PATH=/opt/llvm-10.0.0 | ||||
|  export PATH=$LLVM_PATH/bin:$PATH | ||||
|    If you are using boson, you can use the pre-installed llvm 10.0.0\ | ||||
|    `LLVM_PATH=/opt/llvm-10.0.0`\ | ||||
|    `export PATH=$LLVM_PATH/bin:$PATH` | ||||
| 
 | ||||
| 2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file | ||||
| 
 | ||||
|  |  | |||
|  | @ -396,7 +396,7 @@ void init_block(llvm::Module *M, std::ofstream &fout) { | |||
|   replace_asm_call(M); | ||||
|   // replace dynamic shared memory
 | ||||
|   auto dynamic_shared_memory_addr = | ||||
|         M->getGlobalVariable("dynamic_shared_memory"); | ||||
|       M->getGlobalVariable("dynamic_shared_memory"); | ||||
|   if (dynamic_shared_memory_addr) { | ||||
|     replace_dynamic_shared_memory(M); | ||||
|   } | ||||
|  |  | |||
|  | @ -272,13 +272,12 @@ void AddContextSaveRestore(llvm::Instruction *instruction, | |||
|   std::vector<Instruction *> uses; | ||||
|   Function *f2 = instruction->getParent()->getParent(); | ||||
| 
 | ||||
| 
 | ||||
|   for (Instruction::use_iterator ui = instruction->use_begin(), | ||||
|                                  ue = instruction->use_end(); | ||||
|        ui != ue; ++ui) { | ||||
|     llvm::Instruction *user = cast<Instruction>(ui->getUser()); | ||||
|     Function *f1 = user->getParent()->getParent(); | ||||
|     if(f2->getName() != f1->getName()) { | ||||
|     if (f2->getName() != f1->getName()) { | ||||
|       continue; | ||||
|     } | ||||
|     if (user == NULL) | ||||
|  |  | |||
|  | @ -89,11 +89,12 @@ void mem_share2global(llvm::Module *M) { | |||
|           } else if (element_type->isStructTy()) { | ||||
|             auto undef = llvm::UndefValue::get(element_type); | ||||
|             llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( | ||||
|                 *M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef, | ||||
|                 new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, | ||||
|                 false); | ||||
|                 *M, element_type, false, llvm::GlobalValue::ExternalLinkage, | ||||
|                 undef, new_name, NULL, | ||||
|                 llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); | ||||
|             global_memory->setDSOLocal(true); | ||||
|             Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName())); | ||||
|             Comdat *comdat = | ||||
|                 M->getOrInsertComdat(StringRef(share_memory->getName())); | ||||
|             comdat->setSelectionKind(Comdat::SelectionKind::Any); | ||||
|             global_memory->setComdat(comdat); | ||||
|             global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage); | ||||
|  | @ -101,8 +102,7 @@ void mem_share2global(llvm::Module *M) { | |||
|             global_memory->setAlignment(share_memory->getAlignment()); | ||||
|             corresponding_global_memory.insert( | ||||
|                 std::pair<GlobalVariable *, GlobalVariable *>(share_memory, | ||||
|                 global_memory)); | ||||
| 
 | ||||
|                                                               global_memory)); | ||||
| 
 | ||||
|           } else { | ||||
|             assert(0 && "The required Share Memory Type is not supported\n"); | ||||
|  |  | |||
|  | @ -27,9 +27,9 @@ | |||
| #ifndef _COMMON_H | ||||
| #define _COMMON_H | ||||
| 
 | ||||
| //24-bit multiplication is faster on G80,
 | ||||
| //but we must be sure to multiply integers
 | ||||
| //only within [-8M, 8M - 1] range
 | ||||
| // 24-bit multiplication is faster on G80,
 | ||||
| // but we must be sure to multiply integers
 | ||||
| // only within [-8M, 8M - 1] range
 | ||||
| #define IMUL(a, b) __mul24(a, b) | ||||
| 
 | ||||
| ////cuda timing macros
 | ||||
|  | @ -42,21 +42,23 @@ | |||
| //                          cudaEventSynchronize(cstop); \ | ||||
| //                          cudaEventElapsedTime(&elapsedTime, cstart, cstop)
 | ||||
| 
 | ||||
| //divide and round up macro
 | ||||
| // divide and round up macro
 | ||||
| #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b))) | ||||
| 
 | ||||
| #  define cudaCheckError( msg ) {                                            \ | ||||
|     cudaError_t err = cudaGetLastError();                                    \ | ||||
|     if( cudaSuccess != err) {                                                \ | ||||
|         fprintf(stderr, "%s: %i: %s: %s.\n",                                 \ | ||||
|                 __FILE__, __LINE__, msg, cudaGetErrorString( err) );         \ | ||||
|         exit(-1);                                                            \ | ||||
|     } } | ||||
| 
 | ||||
| #  define cudaCheckAsyncError( msg ) {                                       \ | ||||
|     cudaThreadSynchronize();                                                 \ | ||||
|     cudaCheckError( msg );                                                   \ | ||||
|     } | ||||
| #define cudaCheckError(msg)                                                    \ | ||||
|   {                                                                            \ | ||||
|     cudaError_t err = cudaGetLastError();                                      \ | ||||
|     if (cudaSuccess != err) {                                                  \ | ||||
|       fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg,            \ | ||||
|               cudaGetErrorString(err));                                        \ | ||||
|       exit(-1);                                                                \ | ||||
|     }                                                                          \ | ||||
|   } | ||||
| 
 | ||||
| #define cudaCheckAsyncError(msg)                                               \ | ||||
|   {                                                                            \ | ||||
|     cudaThreadSynchronize();                                                   \ | ||||
|     cudaCheckError(msg);                                                       \ | ||||
|   } | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
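The reformatted `cudaCheckError` / `cudaCheckAsyncError` macros above wrap the usual `cudaGetLastError` pattern: the plain variant only inspects the last recorded error, while the async variant first calls `cudaThreadSynchronize` so failures from a still-running kernel are also surfaced. Below is a minimal usage sketch (not part of the commit); it assumes this `common.h` is on the include path, and the kernel name `scaleKernel` is purely illustrative.

```cuda
#include <cstdio>
#include <cstdlib>
#include "common.h" // provides cudaCheckError / cudaCheckAsyncError / DIVANDRND (hunk above)

// Illustrative kernel: scales a buffer in place.
__global__ void scaleKernel(float *data, int n, float s) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= s;
}

int main() {
  const int n = 1 << 20;
  float *d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaCheckError("cudaMalloc");                 // checks the last recorded runtime error

  scaleKernel<<<DIVANDRND(n, 256), 256>>>(d_data, n, 2.0f);
  cudaCheckError("scaleKernel launch");         // catches launch-time errors only
  cudaCheckAsyncError("scaleKernel execution"); // synchronizes first, then checks

  cudaFree(d_data);
  return 0;
}
```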
|  | @ -28,11 +28,12 @@ | |||
| #define _COMPONENTS_H | ||||
| 
 | ||||
| /* Separate compoents of source 8bit RGB image */ | ||||
| template<typename T> | ||||
| void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height); | ||||
| template <typename T> | ||||
| void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width, | ||||
|                      int height); | ||||
| 
 | ||||
| /* Copy a 8bit source image data into a color compoment of type T */ | ||||
| template<typename T> | ||||
| void bwToComponent(T *d_c, unsigned char * src, int width, int height); | ||||
| template <typename T> | ||||
| void bwToComponent(T *d_c, unsigned char *src, int width, int height); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -27,14 +27,15 @@ | |||
| #ifndef _DWT_H | ||||
| #define _DWT_H | ||||
| 
 | ||||
| template<typename T>  | ||||
| int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward); | ||||
| template <typename T> | ||||
| int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight, | ||||
|                 int stages, bool forward); | ||||
| 
 | ||||
| template<typename T> | ||||
| int writeNStage2DDWT(T *component_cuda, int width, int height,  | ||||
|                      int stages, const char * filename, const char * suffix); | ||||
| template<typename T> | ||||
| int writeLinear(T *component_cuda, int width, int height,  | ||||
|                      const char * filename, const char * suffix); | ||||
| template <typename T> | ||||
| int writeNStage2DDWT(T *component_cuda, int width, int height, int stages, | ||||
|                      const char *filename, const char *suffix); | ||||
| template <typename T> | ||||
| int writeLinear(T *component_cuda, int width, int height, const char *filename, | ||||
|                 const char *suffix); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -29,233 +29,204 @@ | |||
| /// POSSIBILITY OF SUCH DAMAGE.
 | ||||
| ///
 | ||||
| 
 | ||||
| 
 | ||||
| #ifndef DWT_COMMON_H | ||||
| #define	DWT_COMMON_H | ||||
| #define DWT_COMMON_H | ||||
| 
 | ||||
| 
 | ||||
| #include <cstdio> | ||||
| #include <algorithm> | ||||
| #include <cstdio> | ||||
| #include <vector> | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // compile time minimum macro
 | ||||
| #define CTMIN(a,b) (((a) < (b)) ? (a) : (b)) | ||||
| 
 | ||||
| 
 | ||||
| #define CTMIN(a, b) (((a) < (b)) ? (a) : (b)) | ||||
| 
 | ||||
| // performance testing macros
 | ||||
| #if defined(GPU_DWT_TESTING) | ||||
|   #define PERF_BEGIN  \ | ||||
|   { \ | ||||
|     dwt_cuda::CudaDWTTester PERF_TESTER; \ | ||||
|     for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \ | ||||
|     { \ | ||||
| #define PERF_BEGIN                                                             \ | ||||
|   {                                                                            \ | ||||
|     dwt_cuda::CudaDWTTester PERF_TESTER;                                       \ | ||||
|     for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) {             \ | ||||
|       PERF_TESTER.beginTestIteration(); | ||||
| 
 | ||||
|   #define PERF_END(PERF_NAME, PERF_W, PERF_H)  \ | ||||
|       PERF_TESTER.endTestIteration(); \ | ||||
|     } \ | ||||
|     PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \ | ||||
| #define PERF_END(PERF_NAME, PERF_W, PERF_H)                                    \ | ||||
|   PERF_TESTER.endTestIteration();                                              \ | ||||
|   }                                                                            \ | ||||
|   PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H);                      \ | ||||
|   } | ||||
| #else // GPU_DWT_TESTING
 | ||||
|   #define PERF_BEGIN | ||||
|   #define PERF_END(PERF_NAME, PERF_W, PERF_H) | ||||
| #define PERF_BEGIN | ||||
| #define PERF_END(PERF_NAME, PERF_W, PERF_H) | ||||
| #endif // GPU_DWT_TESTING
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
| /// Divide and round up.
 | ||||
| template <typename T> | ||||
| __device__ __host__ inline T divRndUp(const T &n, const T &d) { | ||||
|   return (n / d) + ((n % d) ? 1 : 0); | ||||
| } | ||||
| 
 | ||||
|   /// Divide and round up.
 | ||||
|   template <typename T> | ||||
|   __device__ __host__ inline T divRndUp(const T & n, const T & d) { | ||||
|     return (n / d) + ((n % d) ? 1 : 0); | ||||
| // 9/7 forward DWT lifting schema coefficients
 | ||||
| const float f97Predict1 = -1.586134342;  ///< forward 9/7 predict 1
 | ||||
| const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
 | ||||
| const float f97Predict2 = 0.8829110762;  ///< forward 9/7 predict 2
 | ||||
| const float f97Update2 = 0.4435068522;   ///< forward 9/7 update 2
 | ||||
| 
 | ||||
| // 9/7 reverse DWT lifting schema coefficients
 | ||||
| const float r97update2 = -f97Update2;   ///< undo 9/7 update 2
 | ||||
| const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
 | ||||
| const float r97update1 = -f97Update1;   ///< undo 9/7 update 1
 | ||||
| const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
 | ||||
| 
 | ||||
| // FDWT 9/7 scaling coefficients
 | ||||
| const float scale97Mul = 1.23017410491400f; | ||||
| const float scale97Div = 1.0 / scale97Mul; | ||||
| 
 | ||||
| // 5/3 forward DWT lifting schema coefficients
 | ||||
| const float forward53Predict = -0.5f; /// forward 5/3 predict
 | ||||
| const float forward53Update = 0.25f;  /// forward 5/3 update
 | ||||
| 
 | ||||
| // 5/3 forward DWT lifting schema coefficients
 | ||||
| const float reverse53Update = -forward53Update;   /// undo 5/3 update
 | ||||
| const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
 | ||||
| 
 | ||||
| /// Functor which adds scaled sum of neighbors to given central pixel.
 | ||||
| struct AddScaledSum { | ||||
|   const float scale; // scale of neighbors
 | ||||
|   __device__ AddScaledSum(const float scale) : scale(scale) {} | ||||
|   __device__ void operator()(const float p, float &c, const float n) const { | ||||
| 
 | ||||
|     // if(threadIdx.x == 0) {
 | ||||
| 
 | ||||
|     //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
 | ||||
|     //   scale * (p + n) );
 | ||||
| 
 | ||||
|     // }
 | ||||
| 
 | ||||
|     c += scale * (p + n); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Returns index ranging from 0 to num threads, such that first half
 | ||||
| /// of threads get even indices and others get odd indices. Each thread
 | ||||
| /// gets different index.
 | ||||
| /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
 | ||||
| ///                              parityIdx:   0  2  4  6  1  3  5  7
 | ||||
| /// @tparam THREADS  total count of participating threads
 | ||||
| /// @return parity-separated index of thread
 | ||||
| template <int THREADS> __device__ inline int parityIdx() { | ||||
|   return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); | ||||
| } | ||||
| 
 | ||||
| /// size of shared memory
 | ||||
| #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) | ||||
| const int SHM_SIZE = 48 * 1024; | ||||
| #else | ||||
| const int SHM_SIZE = 16 * 1024; | ||||
| #endif | ||||
| 
 | ||||
| /// Performance and return code tester.
 | ||||
 | ||||
| class CudaDWTTester { | ||||
| private: | ||||
|   static bool testRunning;  ///< true if any test is currently running
 | ||||
|   cudaEvent_t beginEvent;   ///< begin CUDA event
 | ||||
|   cudaEvent_t endEvent;     ///< end CUDA event
 | ||||
|   std::vector<float> times; ///< collected times
 | ||||
|   const bool disabled;      ///< true if this object is disabled
 | ||||
| public: | ||||
|   /// Checks CUDA related error.
 | ||||
|   /// @param status   return code to be checked
 | ||||
|   /// @param message  message to be shown if there was an error
 | ||||
|   /// @return true if there was no error, false otherwise
 | ||||
|   static bool check(const cudaError_t &status, const char *message) { | ||||
| #if defined(GPU_DWT_TESTING) | ||||
|     if ((!testRunning) && status != cudaSuccess) { | ||||
|       const char *errorString = cudaGetErrorString(status); | ||||
|       fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString); | ||||
|       fflush(stderr); | ||||
|       return false; | ||||
|     } | ||||
| #endif // GPU_DWT_TESTING
 | ||||
|     return true; | ||||
|   } | ||||
| 
 | ||||
|    | ||||
|   // 9/7 forward DWT lifting schema coefficients
 | ||||
|   const float f97Predict1 = -1.586134342;   ///< forward 9/7 predict 1
 | ||||
|   const float f97Update1 = -0.05298011854;  ///< forward 9/7 update 1
 | ||||
|   const float f97Predict2 = 0.8829110762;   ///< forward 9/7 predict 2
 | ||||
|   const float f97Update2 = 0.4435068522;    ///< forward 9/7 update 2
 | ||||
| 
 | ||||
| 
 | ||||
|   // 9/7 reverse DWT lifting schema coefficients
 | ||||
|   const float r97update2 = -f97Update2;    ///< undo 9/7 update 2
 | ||||
|   const float r97predict2 = -f97Predict2;  ///< undo 9/7 predict 2
 | ||||
|   const float r97update1 = -f97Update1;    ///< undo 9/7 update 1
 | ||||
|   const float r97Predict1 = -f97Predict1;  ///< undo 9/7 predict 1
 | ||||
|    | ||||
|   // FDWT 9/7 scaling coefficients
 | ||||
|   const float scale97Mul = 1.23017410491400f; | ||||
|   const float scale97Div = 1.0 / scale97Mul; | ||||
|    | ||||
|    | ||||
|   // 5/3 forward DWT lifting schema coefficients
 | ||||
|   const float forward53Predict = -0.5f;   /// forward 5/3 predict
 | ||||
|   const float forward53Update = 0.25f;    /// forward 5/3 update
 | ||||
|    | ||||
|   // 5/3 forward DWT lifting schema coefficients
 | ||||
|   const float reverse53Update = -forward53Update;    /// undo 5/3 update
 | ||||
|   const float reverse53Predict = -forward53Predict;  /// undo 5/3 predict
 | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Functor which adds scaled sum of neighbors to given central pixel.
 | ||||
|   struct AddScaledSum { | ||||
|     const float scale;  // scale of neighbors
 | ||||
|     __device__ AddScaledSum(const float scale) : scale(scale) {} | ||||
|     __device__ void operator()(const float p, float & c, const float n) const { | ||||
| 
 | ||||
|       // if(threadIdx.x == 0) {
 | ||||
| 
 | ||||
|       //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
 | ||||
|        | ||||
|       // }
 | ||||
|        | ||||
|       c += scale * (p + n); | ||||
|     } | ||||
|   }; | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Returns index ranging from 0 to num threads, such that first half
 | ||||
|   /// of threads get even indices and others get odd indices. Each thread
 | ||||
|   /// gets different index.
 | ||||
|   /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
 | ||||
|   ///                              parityIdx:   0  2  4  6  1  3  5  7
 | ||||
|   /// @tparam THREADS  total count of participating threads
 | ||||
|   /// @return parity-separated index of thread
 | ||||
|   template <int THREADS> | ||||
|   __device__ inline int parityIdx() { | ||||
|     return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); | ||||
|   /// Checks last kernel call for errors.
 | ||||
|   /// @param message  description of the kernel call
 | ||||
|   /// @return true if there was no error, false otherwise
 | ||||
|   static bool checkLastKernelCall(const char *message) { | ||||
| #if defined(GPU_DWT_TESTING) | ||||
|     return testRunning ? true : check(cudaThreadSynchronize(), message); | ||||
| #else  // GPU_DWT_TESTING
 | ||||
|     return true; | ||||
| #endif // GPU_DWT_TESTING
 | ||||
|   } | ||||
| 
 | ||||
|   /// Initializes DWT tester for time measurement
 | ||||
|   CudaDWTTester() : disabled(testRunning) {} | ||||
| 
 | ||||
|   /// Gets preferred number of iterations
 | ||||
 | ||||
|   int getNumIterations() { return disabled ? 1 : 31; } | ||||
| 
 | ||||
|   /// size of shared memory
 | ||||
|   #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) | ||||
|   const int SHM_SIZE = 48 * 1024; | ||||
|   #else | ||||
|   const int SHM_SIZE = 16 * 1024; | ||||
|   #endif | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Perrformance and return code tester.
 | ||||
|   class CudaDWTTester { | ||||
|   private: | ||||
|     static bool testRunning;    ///< true if any test is currently running
 | ||||
|     cudaEvent_t beginEvent;     ///< begin CUDA event
 | ||||
|     cudaEvent_t endEvent;       ///< end CUDA event
 | ||||
|     std::vector<float> times;   ///< collected times
 | ||||
|     const bool disabled;        ///< true if this object is disabled
 | ||||
|   public: | ||||
|     /// Checks CUDA related error.
 | ||||
|     /// @param status   return code to be checked
 | ||||
|     /// @param message  message to be shown if there was an error
 | ||||
|     /// @return true if there was no error, false otherwise
 | ||||
|     static bool check(const cudaError_t & status, const char * message) { | ||||
|       #if defined(GPU_DWT_TESTING) | ||||
|       if((!testRunning) && status != cudaSuccess) { | ||||
|         const char * errorString = cudaGetErrorString(status); | ||||
|         fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString); | ||||
|         fflush(stderr); | ||||
|         return false; | ||||
|       } | ||||
|       #endif // GPU_DWT_TESTING
 | ||||
|       return true; | ||||
|   /// Starts one test iteration.
 | ||||
|   void beginTestIteration() { | ||||
|     if (!disabled) { | ||||
|       cudaEventCreate(&beginEvent); | ||||
|       cudaEventCreate(&endEvent); | ||||
|       cudaEventRecord(beginEvent, 0); | ||||
|       testRunning = true; | ||||
|     } | ||||
| 
 | ||||
|     /// Checks last kernel call for errors.
 | ||||
|     /// @param message  description of the kernel call
 | ||||
|     /// @return true if there was no error, false otherwise
 | ||||
|     static bool checkLastKernelCall(const char * message) { | ||||
|       #if defined(GPU_DWT_TESTING) | ||||
|       return testRunning ? true : check(cudaThreadSynchronize(), message); | ||||
|       #else // GPU_DWT_TESTING
 | ||||
|       return true; | ||||
|       #endif // GPU_DWT_TESTING
 | ||||
|     } | ||||
|      | ||||
|     /// Initializes DWT tester for time measurement
 | ||||
|     CudaDWTTester() : disabled(testRunning) {} | ||||
|      | ||||
|     /// Gets rpefered number of iterations
 | ||||
|     int getNumIterations() { | ||||
|       return disabled ? 1 : 31; | ||||
|     } | ||||
|      | ||||
|     /// Starts one test iteration.
 | ||||
|     void beginTestIteration() { | ||||
|       if(!disabled) { | ||||
|         cudaEventCreate(&beginEvent); | ||||
|         cudaEventCreate(&endEvent); | ||||
|         cudaEventRecord(beginEvent, 0); | ||||
|         testRunning = true; | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     /// Ends on etest iteration.
 | ||||
|     void endTestIteration() { | ||||
|       if(!disabled) { | ||||
|         float time; | ||||
|         testRunning = false; | ||||
|         cudaEventRecord(endEvent, 0); | ||||
|         cudaEventSynchronize(endEvent); | ||||
|         cudaEventElapsedTime(&time, beginEvent, endEvent); | ||||
|         cudaEventDestroy(beginEvent); | ||||
|         cudaEventDestroy(endEvent); | ||||
|         times.push_back(time); | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     /// Shows brief info about all iterations.
 | ||||
|     /// @param name   name of processing method
 | ||||
|     /// @param sizeX  width of processed image
 | ||||
|     /// @param sizeY  height of processed image
 | ||||
|     void showPerformance(const char * name, const int sizeX, const int sizeY) { | ||||
|       if(!disabled) { | ||||
|         // compute mean and median
 | ||||
|         std::sort(times.begin(), times.end()); | ||||
|         double sum = 0; | ||||
|         for(int i = times.size(); i--; ) { | ||||
|           sum += times[i]; | ||||
|         } | ||||
|         const double median = (times[times.size() / 2] | ||||
|                              + times[(times.size() - 1) / 2]) * 0.5f; | ||||
|         printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  " | ||||
|                "(%d x %d)\n", name, (sum / times.size()), median,  | ||||
|                times[times.size() - 1], sizeX, sizeY); | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Simple cudaMemcpy wrapped in performance tester.
 | ||||
|   /// @param dest  destination bufer
 | ||||
|   /// @param src   source buffer
 | ||||
|   /// @param sx    width of copied image
 | ||||
|   /// @param sy    height of copied image
 | ||||
|   template <typename T> | ||||
|   inline void memCopy(T * const dest, const T * const src, | ||||
|                       const size_t sx, const size_t sy) { | ||||
|     cudaError_t status; | ||||
|     PERF_BEGIN | ||||
|     status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice); | ||||
|     PERF_END("        memcpy", sx, sy) | ||||
|     CudaDWTTester::check(status, "memcpy device > device"); | ||||
|   } | ||||
| 
 | ||||
|   /// Ends one test iteration.
 | ||||
 | ||||
|   void endTestIteration() { | ||||
|     if (!disabled) { | ||||
|       float time; | ||||
|       testRunning = false; | ||||
|       cudaEventRecord(endEvent, 0); | ||||
|       cudaEventSynchronize(endEvent); | ||||
|       cudaEventElapsedTime(&time, beginEvent, endEvent); | ||||
|       cudaEventDestroy(beginEvent); | ||||
|       cudaEventDestroy(endEvent); | ||||
|       times.push_back(time); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Shows brief info about all iterations.
 | ||||
|   /// @param name   name of processing method
 | ||||
|   /// @param sizeX  width of processed image
 | ||||
|   /// @param sizeY  height of processed image
 | ||||
|   void showPerformance(const char *name, const int sizeX, const int sizeY) { | ||||
|     if (!disabled) { | ||||
|       // compute mean and median
 | ||||
|       std::sort(times.begin(), times.end()); | ||||
|       double sum = 0; | ||||
|       for (int i = times.size(); i--;) { | ||||
|         sum += times[i]; | ||||
|       } | ||||
|       const double median = | ||||
|           (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f; | ||||
|       printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  " | ||||
|              "(%d x %d)\n", | ||||
|              name, (sum / times.size()), median, times[times.size() - 1], sizeX, | ||||
|              sizeY); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Simple cudaMemcpy wrapped in performance tester.
 | ||||
| /// @param dest  destination buffer
 | ||||
 | ||||
| /// @param src   source buffer
 | ||||
| /// @param sx    width of copied image
 | ||||
| /// @param sy    height of copied image
 | ||||
| template <typename T> | ||||
| inline void memCopy(T *const dest, const T *const src, const size_t sx, | ||||
|                     const size_t sy) { | ||||
|   cudaError_t status; | ||||
|   PERF_BEGIN | ||||
|   status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice); | ||||
|   PERF_END("        memcpy", sx, sy) | ||||
|   CudaDWTTester::check(status, "memcpy device > device"); | ||||
| } | ||||
| 
 | ||||
| } // end of namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// DWT_COMMON_CUDA_H
 | ||||
| 
 | ||||
| #endif // DWT_COMMON_CUDA_H
 | ||||
|  |  | |||
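The `divRndUp` helper and the `parityIdx<THREADS>` mapping in the header above are pure integer formulas, so their behaviour can be checked on the host. The sketch below re-implements the same expressions outside of `__device__` context (the host copies are mine, not part of the commit) and reproduces the 8-thread example from the comment: threadIdx.x 0..7 maps to 0 2 4 6 1 3 5 7.

```cuda
#include <cstdio>

// Host copies of the two formulas from dwt_cuda's common.h (for checking only).
static int divRndUp(int n, int d) { return (n / d) + ((n % d) ? 1 : 0); }

template <int THREADS> static int parityIdx(int tid) {
  return (tid * 2) - (THREADS - 1) * (tid / (THREADS / 2));
}

int main() {
  // divRndUp: 7 items in groups of 2 -> 4 groups.
  printf("divRndUp(7, 2) = %d\n", divRndUp(7, 2));

  // parityIdx for 8 threads: first half gets even indices, second half odd.
  for (int tid = 0; tid < 8; ++tid)
    printf("tid %d -> %d\n", tid, parityIdx<8>(tid)); // prints 0 2 4 6 1 3 5 7
  return 0;
}
```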
|  | @ -56,57 +56,48 @@ | |||
| ///
 | ||||
| 
 | ||||
| #ifndef DWT_CUDA_H | ||||
| #define	DWT_CUDA_H | ||||
| 
 | ||||
| #define DWT_CUDA_H | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
| /// Forward 5/3 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Expected to be normalized into range [-128, 127].
 | ||||
| ///                Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
|   /// Forward 5/3 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Expected to be normalized into range [-128, 127].
 | ||||
|   ///                Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels); | ||||
| /// Reverse 5/3 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
| ///                Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU - will contain original image
 | ||||
| ///                in normalized range [-128, 127].
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
| /// Forward 9/7 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Input DWT coefficients. Should be normalized (in range
 | ||||
| ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU - format specified in common rules
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
|   /// Reverse 5/3 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
|   ///                Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU - will contain original image
 | ||||
|   ///                in normalized range [-128, 127].
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels); | ||||
|    | ||||
|    | ||||
|   /// Forward 9/7 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Input DWT coefficients. Should be normalized (in range 
 | ||||
|   ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU - format specified in common rules
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels); | ||||
|    | ||||
|    | ||||
|   /// Reverse 9/7 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
|   ///                Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU - will contain original image
 | ||||
|   ///                in normalized range [-0.5, 0.5].
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels); | ||||
|    | ||||
| /// Reverse 9/7 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
| ///                Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU - will contain original image
 | ||||
| ///                in normalized range [-0.5, 0.5].
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
| } // namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// DWT_CUDA_H
 | ||||
| 
 | ||||
| #endif // DWT_CUDA_H
 | ||||
|  |  | |||
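The four entry points declared in this header share one calling convention: both buffers are expected to live on the GPU, the input is overwritten, and `levels` controls how many recursive decomposition levels are applied. A rough sketch of driving `fdwt97` is shown below; the header filename, the device-resident buffers, and the host-side normalization step are my assumptions based on the comments above, not code from the commit.

```cuda
#include <cuda_runtime.h>
#include "dwt_cuda.h" // assumed filename; declares dwt_cuda::fdwt97 (hunk above)

// Sketch: forward 9/7 DWT of a sizeX x sizeY float image that the host has
// already normalized into [-0.5, 0.5].
void runForward97(const float *h_normalized, float *h_coeffs,
                  int sizeX, int sizeY, int levels) {
  const size_t bytes = size_t(sizeX) * sizeY * sizeof(float);
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, bytes);
  cudaMalloc(&d_out, bytes);

  cudaMemcpy(d_in, h_normalized, bytes, cudaMemcpyHostToDevice);
  dwt_cuda::fdwt97(d_in, d_out, sizeX, sizeY, levels); // d_in is clobbered
  cudaMemcpy(h_coeffs, d_out, bytes, cudaMemcpyDeviceToHost);

  cudaFree(d_in);
  cudaFree(d_out);
}
```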
|  | @ -30,454 +30,411 @@ | |||
| /// POSSIBILITY OF SUCH DAMAGE.
 | ||||
| ///
 | ||||
| 
 | ||||
| 
 | ||||
| #ifndef IO_H | ||||
| #define	IO_H | ||||
| 
 | ||||
| #define IO_H | ||||
| 
 | ||||
| #include "common.h" | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
|    | ||||
|   /// Base for all IO classes - manages mirroring.
 | ||||
|   class DWTIO { | ||||
|   protected: | ||||
|     /// Handles mirroring of image at edges in a DWT correct way.
 | ||||
|     /// @param d      a position in the image (will be replaced by mirrored d)
 | ||||
|     /// @param sizeD  size of the image along the dimension of 'd'
 | ||||
|     __device__ static void mirror(int & d, const int & sizeD) { | ||||
|       // TODO: enable multiple mirroring:
 | ||||
| //      if(sizeD > 1) {
 | ||||
| //        if(d < 0) {
 | ||||
| //          const int underflow = -1 - d;
 | ||||
| //          const int phase = (underflow / (sizeD - 1)) & 1;
 | ||||
| //          const int remainder = underflow % (sizeD - 1);
 | ||||
| //          if(phase == 0) {
 | ||||
| //            d = remainder + 1;
 | ||||
| //          } else {
 | ||||
| //            d = sizeD - 2 - remainder;
 | ||||
| //          }
 | ||||
| //        } else if(d >= sizeD) {
 | ||||
| //          const int overflow = d - sizeD;
 | ||||
| //          const int phase = (overflow / (sizeD - 1)) & 1;
 | ||||
| //          const int remainder = overflow % (sizeD - 1);
 | ||||
| //          if(phase == 0) {
 | ||||
| //            d = sizeD - 2 - remainder;
 | ||||
| //          } else {
 | ||||
| //            d = remainder + 1;
 | ||||
| //          }
 | ||||
| //        }
 | ||||
| //      } else {
 | ||||
| //        d = 0;
 | ||||
| //      }
 | ||||
|   //for test the mirror's use Feb 17
 | ||||
|       if(d >= sizeD) { | ||||
|         d = 2 * sizeD - 2 - d; | ||||
|       } else if(d < 0) { | ||||
|         d = -d; | ||||
|       } | ||||
| /// Base for all IO classes - manages mirroring.
 | ||||
| class DWTIO { | ||||
| protected: | ||||
|   /// Handles mirroring of image at edges in a DWT correct way.
 | ||||
|   /// @param d      a position in the image (will be replaced by mirrored d)
 | ||||
|   /// @param sizeD  size of the image along the dimension of 'd'
 | ||||
|   __device__ static void mirror(int &d, const int &sizeD) { | ||||
|     // TODO: enable multiple mirroring:
 | ||||
|     //      if(sizeD > 1) {
 | ||||
|     //        if(d < 0) {
 | ||||
|     //          const int underflow = -1 - d;
 | ||||
|     //          const int phase = (underflow / (sizeD - 1)) & 1;
 | ||||
|     //          const int remainder = underflow % (sizeD - 1);
 | ||||
|     //          if(phase == 0) {
 | ||||
|     //            d = remainder + 1;
 | ||||
|     //          } else {
 | ||||
|     //            d = sizeD - 2 - remainder;
 | ||||
|     //          }
 | ||||
|     //        } else if(d >= sizeD) {
 | ||||
|     //          const int overflow = d - sizeD;
 | ||||
|     //          const int phase = (overflow / (sizeD - 1)) & 1;
 | ||||
|     //          const int remainder = overflow % (sizeD - 1);
 | ||||
|     //          if(phase == 0) {
 | ||||
|     //            d = sizeD - 2 - remainder;
 | ||||
|     //          } else {
 | ||||
|     //            d = remainder + 1;
 | ||||
|     //          }
 | ||||
|     //        }
 | ||||
|     //      } else {
 | ||||
|     //        d = 0;
 | ||||
|     //      }
 | ||||
|     // for test the mirror's use Feb 17
 | ||||
|     if (d >= sizeD) { | ||||
|       d = 2 * sizeD - 2 - d; | ||||
|     } else if (d < 0) { | ||||
|       d = -d; | ||||
|     } | ||||
|   }; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Base class for pixel loader and writer - manages computing start index,
 | ||||
| /// stride and end of image for loading column of pixels.
 | ||||
| /// @tparam T        type of image pixels
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO { | ||||
| protected: | ||||
|   int end;    ///< index of bottom neighbor of last pixel of column
 | ||||
 | ||||
|   int stride; ///< increment of pointer to get to next pixel
 | ||||
| 
 | ||||
|   /// Base class for pixel loader and writer - manages computing start index,
 | ||||
|   /// stride and end of image for loading column of pixels.
 | ||||
|   /// @tparam T        type of image pixels
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTPixelIO : protected DWTIO { | ||||
|   protected: | ||||
|     int end;         ///< index of bottom neightbor of last pixel of column
 | ||||
|     int stride;      ///< increment of pointer to get to next pixel
 | ||||
|   /// Initializes pixel IO - sets end index and a position of first pixel.
 | ||||
|   /// @param sizeX   width of the image
 | ||||
|   /// @param sizeY   height of the image
 | ||||
|   /// @param firstX  x-coordinate of first pixel to use
 | ||||
|   /// @param firstY  y-coordinate of first pixel to use
 | ||||
|   /// @return index of pixel at position [x, y] in the image
 | ||||
|   __device__ int initialize(const int sizeX, const int sizeY, int firstX, | ||||
|                             int firstY) { | ||||
|     // initialize all pointers and stride
 | ||||
|     end = CHECKED ? (sizeY * sizeX + firstX) : 0; | ||||
|     stride = sizeX; | ||||
|     return firstX + sizeX * firstY; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|     /// Initializes pixel IO - sets end index and a position of first pixel.
 | ||||
|     /// @param sizeX   width of the image
 | ||||
|     /// @param sizeY   height of the image
 | ||||
|     /// @param firstX  x-coordinate of first pixel to use
 | ||||
|     /// @param firstY  y-coordinate of first pixel to use
 | ||||
|     /// @return index of pixel at position [x, y] in the image
 | ||||
|     __device__ int initialize(const int sizeX, const int sizeY, | ||||
|                               int firstX, int firstY) { | ||||
|       // initialize all pointers and stride
 | ||||
|       end = CHECKED ? (sizeY * sizeX + firstX) : 0; | ||||
|       stride = sizeX; | ||||
|       return firstX + sizeX * firstY; | ||||
|     } | ||||
|   }; | ||||
| /// Writes reverse transformed pixels directly into output image.
 | ||||
| /// @tparam T        type of output pixels
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> { | ||||
| private: | ||||
|   int next; // index of the next pixel to be loaded
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|   /// Writes reverse transformed pixels directly into output image.
 | ||||
|   /// @tparam T        type of output pixels
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> { | ||||
|   private: | ||||
|     int next;   // index of the next pixel to be loaded
 | ||||
| 
 | ||||
|   public: | ||||
|     /// Initializes writer - sets output buffer and a position of first pixel.
 | ||||
|     /// @param sizeX   width of the image
 | ||||
|     /// @param sizeY   height of the image
 | ||||
|     /// @param firstX  x-coordinate of first pixel to write into
 | ||||
|     /// @param firstY  y-coordinate of first pixel to write into
 | ||||
|     __device__ void init(const int sizeX, const int sizeY,  | ||||
|                          int firstX, int firstY) { | ||||
|       if(firstX < sizeX) { | ||||
|         next = this->initialize(sizeX, sizeY, firstX, firstY); | ||||
|       } else { | ||||
|         this->end = 0; | ||||
|         this->stride = 0; | ||||
|         next = 0; | ||||
|       } | ||||
|     } | ||||
| 
 | ||||
|     /// Writes given value at next position and advances internal pointer while
 | ||||
|     /// correctly handling mirroring.
 | ||||
|     /// @param output  output image to write pixel into
 | ||||
|     /// @param value   value of the pixel to be written
 | ||||
|     __device__ void writeInto(T * const output, const T & value) { | ||||
|       if((!CHECKED) || (next != this->end)) { | ||||
|         output[next] = value; | ||||
|         next += this->stride; | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
| 
 | ||||
| 
 | ||||
|    | ||||
|   /// Loads pixels from input image.
 | ||||
|   /// @tparam T        type of image input pixels
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTPixelLoader | ||||
|           : protected VerticalDWTPixelIO<const T, CHECKED> { | ||||
|   private: | ||||
|     int last;  ///< index of last loaded pixel
 | ||||
|   public: | ||||
|    | ||||
| 
 | ||||
|   //******************* FOR TEST **********************
 | ||||
|   __device__ int getlast(){ | ||||
| 		return last; | ||||
| 	} | ||||
|   __device__ int getend(){ | ||||
| 		return this->end; | ||||
| 	} | ||||
|   __device__ int getstride(){ | ||||
| 		return this->stride; | ||||
| 	} | ||||
|   __device__ void setend(int a){ | ||||
|       this->end=a; | ||||
| 	} | ||||
| 	//******************* FOR TEST **********************
 | ||||
|    | ||||
|    | ||||
|    | ||||
|     /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|     /// @param sizeX   width of the image
 | ||||
|     /// @param sizeY   height of the image
 | ||||
|     /// @param firstX  x-coordinate of first pixel to load
 | ||||
|     /// @param firstY  y-coordinate of first pixel to load
 | ||||
|     __device__ void init(const int sizeX, const int sizeY, | ||||
|                          int firstX, int firstY) { | ||||
|       // correctly mirror x coordinate
 | ||||
|       this->mirror(firstX, sizeX); | ||||
|        | ||||
|       // 'last' always points to already loaded pixel (subtract sizeX = stride)
 | ||||
|       last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX; | ||||
|       //last = (FirstX + sizeX * FirstY) - sizeX
 | ||||
|     } | ||||
|      | ||||
|     /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|     /// uninitialized stuff.
 | ||||
|     __device__ void clear() { | ||||
| public: | ||||
|   /// Initializes writer - sets output buffer and a position of first pixel.
 | ||||
|   /// @param sizeX   width of the image
 | ||||
|   /// @param sizeY   height of the image
 | ||||
|   /// @param firstX  x-coordinate of first pixel to write into
 | ||||
|   /// @param firstY  y-coordinate of first pixel to write into
 | ||||
|   __device__ void init(const int sizeX, const int sizeY, int firstX, | ||||
|                        int firstY) { | ||||
|     if (firstX < sizeX) { | ||||
|       next = this->initialize(sizeX, sizeY, firstX, firstY); | ||||
|     } else { | ||||
|       this->end = 0; | ||||
|       this->stride = 0; | ||||
|       this->last = 0; | ||||
|       next = 0; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Writes given value at next position and advances internal pointer while
 | ||||
|   /// correctly handling mirroring.
 | ||||
|   /// @param output  output image to write pixel into
 | ||||
|   /// @param value   value of the pixel to be written
 | ||||
|   __device__ void writeInto(T *const output, const T &value) { | ||||
|     if ((!CHECKED) || (next != this->end)) { | ||||
|       output[next] = value; | ||||
|       next += this->stride; | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Loads pixels from input image.
 | ||||
| /// @tparam T        type of image input pixels
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> { | ||||
| private: | ||||
|   int last; ///< index of last loaded pixel
 | ||||
| public: | ||||
|   //******************* FOR TEST **********************
 | ||||
|   __device__ int getlast() { return last; } | ||||
|   __device__ int getend() { return this->end; } | ||||
|   __device__ int getstride() { return this->stride; } | ||||
|   __device__ void setend(int a) { this->end = a; } | ||||
|   //******************* FOR TEST **********************
 | ||||
| 
 | ||||
|   /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|   /// @param sizeX   width of the image
 | ||||
|   /// @param sizeY   height of the image
 | ||||
|   /// @param firstX  x-coordinate of first pixel to load
 | ||||
|   /// @param firstY  y-coordinate of first pixel to load
 | ||||
|   __device__ void init(const int sizeX, const int sizeY, int firstX, | ||||
|                        int firstY) { | ||||
|     // correctly mirror x coordinate
 | ||||
|     this->mirror(firstX, sizeX); | ||||
| 
 | ||||
|     // 'last' always points to already loaded pixel (subtract sizeX = stride)
 | ||||
|     last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX; | ||||
|     // last = (FirstX + sizeX * FirstY) - sizeX
 | ||||
|   } | ||||
| 
 | ||||
|   /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|   /// uninitialized stuff.
 | ||||
|   __device__ void clear() { | ||||
|     this->end = 0; | ||||
|     this->stride = 0; | ||||
|     this->last = 0; | ||||
|   } | ||||
| 
 | ||||
|   /// Gets another pixel and advances internal pointer to following one.
 | ||||
 | ||||
|   /// @param input  input image to load next pixel from
 | ||||
|   /// @return next pixel from given image
 | ||||
|   __device__ T loadFrom(const T *const input) { | ||||
|     last += this->stride; | ||||
|     if (CHECKED && (last == this->end)) { | ||||
|       last -= 2 * this->stride; | ||||
|       this->stride = -this->stride; // reverse loader's direction
 | ||||
|     } | ||||
|     // avoid reading from negative indices if loader is checked
 | ||||
|     // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
 | ||||
|     // checked variant later
 | ||||
|     if (last < 0) { | ||||
|       return 0; | ||||
|     } | ||||
| 
 | ||||
|     /// Gets another pixel and advancees internal pointer to following one.
 | ||||
|     /// @param input  input image to load next pixel from
 | ||||
|     /// @return next pixel from given image
 | ||||
|     __device__ T loadFrom(const T * const input) { | ||||
|       last += this->stride; | ||||
|       if(CHECKED && (last == this->end)) { | ||||
|         last -= 2 * this->stride; | ||||
|         this->stride = -this->stride; // reverse loader's direction
 | ||||
|       } | ||||
|       // avoid reading from negative indices if loader is checked
 | ||||
|       // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
 | ||||
|       if(last < 0 ) { | ||||
|         return 0; | ||||
|       } | ||||
|     return input[last]; | ||||
|     // return this->end;
 | ||||
|     // return last;
 | ||||
|     // return this->stride;
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|       return input[last]; | ||||
|       // return this->end;
 | ||||
|       // return last;
 | ||||
|       // return this->stride;
 | ||||
|     } | ||||
|   }; | ||||
| /// Base for band write and loader. Manages computing strides and pointers
 | ||||
| /// to first and last pixels in a linearly-stored-bands correct way.
 | ||||
| /// @tparam T        type of band coefficients
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO { | ||||
| protected: | ||||
|   /// index of bottom neighbor of last pixel of loaded column
 | ||||
|   int end; | ||||
| 
 | ||||
|   /// increment of index to get from highpass band to the lowpass one
 | ||||
|   int strideHighToLow; | ||||
| 
 | ||||
|   /// increment of index to get from the lowpass band to the highpass one
 | ||||
|   int strideLowToHigh; | ||||
| 
 | ||||
|   /// Base for band write and loader. Manages computing strides and pointers
 | ||||
|   /// to first and last pixels in a linearly-stored-bands correct way.
 | ||||
|   /// @tparam T        type of band coefficients
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTBandIO : protected DWTIO { | ||||
|   protected: | ||||
|     /// index of bottom neighbor of last pixel of loaded column
 | ||||
|     int end; | ||||
|   /// Initializes IO - sets size of image and a position of first pixel.
 | ||||
|   /// @param imageSizeX   width of the image
 | ||||
|   /// @param imageSizeY   height of the image
 | ||||
|   /// @param firstX       x-coordinate of first pixel to use
 | ||||
|   ///                     (Parity determines vertically low or high band.)
 | ||||
|   /// @param firstY       y-coordinate of first pixel to use
 | ||||
|   ///                     (Parity determines horizontally low or high band.)
 | ||||
|   /// @return index of first item specified by firstX and firstY
 | ||||
|   __device__ int initialize(const int imageSizeX, const int imageSizeY, | ||||
|                             int firstX, int firstY) { | ||||
|     // index of first pixel (topmost one) of the column with index firstX
 | ||||
|     int columnOffset = firstX / 2; | ||||
| 
 | ||||
|     /// increment of index to get from highpass band to the lowpass one
 | ||||
|     int strideHighToLow; | ||||
|     // difference between indices of two vertically neighboring pixels
 | ||||
|     // in the same band
 | ||||
|     int verticalStride; | ||||
| 
 | ||||
|     /// increment of index to get from the lowpass band to the highpass one
 | ||||
|     int strideLowToHigh; | ||||
| 
 | ||||
|     /// Initializes IO - sets size of image and a position of first pixel.
 | ||||
|     /// @param imageSizeX   width of the image
 | ||||
|     /// @param imageSizeY   height of the image
 | ||||
|     /// @param firstX       x-coordinate of first pixel to use
 | ||||
|     ///                     (Parity determines vertically low or high band.)
 | ||||
|     /// @param firstY       y-coordinate of first pixel to use
 | ||||
|     ///                     (Parity determines horizontally low or high band.)
 | ||||
|     /// @return index of first item specified by firstX and firstY
 | ||||
|     __device__ int initialize(const int imageSizeX, const int imageSizeY, | ||||
|                               int firstX, int firstY) { | ||||
|       // index of first pixel (topmost one) of the column with index firstX
 | ||||
|       int columnOffset = firstX / 2; | ||||
|        | ||||
|       // difference between indices of two vertically neighboring pixels
 | ||||
|       // in the same band
 | ||||
|       int verticalStride; | ||||
|        | ||||
|       // resolve index of first pixel according to horizontal parity
 | ||||
|       if(firstX & 1) { | ||||
|         // first pixel in one of right bands
 | ||||
|         verticalStride = imageSizeX / 2; | ||||
|         columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2); | ||||
|         strideLowToHigh = (imageSizeX * imageSizeY) / 2; | ||||
|       } else { | ||||
|         // first pixel in one of left bands
 | ||||
|         verticalStride = imageSizeX / 2 + (imageSizeX & 1); | ||||
|         strideLowToHigh = divRndUp(imageSizeY, 2)  * imageSizeX; | ||||
|       } | ||||
|        | ||||
|       // set the other stride
 | ||||
|       strideHighToLow = verticalStride - strideLowToHigh; | ||||
| 
 | ||||
|       // compute index of coefficient which indicates end of image
 | ||||
|       if(CHECKED) { | ||||
|         end = columnOffset                            // right column
 | ||||
|                 + (imageSizeY / 2) * verticalStride   // right row
 | ||||
|                 + (imageSizeY & 1) * strideLowToHigh; // possibly in high band
 | ||||
|       } else { | ||||
|         end = 0; | ||||
|       } | ||||
| 
 | ||||
| 
 | ||||
| 	//***********for test**************
 | ||||
| 	//	end = CHECKED;
 | ||||
| 	//***********for test**************
 | ||||
| 	 | ||||
| 	 | ||||
|       // finally, return index of the first item
 | ||||
|       return columnOffset                        // right column
 | ||||
|               + (firstY / 2) * verticalStride    // right row
 | ||||
|               + (firstY & 1) * strideLowToHigh;  // possibly in high band
 | ||||
|     } | ||||
|   }; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|   /// Directly loads coefficients from four consecutively stored transformed
 | ||||
|   /// bands.
 | ||||
|   /// @tparam T        type of input band coefficients
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> { | ||||
|   private: | ||||
|     int last;  ///< index of last loaded pixel
 | ||||
| 
 | ||||
|     /// Checks internal index and possibly reverses direction of loader.
 | ||||
|     /// (Handles mirroring at the bottom of the image.)
 | ||||
|     /// @param input   input image to load next coefficient from
 | ||||
|     /// @param stride  stride to use now (one of two loader's strides)
 | ||||
|     /// @return loaded coefficient
 | ||||
|     __device__ T updateAndLoad(const T * const input, const int & stride) { | ||||
|       last += stride; | ||||
|       if(CHECKED && (last == this->end)) { | ||||
|         // undo last two updates of index (to get to previous mirrored item)
 | ||||
|         last -= (this->strideLowToHigh + this->strideHighToLow); | ||||
| 
 | ||||
|         // swap and reverse strides (to move up in the loaded column now)
 | ||||
|         const int temp = this->strideLowToHigh; | ||||
|         this->strideLowToHigh = -this->strideHighToLow; | ||||
|         this->strideHighToLow = -temp; | ||||
|       } | ||||
|       if(last < 0 ) { | ||||
|         return 0; | ||||
|       } | ||||
|       // avoid reading from negative indices if loader is checked
 | ||||
|       // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
 | ||||
|       return input[last]; | ||||
|     } | ||||
|   public: | ||||
| 
 | ||||
|     /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|     /// @param imageSizeX   width of the image
 | ||||
|     /// @param imageSizeY   height of the image
 | ||||
|     /// @param firstX       x-coordinate of first pixel to load
 | ||||
|     ///                     (Parity determines vertically low or high band.)
 | ||||
|     /// @param firstY       y-coordinate of first pixel to load
 | ||||
|     ///                     (Parity determines horizontally low or high band.)
 | ||||
|     __device__ void init(const int imageSizeX, const int imageSizeY, | ||||
|                          int firstX, const int firstY) { | ||||
|       this->mirror(firstX, imageSizeX); | ||||
|       last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
|        | ||||
|       // adjust to point to previous item
 | ||||
|       last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;  | ||||
|     // resolve index of first pixel according to horizontal parity
 | ||||
|     if (firstX & 1) { | ||||
|       // first pixel in one of right bands
 | ||||
|       verticalStride = imageSizeX / 2; | ||||
|       columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2); | ||||
|       strideLowToHigh = (imageSizeX * imageSizeY) / 2; | ||||
|     } else { | ||||
|       // first pixel in one of left bands
 | ||||
|       verticalStride = imageSizeX / 2 + (imageSizeX & 1); | ||||
|       strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX; | ||||
|     } | ||||
| 
 | ||||
|     /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|     /// uninitialized stuff.
 | ||||
|     __device__ void clear() { | ||||
|       this->end = 0; | ||||
|       this->strideHighToLow = 0; | ||||
|       this->strideLowToHigh = 0; | ||||
|       this->last = 0; | ||||
|     // set the other stride
 | ||||
|     strideHighToLow = verticalStride - strideLowToHigh; | ||||
| 
 | ||||
|     // compute index of coefficient which indicates end of image
 | ||||
|     if (CHECKED) { | ||||
|       end = columnOffset                          // right column
 | ||||
|             + (imageSizeY / 2) * verticalStride   // right row
 | ||||
|             + (imageSizeY & 1) * strideLowToHigh; // possibly in high band
 | ||||
|     } else { | ||||
|       end = 0; | ||||
|     } | ||||
| 
 | ||||
|     /// Gets another coefficient from lowpass band and advances internal index.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in high band.
 | ||||
|     /// @param input   input image to load next coefficient from
 | ||||
|     /// @return next coefficient from the lowpass band of the given image
 | ||||
|     __device__ T loadLowFrom(const T * const input) { | ||||
|       return updateAndLoad(input, this->strideHighToLow); | ||||
|     //***********for test**************
 | ||||
|     //	end = CHECKED;
 | ||||
|     //***********for test**************
 | ||||
| 
 | ||||
|     // finally, return index of the first item
 | ||||
|     return columnOffset                      // right column
 | ||||
|            + (firstY / 2) * verticalStride   // right row
 | ||||
|            + (firstY & 1) * strideLowToHigh; // possibly in high band
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Directly loads coefficients from four consecutively stored transformed
 | ||||
| /// bands.
 | ||||
| /// @tparam T        type of input band coefficients
 | ||||
| /// @tparam CHECKED  true = be prepared for the image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> { | ||||
| private: | ||||
|   int last; ///< index of last loaded pixel
 | ||||
| 
 | ||||
|   /// Checks internal index and possibly reverses direction of loader.
 | ||||
|   /// (Handles mirroring at the bottom of the image.)
 | ||||
|   /// @param input   input image to load next coefficient from
 | ||||
|   /// @param stride  stride to use now (one of two loader's strides)
 | ||||
|   /// @return loaded coefficient
 | ||||
|   __device__ T updateAndLoad(const T *const input, const int &stride) { | ||||
|     last += stride; | ||||
|     if (CHECKED && (last == this->end)) { | ||||
|       // undo last two updates of index (to get to previous mirrored item)
 | ||||
|       last -= (this->strideLowToHigh + this->strideHighToLow); | ||||
| 
 | ||||
|       // swap and reverse strides (to move up in the loaded column now)
 | ||||
|       const int temp = this->strideLowToHigh; | ||||
|       this->strideLowToHigh = -this->strideHighToLow; | ||||
|       this->strideHighToLow = -temp; | ||||
|     } | ||||
| 
 | ||||
|     /// Gets another coefficient from the highpass band and advances index.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in high band.
 | ||||
|     /// @param input   input image to load next coefficient from
 | ||||
|     /// @return next coefficient from the highpass band of the given image
 | ||||
|     __device__ T loadHighFrom(const T * const input) { | ||||
|       return updateAndLoad(input, this->strideLowToHigh); | ||||
|     if (last < 0) { | ||||
|       return 0; | ||||
|     } | ||||
|     // avoid reading from negative indices if loader is checked
 | ||||
|     // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
 | ||||
|     // checked variant later
 | ||||
|     return input[last]; | ||||
|   } | ||||
| 
 | ||||
|   }; | ||||
| public: | ||||
|   /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|   /// @param imageSizeX   width of the image
 | ||||
|   /// @param imageSizeY   height of the image
 | ||||
|   /// @param firstX       x-coordinate of first pixel to load
 | ||||
|   ///                     (Parity determines vertically low or high band.)
 | ||||
|   /// @param firstY       y-coordinate of first pixel to load
 | ||||
|   ///                     (Parity determines horizontally low or high band.)
 | ||||
|   __device__ void init(const int imageSizeX, const int imageSizeY, int firstX, | ||||
|                        const int firstY) { | ||||
|     this->mirror(firstX, imageSizeX); | ||||
|     last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
| 
 | ||||
|     // adjust to point to previous item
 | ||||
|     last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow; | ||||
|   } | ||||
| 
 | ||||
|   /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|   /// uninitialized stuff.
 | ||||
|   __device__ void clear() { | ||||
|     this->end = 0; | ||||
|     this->strideHighToLow = 0; | ||||
|     this->strideLowToHigh = 0; | ||||
|     this->last = 0; | ||||
|   } | ||||
| 
 | ||||
|   /// Gets another coefficient from lowpass band and advances internal index.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in high band.
 | ||||
|   /// @param input   input image to load next coefficient from
 | ||||
|   /// @return next coefficient from the lowpass band of the given image
 | ||||
|   __device__ T loadLowFrom(const T *const input) { | ||||
|     return updateAndLoad(input, this->strideHighToLow); | ||||
|   } | ||||
| 
 | ||||
|   /// Directly saves coefficients into four transformed bands.
 | ||||
|   /// @tparam T        type of output band coefficients
 | ||||
|   /// @tparam CHECKED  true = be prepared for the image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> { | ||||
|   private: | ||||
|     int next;  ///< index of last loaded pixel
 | ||||
|   /// Gets another coefficient from the highpass band and advances index.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in high band.
 | ||||
|   /// @param input   input image to load next coefficient from
 | ||||
|   /// @return next coefficient from the highpass band of the given image
 | ||||
|   __device__ T loadHighFrom(const T *const input) { | ||||
|     return updateAndLoad(input, this->strideLowToHigh); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
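Editor's note: the band loader declared above is easier to follow with a small usage sketch. The following is illustrative only and not part of this commit; the helper name loadColumnSketch, the destination buffer, the starting row, and the call order are assumptions, while the init/loadLowFrom/loadHighFrom interface is taken from the declarations above.

// Illustrative sketch only -- assumes the dwt_cuda namespace from io.h above.
// Column coordinate, destination buffer and starting row are placeholders; the
// loader derives low/high band membership from the parity of the coordinates
// passed to init(), as documented in the comments above.
template <typename T>
__device__ void loadColumnSketch(const T *input, int sizeX, int sizeY,
                                 int colX, T *dest, int numPairs) {
  dwt_cuda::VerticalDWTBandLoader<T, true> loader;  // CHECKED = true
  loader.init(sizeX, sizeY, colX, 0);               // start at row 0
  for (int i = 0; i < numPairs; ++i) {
    // alternate between the two bands while walking down the column; which
    // call comes first depends on the vertical parity of the first pixel
    dest[2 * i]     = loader.loadLowFrom(input);
    dest[2 * i + 1] = loader.loadHighFrom(input);
  }
}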
|     /// Checks internal index and possibly stops the writer.
 | ||||
|     /// (Handles mirroring at edges of the image.)
 | ||||
|     /// @param output  output buffer
 | ||||
|     /// @param item    item to put into the output
 | ||||
|     /// @param stride  increment of the pointer to get to next output index
 | ||||
|     __device__ int saveAndUpdate(T * const output, const T & item, | ||||
|                                   const int & stride) { | ||||
| //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){		//test, Mar 20					  
 | ||||
|       if((!CHECKED) || (next != this->end)) { | ||||
|         // if(next == 4) {
 | ||||
|         //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
 | ||||
|         // }
 | ||||
|         output[next] = item; | ||||
|         next += stride; | ||||
|       }  | ||||
| //	}
 | ||||
|       // if((!CHECKED) || (next != this->end)) { //the real one
 | ||||
|         // output[next] = item;
 | ||||
|         // next += stride;  //stride has been test
 | ||||
| /// Directly saves coefficients into four transformed bands.
 | ||||
| /// @tparam T        type of output band coefficients
 | ||||
| /// @tparam CHECKED  true = be prepared for the image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> { | ||||
| private: | ||||
|   int next; ///< index of last loaded pixel
 | ||||
| 
 | ||||
|   /// Checks internal index and possibly stops the writer.
 | ||||
|   /// (Handles mirroring at edges of the image.)
 | ||||
|   /// @param output  output buffer
 | ||||
|   /// @param item    item to put into the output
 | ||||
|   /// @param stride  increment of the pointer to get to next output index
 | ||||
|   __device__ int saveAndUpdate(T *const output, const T &item, | ||||
|                                const int &stride) { | ||||
|     //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
 | ||||
|     ////test, Mar 20
 | ||||
|     if ((!CHECKED) || (next != this->end)) { | ||||
|       // if(next == 4) {
 | ||||
|       //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
 | ||||
|       // }
 | ||||
| 	return next; | ||||
|       output[next] = item; | ||||
|       next += stride; | ||||
|     } | ||||
|   public: | ||||
|     //	}
 | ||||
|     // if((!CHECKED) || (next != this->end)) { //the real one
 | ||||
|     // output[next] = item;
 | ||||
|     // next += stride;  //stride has been test
 | ||||
|     // }
 | ||||
|     return next; | ||||
|   } | ||||
| 
 | ||||
|     /// Initializes writer - sets output size and a position of first pixel.
 | ||||
|     /// @param output       output image
 | ||||
|     /// @param imageSizeX   width of the image
 | ||||
|     /// @param imageSizeY   height of the image
 | ||||
|     /// @param firstX       x-coordinate of first pixel to write
 | ||||
|     ///                     (Parity determines vertically low or high band.)
 | ||||
|     /// @param firstY       y-coordinate of first pixel to write
 | ||||
|     ///                     (Parity determines horizontally low or high band.)
 | ||||
|     __device__ void init(const int imageSizeX, const int imageSizeY, | ||||
|                          const int firstX, const int firstY) { | ||||
|       if (firstX < imageSizeX) { | ||||
|         next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
|       } else { | ||||
|         clear(); | ||||
|       } | ||||
| public: | ||||
|   /// Initializes writer - sets output size and a position of first pixel.
 | ||||
|   /// @param output       output image
 | ||||
|   /// @param imageSizeX   width of the image
 | ||||
|   /// @param imageSizeY   height of the image
 | ||||
|   /// @param firstX       x-coordinate of first pixel to write
 | ||||
|   ///                     (Parity determines vertically low or high band.)
 | ||||
|   /// @param firstY       y-coordinate of first pixel to write
 | ||||
|   ///                     (Parity determines horizontally low or high band.)
 | ||||
|   __device__ void init(const int imageSizeX, const int imageSizeY, | ||||
|                        const int firstX, const int firstY) { | ||||
|     if (firstX < imageSizeX) { | ||||
|       next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
|     } else { | ||||
|       clear(); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|     /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|     /// uninitialized stuff.
 | ||||
|     __device__ void clear() { | ||||
|       this->end = 0; | ||||
|       this->strideHighToLow = 0; | ||||
|       this->strideLowToHigh = 0; | ||||
|       this->next = 0; | ||||
|     } | ||||
|   /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|   /// uninitialized stuff.
 | ||||
|   __device__ void clear() { | ||||
|     this->end = 0; | ||||
|     this->strideHighToLow = 0; | ||||
|     this->strideLowToHigh = 0; | ||||
|     this->next = 0; | ||||
|   } | ||||
| 
 | ||||
|     /// Writes another coefficient into the band which was specified using
 | ||||
|     /// init's firstX and firstY parameters and advances internal pointer.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in lowpass band.
 | ||||
|     /// @param output  output image
 | ||||
|     /// @param low     lowpass coefficient to save into the lowpass band
 | ||||
|     __device__ int writeLowInto(T * const output, const T & primary) { | ||||
|       return saveAndUpdate(output, primary, this->strideLowToHigh); | ||||
|     } | ||||
|   /// Writes another coefficient into the band which was specified using
 | ||||
|   /// init's firstX and firstY parameters and advances internal pointer.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in lowpass band.
 | ||||
|   /// @param output  output image
 | ||||
|   /// @param low     lowpass coefficient to save into the lowpass band
 | ||||
|   __device__ int writeLowInto(T *const output, const T &primary) { | ||||
|     return saveAndUpdate(output, primary, this->strideLowToHigh); | ||||
|   } | ||||
| 
 | ||||
|     /// Writes another coefficient from the other band and advances pointer.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in highpass band.
 | ||||
|     /// @param output  output image
 | ||||
|     /// @param high    highpass coefficient to save into the highpass band
 | ||||
|     __device__ int writeHighInto(T * const output, const T & other) { | ||||
|       return saveAndUpdate(output, other, this->strideHighToLow); | ||||
|     } | ||||
|   /// Writes another coefficient from the other band and advances pointer.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in highpass band.
 | ||||
|   /// @param output  output image
 | ||||
|   /// @param high    highpass coefficient to save into the highpass band
 | ||||
|   __device__ int writeHighInto(T *const output, const T &other) { | ||||
|     return saveAndUpdate(output, other, this->strideHighToLow); | ||||
|   } | ||||
| 
 | ||||
| 	//*******Add three functions to get private values*******
 | ||||
| 	__device__ int getnext(){ | ||||
| 		return next; | ||||
| 	} | ||||
|   //*******Add three functions to get private values*******
 | ||||
|   __device__ int getnext() { return next; } | ||||
| 
 | ||||
| 	__device__ int getend(){ | ||||
| 		return this->end; | ||||
| 	} | ||||
|   __device__ int getend() { return this->end; } | ||||
| 
 | ||||
| 	__device__ int getstrideHighToLow(){ | ||||
| 		return this->strideHighToLow; | ||||
| 	} | ||||
| 	 | ||||
| 	__device__ int getstrideLowToHigh(){ | ||||
| 		return this->strideLowToHigh; | ||||
| 	} | ||||
| 	 | ||||
| 	//*******Add three functions to get private values*******
 | ||||
|   }; | ||||
|   __device__ int getstrideHighToLow() { return this->strideHighToLow; } | ||||
| 
 | ||||
|   __device__ int getstrideLowToHigh() { return this->strideLowToHigh; } | ||||
| 
 | ||||
|   //*******Add three functions to get private values*******
 | ||||
| }; | ||||
| 
 | ||||
| } // namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// IO_H
 | ||||
| 
 | ||||
| #endif // IO_H
 | ||||
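Editor's note: for symmetry with the loader sketch earlier, here is a hedged sketch of the writer's interface declared in io.h above. It is illustrative only; writeColumnSketch, src, and numPairs are invented names, while init/writeLowInto/writeHighInto and the clear() fallback come from the declarations in this file.

// Illustrative sketch only -- mirrors the loader example and relies on the
// VerticalDWTBandWriter declared above. writeLowInto/writeHighInto return the
// index that was written, which this sketch ignores.
template <typename T>
__device__ void writeColumnSketch(T *output, int sizeX, int sizeY,
                                  int colX, const T *src, int numPairs) {
  dwt_cuda::VerticalDWTBandWriter<T, true> writer;  // CHECKED = true
  writer.init(sizeX, sizeY, colX, 0);  // falls back to clear() if colX >= sizeX
  for (int i = 0; i < numPairs; ++i) {
    writer.writeLowInto(output, src[2 * i]);       // lowpass coefficient
    writer.writeHighInto(output, src[2 * i + 1]);  // highpass coefficient
  }
}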
|  |  | |||
|  | @ -30,344 +30,309 @@ | |||
| /// POSSIBILITY OF SUCH DAMAGE.
 | ||||
| ///
 | ||||
| 
 | ||||
| 
 | ||||
| #ifndef TRANSFORM_BUFFER_H | ||||
| #define	TRANSFORM_BUFFER_H | ||||
| 
 | ||||
| #define TRANSFORM_BUFFER_H | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
| /// Buffer (in shared memory of GPU) where block of input image is stored,
 | ||||
| /// but odd and even lines are separated. (Generates fewer bank conflicts when
 | ||||
| /// using the lifting scheme.) All operations expect SIZE_X threads.
 | ||||
| /// Also implements basic building blocks of the lifting scheme.
 | ||||
| /// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
 | ||||
| ///                     a number of threads participating on all operations.)
 | ||||
| ///                     Must be divisible by 4.
 | ||||
| /// @tparam SIZE_Y      height of buffer (total number of lines)
 | ||||
| /// @tparam BOUNDARY_X  number of extra pixels at the left and right side
 | ||||
| ///                     boundary is expected to be smaller than half SIZE_X
 | ||||
| ///                     Must be divisible by 2.
 | ||||
| template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X> | ||||
| class TransformBuffer { | ||||
| public: | ||||
|   enum { | ||||
|     /// difference between pointers to two vertical neighbors
 | ||||
|     VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2) | ||||
|   }; | ||||
| 
 | ||||
|   /// Buffer (in shared memory of GPU) where block of input image is stored,
 | ||||
|   /// but odd and even lines are separated. (Generates fewer bank conflicts when 
 | ||||
|   /// using the lifting scheme.) All operations expect SIZE_X threads.
 | ||||
|   /// Also implements basic building blocks of the lifting scheme.
 | ||||
|   /// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
 | ||||
|   ///                     a number of threads participating on all operations.)
 | ||||
|   ///                     Must be divisible by 4.
 | ||||
|   /// @tparam SIZE_Y      height of buffer (total number of lines)
 | ||||
|   /// @tparam BOUNDARY_X  number of extra pixels at the left and right side
 | ||||
|   ///                     boundary is expected to be smaller than half SIZE_X
 | ||||
|   ///                     Must be divisible by 2.
 | ||||
|   template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X> | ||||
|   class TransformBuffer { | ||||
|   public: | ||||
|     enum { | ||||
|       /// difference between pointers to two vertical neighbors
 | ||||
|       VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2) | ||||
|     }; | ||||
| private: | ||||
|   enum { | ||||
| /// number of shared memory banks - needed for correct padding
 | ||||
| #ifdef __CUDA_ARCH__ | ||||
|     SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16), | ||||
| #else | ||||
|     SHM_BANKS = 16, // for host code only - can be anything, won't be used
 | ||||
| #endif | ||||
| 
 | ||||
|   private: | ||||
|     enum { | ||||
|       /// number of shared memory banks - needed for correct padding
 | ||||
|       #ifdef __CUDA_ARCH__ | ||||
|       SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16), | ||||
|       #else | ||||
|       SHM_BANKS = 16,  // for host code only - can be anything, won't be used
 | ||||
|       #endif | ||||
|     /// size of one of two buffers (odd or even)
 | ||||
|     BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y, | ||||
| 
 | ||||
|       /// size of one of two buffers (odd or even)
 | ||||
|       BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y, | ||||
|     /// unused space between two buffers
 | ||||
|     PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS), | ||||
| 
 | ||||
|       /// unused space between two buffers
 | ||||
|       PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS), | ||||
|     /// offset of the odd columns buffer from the beginning of data buffer
 | ||||
|     ODD_OFFSET = BUFFER_SIZE + PADDING, | ||||
|   }; | ||||
| 
 | ||||
|       /// offset of the odd columns buffer from the beginning of data buffer
 | ||||
|       ODD_OFFSET = BUFFER_SIZE + PADDING, | ||||
|     }; | ||||
|   /// buffer for both even and odd columns
 | ||||
|   T data[2 * BUFFER_SIZE + PADDING]; | ||||
| 
 | ||||
|     /// buffer for both even and odd columns
 | ||||
|     T data[2 * BUFFER_SIZE + PADDING]; | ||||
|   /// Applies specified function to all central elements while also passing
 | ||||
|   /// previous and next elements as parameters.
 | ||||
|   /// @param count         count of central elements to apply function to
 | ||||
|   /// @param prevOffset    offset of first central element
 | ||||
|   /// @param midOffset     offset of first central element's predecessor
 | ||||
|   /// @param nextOffset    offset of first central element's successor
 | ||||
|   /// @param function      the function itself
 | ||||
|   template <typename FUNC> | ||||
|   __device__ void horizontalStep(const int count, const int prevOffset, | ||||
|                                  const int midOffset, const int nextOffset, | ||||
|                                  const FUNC &function) { | ||||
|     // number of unchecked iterations
 | ||||
|     const int STEPS = count / SIZE_X; | ||||
| 
 | ||||
|     // items remaining after last unchecked iteration
 | ||||
|     const int finalCount = count % SIZE_X; | ||||
| 
 | ||||
|     // offset of items processed in last (checked) iteration
 | ||||
|     const int finalOffset = count - finalCount; | ||||
| 
 | ||||
|     /// Applies specified function to all central elements while also passing
 | ||||
|     /// previous and next elements as parameters.
 | ||||
|     /// @param count         count of central elements to apply function to
 | ||||
|     /// @param prevOffset    offset of first central element
 | ||||
|     /// @param midOffset     offset of first central element's predecessor
 | ||||
|     /// @param nextOffset    offset of first central element's successor
 | ||||
|     /// @param function      the function itself
 | ||||
|     template <typename FUNC> | ||||
|     __device__ void horizontalStep(const int count, const int prevOffset,  | ||||
|                                    const int midOffset, const int nextOffset, | ||||
|                                    const FUNC & function) { | ||||
|       // number of unchecked iterations
 | ||||
|       const int STEPS = count / SIZE_X; | ||||
|        | ||||
|       // items remaining after last unchecked iteration
 | ||||
|       const int finalCount = count % SIZE_X;  | ||||
|        | ||||
|       // offset of items processed in last (checked) iteration
 | ||||
|       const int finalOffset = count - finalCount;   | ||||
|        | ||||
|       // all threads perform fixed number of iterations ...
 | ||||
|       for(int i = 0; i < STEPS; i++) { | ||||
|     // all threads perform fixed number of iterations ...
 | ||||
|     for (int i = 0; i < STEPS; i++) { | ||||
|       // for(int i = 0; i < 3; i++) {
 | ||||
|         const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; | ||||
|         const T next     = data[nextOffset + i * SIZE_X + threadIdx.x]; | ||||
|         T & center       = data[midOffset  + i * SIZE_X + threadIdx.x]; | ||||
|         // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
 | ||||
|         function(previous, center, next);// the real one
 | ||||
|       } | ||||
|        | ||||
|       // ... but not all threads participate on final iteration
 | ||||
|       if(threadIdx.x < finalCount) { | ||||
|         const T previous = data[prevOffset + finalOffset + threadIdx.x]; | ||||
|         const T next     = data[nextOffset + finalOffset + threadIdx.x]; | ||||
|         T & center = data[midOffset + finalOffset + threadIdx.x]; | ||||
|         // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
 | ||||
|         // kaixi
 | ||||
|         function(previous, center, next);//the real one
 | ||||
|       } | ||||
|       const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; | ||||
|       const T next = data[nextOffset + i * SIZE_X + threadIdx.x]; | ||||
|       T &center = data[midOffset + i * SIZE_X + threadIdx.x]; | ||||
|       // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
 | ||||
|       function(previous, center, next); // the real one
 | ||||
|     } | ||||
| 
 | ||||
|   public: | ||||
| 
 | ||||
|     __device__ void getPrintData() { | ||||
|       //
 | ||||
|       for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {      | ||||
|           printf(" index: %d  data: %f \n ", i ,data[i]);   | ||||
|       } | ||||
| 
 | ||||
|    } | ||||
| 
 | ||||
|      | ||||
|     /// Gets offset of the column with given index. Central columns have 
 | ||||
|     /// indices from 0 to NUM_LINES - 1, left boundary columns have negative 
 | ||||
|     /// indices and right boundary columns indices start with NUM_LINES.
 | ||||
|     /// @param columnIndex  index of column to get pointer to
 | ||||
|     /// @return  offset of the first item of column with specified index
 | ||||
|     __device__ int getColumnOffset(int columnIndex) { | ||||
|       columnIndex += BOUNDARY_X;             // skip boundary
 | ||||
|       return columnIndex / 2                 // select right column
 | ||||
|           + (columnIndex & 1) * ODD_OFFSET;  // select odd or even buffer
 | ||||
|     // ... but not all threads participate on final iteration
 | ||||
|     if (threadIdx.x < finalCount) { | ||||
|       const T previous = data[prevOffset + finalOffset + threadIdx.x]; | ||||
|       const T next = data[nextOffset + finalOffset + threadIdx.x]; | ||||
|       T &center = data[midOffset + finalOffset + threadIdx.x]; | ||||
|       // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
 | ||||
|       // kaixi
 | ||||
|       function(previous, center, next); // the real one
 | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|      | ||||
|     /// Provides access to data of the transform buffer.
 | ||||
|     /// @param index  index of the item to work with
 | ||||
|     /// @return reference to item at given index
 | ||||
|     __device__ T & operator[] (const int index) { | ||||
|       return data[index]; | ||||
| public: | ||||
|   __device__ void getPrintData() { | ||||
|     //
 | ||||
|     for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) { | ||||
|       printf(" index: %d  data: %f \n ", i, data[i]); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Gets offset of the column with given index. Central columns have
 | ||||
|   /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
 | ||||
|   /// indices and right boundary columns indices start with NUM_LINES.
 | ||||
|   /// @param columnIndex  index of column to get pointer to
 | ||||
|   /// @return  offset of the first item of column with specified index
 | ||||
|   __device__ int getColumnOffset(int columnIndex) { | ||||
|     columnIndex += BOUNDARY_X;               // skip boundary
 | ||||
|     return columnIndex / 2                   // select right column
 | ||||
|            + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
 | ||||
|   } | ||||
| 
 | ||||
|     /// Applies specified function to all horizontally even elements in 
 | ||||
|     /// specified lines. (Including even elements in boundaries except 
 | ||||
|     /// first even element in first left boundary.) SIZE_X threads participate 
 | ||||
|     /// and synchronization is needed before result can be used.
 | ||||
|     /// @param firstLine  index of first line
 | ||||
|     /// @param numLines   count of lines
 | ||||
|     /// @param func       function to be applied on all even elements
 | ||||
|     ///                   parameters: previous (odd) element, the even
 | ||||
|     ///                   element itself and finally next (odd) element
 | ||||
|     template <typename FUNC> | ||||
|     __device__ void forEachHorizontalEven(const int firstLine, | ||||
|                                           const int numLines, | ||||
|                                           const FUNC & func) { | ||||
|       // number of even elements to apply function to
 | ||||
|       const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|       // offset of first even element
 | ||||
|       const int centerOffset = firstLine * VERTICAL_STRIDE + 1; | ||||
|       // offset of odd predecessor of first even element
 | ||||
|       const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET; | ||||
|       // offset of odd successor of first even element
 | ||||
|       const int nextOffset = prevOffset + 1; | ||||
|   /// Provides access to data of the transform buffer.
 | ||||
|   /// @param index  index of the item to work with
 | ||||
|   /// @return reference to item at given index
 | ||||
|   __device__ T &operator[](const int index) { return data[index]; } | ||||
| 
 | ||||
|       // if(threadIdx.x == 0) {
 | ||||
|   /// Applies specified function to all horizontally even elements in
 | ||||
|   /// specified lines. (Including even elements in boundaries except
 | ||||
|   /// first even element in first left boundary.) SIZE_X threads participate
 | ||||
|   /// and synchronization is needed before result can be used.
 | ||||
|   /// @param firstLine  index of first line
 | ||||
|   /// @param numLines   count of lines
 | ||||
|   /// @param func       function to be applied on all even elements
 | ||||
|   ///                   parameters: previous (odd) element, the even
 | ||||
|   ///                   element itself and finally next (odd) element
 | ||||
|   template <typename FUNC> | ||||
|   __device__ void forEachHorizontalEven(const int firstLine, const int numLines, | ||||
|                                         const FUNC &func) { | ||||
|     // number of even elements to apply function to
 | ||||
|     const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|     // offset of first even element
 | ||||
|     const int centerOffset = firstLine * VERTICAL_STRIDE + 1; | ||||
|     // offset of odd predecessor of first even element
 | ||||
|     const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET; | ||||
|     // offset of odd successor of first even element
 | ||||
|     const int nextOffset = prevOffset + 1; | ||||
| 
 | ||||
|       //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|       // }
 | ||||
|     // if(threadIdx.x == 0) {
 | ||||
| 
 | ||||
|       // call generic horizontal step function
 | ||||
|       horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|     } | ||||
|     //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d
 | ||||
|     //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|     // }
 | ||||
| 
 | ||||
|     // call generic horizontal step function
 | ||||
|     horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|   } | ||||
| 
 | ||||
|     /// Applies given function to all horizontally odd elements in specified
 | ||||
|     /// lines. (Including odd elements in boundaries except last odd element
 | ||||
|     /// in last right boundary.) SIZE_X threads participate and synchronization
 | ||||
|     /// is needed before result can be used.
 | ||||
|     /// @param firstLine  index of first line
 | ||||
|     /// @param numLines   count of lines
 | ||||
|     /// @param func       function to be applied on all odd elements
 | ||||
|     ///                   parameters: previous (even) element, the odd
 | ||||
|     ///                   element itself and finally next (even) element
 | ||||
|     template <typename FUNC> | ||||
|     __device__ void forEachHorizontalOdd(const int firstLine, | ||||
|                                          const int numLines, | ||||
|                                          const FUNC & func) { | ||||
|       // number of odd elements to apply function to
 | ||||
|       const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|       // offset of even predecessor of first odd element
 | ||||
|       const int prevOffset = firstLine * VERTICAL_STRIDE; | ||||
|       // offset of first odd element
 | ||||
|       const int centerOffset = prevOffset + ODD_OFFSET; | ||||
|       // offset of even successor of first odd element
 | ||||
|       const int nextOffset = prevOffset + 1; | ||||
|   /// Applies given function to all horizontally odd elements in specified
 | ||||
|   /// lines. (Including odd elements in boundaries except last odd element
 | ||||
|   /// in last right boundary.) SIZE_X threads participate and synchronization
 | ||||
|   /// is needed before result can be used.
 | ||||
|   /// @param firstLine  index of first line
 | ||||
|   /// @param numLines   count of lines
 | ||||
|   /// @param func       function to be applied on all odd elements
 | ||||
|   ///                   parameters: previous (even) element, the odd
 | ||||
|   ///                   element itself and finally next (even) element
 | ||||
|   template <typename FUNC> | ||||
|   __device__ void forEachHorizontalOdd(const int firstLine, const int numLines, | ||||
|                                        const FUNC &func) { | ||||
|     // number of odd elements to apply function to
 | ||||
|     const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|     // offset of even predecessor of first odd element
 | ||||
|     const int prevOffset = firstLine * VERTICAL_STRIDE; | ||||
|     // offset of first odd element
 | ||||
|     const int centerOffset = prevOffset + ODD_OFFSET; | ||||
|     // offset of even successor of first odd element
 | ||||
|     const int nextOffset = prevOffset + 1; | ||||
| 
 | ||||
|       //  if(threadIdx.x == 0) {
 | ||||
|       //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|       // }
 | ||||
|     //  if(threadIdx.x == 0) {
 | ||||
|     //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d
 | ||||
|     //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|     // }
 | ||||
| 
 | ||||
|     // call generic horizontal step function
 | ||||
|     horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|   } | ||||
| 
 | ||||
|       // call generic horizontal step function
 | ||||
|       horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|     } | ||||
|      | ||||
|      | ||||
|     /// Applies specified function to all even elements (except element #0)
 | ||||
|     /// of given column. Each thread takes care of one column, so there's 
 | ||||
|     /// no need for synchronization.
 | ||||
|     /// @param columnOffset  offset of thread's column
 | ||||
|     /// @param f             function to be applied on all even elements
 | ||||
|     ///                      parameters: previous (odd) element, the even
 | ||||
|     ///                      element itself and finally next (odd) element
 | ||||
|     template <typename F> | ||||
|     __device__ void forEachVerticalEven(const int columnOffset, const F & f) { | ||||
|       if(SIZE_Y > 3) { // makes no sense otherwise
 | ||||
|         const int steps = SIZE_Y / 2 - 1; | ||||
|         for(int i = 0; i < steps; i++) { | ||||
|           const int row = 2 + i * 2; | ||||
|           const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; | ||||
|           const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; | ||||
|           f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next); | ||||
| 		   | ||||
| 		  //--------------- FOR TEST -----------------
 | ||||
| /*		__syncthreads();
 | ||||
| 		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
| 			diffOut[2500]++; | ||||
| 			diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
 | ||||
| 		}	 | ||||
| 		__syncthreads(); | ||||
| */		  //--------------- FOR TEST -----------------
 | ||||
| 		   | ||||
| 		   | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|      | ||||
|      | ||||
|     /// Applies specified function to all odd elements of given column.
 | ||||
|     /// Each thread takes care of one column, so there's no need for
 | ||||
|     /// synchronization.
 | ||||
|     /// @param columnOffset  offset of thread's column
 | ||||
|     /// @param f             function to be applied on all odd elements
 | ||||
|     ///                      parameters: previous (even) element, the odd
 | ||||
|     ///                      element itself and finally next (even) element
 | ||||
|     template <typename F> | ||||
|     __device__ void forEachVerticalOdd(const int columnOffset, const F & f) { | ||||
|       const int steps = (SIZE_Y - 1) / 2; | ||||
|       for(int i = 0; i < steps; i++) { | ||||
|         const int row = i * 2 + 1; | ||||
|   /// Applies specified function to all even elements (except element #0)
 | ||||
|   /// of given column. Each thread takes care of one column, so there's
 | ||||
|   /// no need for synchronization.
 | ||||
|   /// @param columnOffset  offset of thread's column
 | ||||
|   /// @param f             function to be applied on all even elements
 | ||||
|   ///                      parameters: previous (odd) element, the even
 | ||||
|   ///                      element itself and finally next (odd) element
 | ||||
|   template <typename F> | ||||
|   __device__ void forEachVerticalEven(const int columnOffset, const F &f) { | ||||
|     if (SIZE_Y > 3) { // makes no sense otherwise
 | ||||
|       const int steps = SIZE_Y / 2 - 1; | ||||
|       for (int i = 0; i < steps; i++) { | ||||
|         const int row = 2 + i * 2; | ||||
|         const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; | ||||
|         const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; | ||||
|         f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); | ||||
| 
 | ||||
| 		f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); | ||||
| 		 | ||||
| 		 | ||||
| 		  //--------------- FOR TEST -----------------
 | ||||
| /*		__syncthreads();
 | ||||
| 		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
| 			diffOut[2500]++; | ||||
| 			diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
 | ||||
| 		} | ||||
| 
 | ||||
| 		__syncthreads(); | ||||
| */		  //--------------- FOR TEST -----------------
 | ||||
|         //--------------- FOR TEST -----------------
 | ||||
|         /*		__syncthreads();
 | ||||
|                         if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
|                                 diffOut[2500]++; | ||||
|                                 diffOut[diffOut[2500]] = 2;//data[columnOffset +
 | ||||
|            row * VERTICAL_STRIDE]; | ||||
|                         } | ||||
|                         __syncthreads(); | ||||
|         */		  //--------------- FOR TEST -----------------
 | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Applies specified function to all odd elements of given column.
 | ||||
|   /// Each thread takes care of one column, so there's no need for
 | ||||
|   /// synchronization.
 | ||||
|   /// @param columnOffset  offset of thread's column
 | ||||
|   /// @param f             function to be applied on all odd elements
 | ||||
|   ///                      parameters: previous (even) element, the odd
 | ||||
|   ///                      element itself and finally next (even) element
 | ||||
|   template <typename F> | ||||
|   __device__ void forEachVerticalOdd(const int columnOffset, const F &f) { | ||||
|     const int steps = (SIZE_Y - 1) / 2; | ||||
|     for (int i = 0; i < steps; i++) { | ||||
|       const int row = i * 2 + 1; | ||||
|       const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; | ||||
|       const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; | ||||
| 
 | ||||
|       f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); | ||||
| 
 | ||||
|     /// Scales elements at specified lines.
 | ||||
|     /// @param evenScale  scaling factor for horizontally even elements
 | ||||
|     /// @param oddScale   scaling factor for horizontally odd elements
 | ||||
|     /// @param numLines   number of lines, whose elements should be scaled
 | ||||
|     /// @param firstLine  index of first line to scale elements in
 | ||||
|     __device__ void scaleHorizontal(const T evenScale, const T oddScale, | ||||
|                                     const int firstLine, const int numLines) { | ||||
|       const int offset = firstLine * VERTICAL_STRIDE; | ||||
|       const int count = numLines * VERTICAL_STRIDE; | ||||
|       const int steps = count / SIZE_X; | ||||
|       const int finalCount = count % SIZE_X; | ||||
|       const int finalOffset = count - finalCount; | ||||
|       //--------------- FOR TEST -----------------
 | ||||
|       /*		__syncthreads();
 | ||||
|                       if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
|                               diffOut[2500]++; | ||||
|                               diffOut[diffOut[2500]] = 1; //data[columnOffset +
 | ||||
|          row * VERTICAL_STRIDE]; | ||||
|                       } | ||||
| 
 | ||||
|       // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
 | ||||
|                       __syncthreads(); | ||||
|       */		  //--------------- FOR TEST -----------------
 | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|       // run iterations, where all threads participate
 | ||||
|       for(int i = 0; i < steps; i++) { | ||||
|         data[threadIdx.x + i * SIZE_X + offset] *= evenScale; | ||||
|         // if(threadIdx.x + i * SIZE_X + offset == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; | ||||
|       } | ||||
|   /// Scales elements at specified lines.
 | ||||
|   /// @param evenScale  scaling factor for horizontally even elements
 | ||||
|   /// @param oddScale   scaling factor for horizontally odd elements
 | ||||
|   /// @param numLines   number of lines, whose elements should be scaled
 | ||||
|   /// @param firstLine  index of first line to scale elements in
 | ||||
|   __device__ void scaleHorizontal(const T evenScale, const T oddScale, | ||||
|                                   const int firstLine, const int numLines) { | ||||
|     const int offset = firstLine * VERTICAL_STRIDE; | ||||
|     const int count = numLines * VERTICAL_STRIDE; | ||||
|     const int steps = count / SIZE_X; | ||||
|     const int finalCount = count % SIZE_X; | ||||
|     const int finalOffset = count - finalCount; | ||||
| 
 | ||||
|       // some threads also finish remaining unscaled items
 | ||||
|       if(threadIdx.x < finalCount) { | ||||
|         data[threadIdx.x + finalOffset + offset] *= evenScale; | ||||
|         // if(threadIdx.x + finalOffset + offset == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; | ||||
|       } | ||||
|     // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d,
 | ||||
|     // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
 | ||||
|     // finalCount, finalOffset);
 | ||||
| 
 | ||||
|     // run iterations, where all threads participate
 | ||||
|     for (int i = 0; i < steps; i++) { | ||||
|       data[threadIdx.x + i * SIZE_X + offset] *= evenScale; | ||||
|       // if(threadIdx.x + i * SIZE_X + offset == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; | ||||
|     } | ||||
| 
 | ||||
|     // some threads also finish remaining unscaled items
 | ||||
|     if (threadIdx.x < finalCount) { | ||||
|       data[threadIdx.x + finalOffset + offset] *= evenScale; | ||||
|       // if(threadIdx.x + finalOffset + offset == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|     /// Scales elements in specified column.
 | ||||
|     /// @param evenScale     scaling factor for vertically even elements
 | ||||
|     /// @param oddScale      scaling factor for vertically odd elements
 | ||||
|     /// @param columnOffset  offset of the column to work with
 | ||||
|     /// @param numLines      number of lines, whose elements should be scaled
 | ||||
|     /// @param firstLine     index of first line to scale elements in
 | ||||
|     __device__ void scaleVertical(const T evenScale, const T oddScale, | ||||
|                                   const int columnOffset, const int numLines, | ||||
|                                   const int firstLine) { | ||||
|       for(int i = firstLine; i < (numLines + firstLine); i++) { | ||||
|         if(i & 1) { | ||||
|           data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; | ||||
|         } else { | ||||
|           data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; | ||||
|         } | ||||
|   /// Scales elements in specified column.
 | ||||
|   /// @param evenScale     scaling factor for vertically even elements
 | ||||
|   /// @param oddScale      scaling factor for vertically odd elements
 | ||||
|   /// @param columnOffset  offset of the column to work with
 | ||||
|   /// @param numLines      number of lines, whose elements should be scaled
 | ||||
|   /// @param firstLine     index of first line to scale elements in
 | ||||
|   __device__ void scaleVertical(const T evenScale, const T oddScale, | ||||
|                                 const int columnOffset, const int numLines, | ||||
|                                 const int firstLine) { | ||||
|     for (int i = firstLine; i < (numLines + firstLine); i++) { | ||||
|       if (i & 1) { | ||||
|         data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; | ||||
|       } else { | ||||
|         data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   //****************For Test(Feb23), test inter parameters*************
 | ||||
|   __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; } | ||||
|   __device__ int getSHM_BANKS() { return SHM_BANKS; } | ||||
|   __device__ int getBuffersize() { return BUFFER_SIZE; } | ||||
|   __device__ int getPADDING() { return PADDING; } | ||||
|   __device__ int getODD_OFFSET() { return ODD_OFFSET; } | ||||
| 
 | ||||
| 	//****************For Test(Feb23), test inter parameters*************
 | ||||
| 	__device__ int getVERTICAL_STRIDE(){ | ||||
| 		return VERTICAL_STRIDE; | ||||
| 	} | ||||
| 	__device__ int getSHM_BANKS(){ | ||||
| 		return SHM_BANKS; | ||||
| 	} | ||||
| 	__device__ int  getBuffersize(){		 | ||||
| 		return BUFFER_SIZE; | ||||
| 	} | ||||
| 	__device__ int getPADDING(){ | ||||
| 		return PADDING; | ||||
| 	} | ||||
| 	__device__ int getODD_OFFSET(){ | ||||
| 		return ODD_OFFSET; | ||||
| 	} | ||||
| 
 | ||||
| 
 | ||||
|     //****************For Test(Feb23), test inter parameters*************
 | ||||
| 	 | ||||
| 	 | ||||
|   };  // end of class TransformBuffer
 | ||||
|   //****************For Test(Feb23), test inter parameters*************
 | ||||
| 
 | ||||
| }; // end of class TransformBuffer
 | ||||
| 
 | ||||
| } // namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// TRANSFORM_BUFFER_H
 | ||||
| 
 | ||||
| #endif // TRANSFORM_BUFFER_H
 | ||||
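Editor's note: to make the lifting building blocks of TransformBuffer concrete, here is a minimal sketch of one vertical CDF 5/3 lifting pass. It is an assumption for illustration only: the template parameters (32, 8, 2), the functor names, and the use of the classic 5/3 predict/update factors (-0.5 and +0.25) are not taken from this commit; the real lifting steps live in fdwt53.cu and rdwt53.cu.

// Sketch only -- assumes the dwt_cuda::TransformBuffer declared above.
namespace sketch {

// "Predict" step of the 5/3 lifting scheme: odd rows become highpass values.
struct Predict53 {
  __device__ void operator()(const float prev, float &center,
                             const float next) const {
    center -= 0.5f * (prev + next);
  }
};

// "Update" step: even rows become lowpass values.
struct Update53 {
  __device__ void operator()(const float prev, float &center,
                             const float next) const {
    center += 0.25f * (prev + next);
  }
};

// One vertical lifting pass over the calling thread's column. The template
// arguments (SIZE_X = 32, SIZE_Y = 8, BOUNDARY_X = 2) are placeholders.
__device__ void verticalLiftSketch(
    dwt_cuda::TransformBuffer<float, 32, 8, 2> &buf) {
  const int col = buf.getColumnOffset(threadIdx.x);  // this thread's column
  buf.forEachVerticalOdd(col, Predict53());          // predict highpass rows
  buf.forEachVerticalEven(col, Update53());          // update lowpass rows
}

} // namespace sketch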
|  |  | |||
|  | @ -5,4 +5,3 @@ | |||
| ./dwt2d 4.bmp  -d 4x4 -r -5 -l 3 | ||||
| # ./dwt2d 4.bmp  -d 4x4 -r -9 -l 3 | ||||
| # ./dwt2d 8.bmp  -d 8x8 -f -9 -l 3 | ||||
| 
 | ||||
|  |  | |||
|  | @ -7,12 +7,3 @@ | |||
| /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o | ||||
| /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o | ||||
| g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,38 +1,36 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(int n, float a, float *x, float *y) | ||||
| { | ||||
|   int i = blockIdx.x*blockDim.x + threadIdx.x; | ||||
|   if (i < n) y[i] = a*x[i] + y[i]; | ||||
| __global__ void saxpy(int n, float a, float *x, float *y) { | ||||
|   int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|   if (i < n) | ||||
|     y[i] = a * x[i] + y[i]; | ||||
| } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|   // saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
 | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -1,38 +1,35 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(void) | ||||
| { | ||||
|   int i = blockIdx.x*blockDim.x + threadIdx.x; | ||||
| __global__ void saxpy(void) { | ||||
|   int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|   printf("block_id:%d thread_id:%d \n", i) | ||||
| } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|     saxpy<<<(1,1)>>>; | ||||
|   saxpy<<<(1, 1)>>>; | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -1,37 +1,32 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(int N) | ||||
| { | ||||
| printf("hello!: %d\n", N); | ||||
| } | ||||
| __global__ void saxpy(int N) { printf("hello!: %d\n", N); } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|     saxpy<<<(1,1)>>>(N); | ||||
|   saxpy<<<(1, 1)>>>(N); | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -1,37 +1,32 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(void) | ||||
| { | ||||
| printf("hello!\n"); | ||||
| } | ||||
| __global__ void saxpy(void) { printf("hello!\n"); } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|     saxpy<<<(1,1)>>>; | ||||
|   saxpy<<<(1, 1)>>>; | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -43,7 +43,7 @@ cudaError_t cudaMallocHost(void **devPtr, size_t size) { | |||
|   *devPtr = malloc(size); | ||||
|   if (*devPtr == NULL) | ||||
|     return cudaErrorMemoryAllocation; | ||||
|  return cudaSuccess; | ||||
|   return cudaSuccess; | ||||
| } | ||||
| cudaError_t cudaMemset(void *devPtr, int value, size_t count) { | ||||
|   memset(devPtr, value, count); | ||||
|  |  | |||
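Editor's note: a short host-side sketch of how the shims above behave on the CPU backend. The buffer size and values are arbitrary, the function name check_shims is invented, and the CUDA/CuPBoP runtime declarations (cudaError_t, cudaSuccess, the shims themselves) are assumed to be in scope via the project's runtime header.

// Sketch only: exercises the cudaMallocHost/cudaMemset shims shown above.
// On this CPU backend cudaMallocHost is a plain malloc (see above), so free()
// is used for cleanup; a real CUDA runtime would require cudaFreeHost instead.
#include <stdio.h>
#include <stdlib.h>

int check_shims(void) {
  float *buf = NULL;
  if (cudaMallocHost((void **)&buf, 256 * sizeof(float)) != cudaSuccess) {
    fprintf(stderr, "cudaMallocHost failed\n");
    return 1;
  }
  cudaMemset(buf, 0, 256 * sizeof(float));  // plain memset on the host buffer
  printf("buf[0] = %f\n", buf[0]);          // prints 0.000000
  free(buf);
  return 0;
}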