commit 197abc867d
				|  | @ -27,10 +27,10 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS | |||
|    export CuPBoP_PATH=`pwd` | ||||
|    export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH | ||||
|    ``` | ||||
| If you are using boson, you can pre-installed llvm 10.0.0  | ||||
| 
 | ||||
|  LLVM_PATH=/opt/llvm-10.0.0 | ||||
|  export PATH=$LLVM_PATH/bin:$PATH | ||||
|    If you are using boson, you can use the pre-installed llvm 10.0.0\ | ||||
|    `LLVM_PATH=/opt/llvm-10.0.0`\ | ||||
|    `export PATH=$LLVM_PATH/bin:$PATH` | ||||
| 
 | ||||
| 2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file | ||||
| 
 | ||||
|  |  | |||
|  | @ -396,7 +396,7 @@ void init_block(llvm::Module *M, std::ofstream &fout) { | |||
|   replace_asm_call(M); | ||||
|   // replace dynamic shared memory
 | ||||
|   auto dynamic_shared_memory_addr = | ||||
|         M->getGlobalVariable("dynamic_shared_memory"); | ||||
|       M->getGlobalVariable("dynamic_shared_memory"); | ||||
|   if (dynamic_shared_memory_addr) { | ||||
|     replace_dynamic_shared_memory(M); | ||||
|   } | ||||
|  |  | |||
|  | @ -272,13 +272,12 @@ void AddContextSaveRestore(llvm::Instruction *instruction, | |||
|   std::vector<Instruction *> uses; | ||||
|   Function *f2 = instruction->getParent()->getParent(); | ||||
| 
 | ||||
| 
 | ||||
|   for (Instruction::use_iterator ui = instruction->use_begin(), | ||||
|                                  ue = instruction->use_end(); | ||||
|        ui != ue; ++ui) { | ||||
|     llvm::Instruction *user = cast<Instruction>(ui->getUser()); | ||||
|     Function *f1 = user->getParent()->getParent(); | ||||
|     if(f2->getName() != f1->getName()) { | ||||
|     if (f2->getName() != f1->getName()) { | ||||
|       continue; | ||||
|     } | ||||
|     if (user == NULL) | ||||
|  |  | |||
|  | @ -89,11 +89,12 @@ void mem_share2global(llvm::Module *M) { | |||
|           } else if (element_type->isStructTy()) { | ||||
|             auto undef = llvm::UndefValue::get(element_type); | ||||
|             llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( | ||||
|                 *M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef, | ||||
|                 new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, | ||||
|                 false); | ||||
|                 *M, element_type, false, llvm::GlobalValue::ExternalLinkage, | ||||
|                 undef, new_name, NULL, | ||||
|                 llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); | ||||
|             global_memory->setDSOLocal(true); | ||||
|             Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName())); | ||||
|             Comdat *comdat = | ||||
|                 M->getOrInsertComdat(StringRef(share_memory->getName())); | ||||
|             comdat->setSelectionKind(Comdat::SelectionKind::Any); | ||||
|             global_memory->setComdat(comdat); | ||||
|             global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage); | ||||
|  | @ -101,8 +102,7 @@ void mem_share2global(llvm::Module *M) { | |||
|             global_memory->setAlignment(share_memory->getAlignment()); | ||||
|             corresponding_global_memory.insert( | ||||
|                 std::pair<GlobalVariable *, GlobalVariable *>(share_memory, | ||||
|                 global_memory)); | ||||
| 
 | ||||
|                                                               global_memory)); | ||||
| 
 | ||||
|           } else { | ||||
|             assert(0 && "The required Share Memory Type is not supported\n"); | ||||
|  |  | |||
|  | @ -27,9 +27,9 @@ | |||
| #ifndef _COMMON_H | ||||
| #define _COMMON_H | ||||
| 
 | ||||
| //24-bit multiplication is faster on G80,
 | ||||
| //but we must be sure to multiply integers
 | ||||
| //only within [-8M, 8M - 1] range
 | ||||
| // 24-bit multiplication is faster on G80,
 | ||||
| // but we must be sure to multiply integers
 | ||||
| // only within [-8M, 8M - 1] range
 | ||||
| #define IMUL(a, b) __mul24(a, b) | ||||
| 
 | ||||
| ////cuda timing macros
 | ||||
|  | @ -42,21 +42,23 @@ | |||
| //                          cudaEventSynchronize(cstop); \ | ||||
| //                          cudaEventElapsedTime(&elapsedTime, cstart, cstop)
 | ||||
| 
 | ||||
| //divide and round up macro
 | ||||
| // divide and round up macro
 | ||||
| #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b))) | ||||
| 
 | ||||
| #  define cudaCheckError( msg ) {                                            \ | ||||
|     cudaError_t err = cudaGetLastError();                                    \ | ||||
|     if( cudaSuccess != err) {                                                \ | ||||
|         fprintf(stderr, "%s: %i: %s: %s.\n",                                 \ | ||||
|                 __FILE__, __LINE__, msg, cudaGetErrorString( err) );         \ | ||||
|         exit(-1);                                                            \ | ||||
|     } } | ||||
| 
 | ||||
| #  define cudaCheckAsyncError( msg ) {                                       \ | ||||
|     cudaThreadSynchronize();                                                 \ | ||||
|     cudaCheckError( msg );                                                   \ | ||||
|     } | ||||
| #define cudaCheckError(msg)                                                    \ | ||||
|   {                                                                            \ | ||||
|     cudaError_t err = cudaGetLastError();                                      \ | ||||
|     if (cudaSuccess != err) {                                                  \ | ||||
|       fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg,            \ | ||||
|               cudaGetErrorString(err));                                        \ | ||||
|       exit(-1);                                                                \ | ||||
|     }                                                                          \ | ||||
|   } | ||||
| 
 | ||||
| #define cudaCheckAsyncError(msg)                                               \ | ||||
|   {                                                                            \ | ||||
|     cudaThreadSynchronize();                                                   \ | ||||
|     cudaCheckError(msg);                                                       \ | ||||
|   } | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
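The reformatted `cudaCheckError` / `cudaCheckAsyncError` macros above wrap the usual `cudaGetLastError` pattern: the plain variant only inspects the last recorded error, while the async variant first calls `cudaThreadSynchronize` so failures from a still-running kernel are also surfaced. Below is a minimal usage sketch (not part of the commit); it assumes this `common.h` is on the include path, and the kernel name `scaleKernel` is purely illustrative.

```cuda
#include <cstdio>
#include <cstdlib>
#include "common.h" // provides cudaCheckError / cudaCheckAsyncError / DIVANDRND (hunk above)

// Illustrative kernel: scales a buffer in place.
__global__ void scaleKernel(float *data, int n, float s) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= s;
}

int main() {
  const int n = 1 << 20;
  float *d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaCheckError("cudaMalloc");                 // checks the last recorded runtime error

  scaleKernel<<<DIVANDRND(n, 256), 256>>>(d_data, n, 2.0f);
  cudaCheckError("scaleKernel launch");         // catches launch-time errors only
  cudaCheckAsyncError("scaleKernel execution"); // synchronizes first, then checks

  cudaFree(d_data);
  return 0;
}
```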
|  | @ -28,11 +28,12 @@ | |||
| #define _COMPONENTS_H | ||||
| 
 | ||||
| /* Separate compoents of source 8bit RGB image */ | ||||
| template<typename T> | ||||
| void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height); | ||||
| template <typename T> | ||||
| void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width, | ||||
|                      int height); | ||||
| 
 | ||||
| /* Copy a 8bit source image data into a color compoment of type T */ | ||||
| template<typename T> | ||||
| void bwToComponent(T *d_c, unsigned char * src, int width, int height); | ||||
| template <typename T> | ||||
| void bwToComponent(T *d_c, unsigned char *src, int width, int height); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -27,14 +27,15 @@ | |||
| #ifndef _DWT_H | ||||
| #define _DWT_H | ||||
| 
 | ||||
| template<typename T>  | ||||
| int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward); | ||||
| template <typename T> | ||||
| int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight, | ||||
|                 int stages, bool forward); | ||||
| 
 | ||||
| template<typename T> | ||||
| int writeNStage2DDWT(T *component_cuda, int width, int height,  | ||||
|                      int stages, const char * filename, const char * suffix); | ||||
| template<typename T> | ||||
| int writeLinear(T *component_cuda, int width, int height,  | ||||
|                      const char * filename, const char * suffix); | ||||
| template <typename T> | ||||
| int writeNStage2DDWT(T *component_cuda, int width, int height, int stages, | ||||
|                      const char *filename, const char *suffix); | ||||
| template <typename T> | ||||
| int writeLinear(T *component_cuda, int width, int height, const char *filename, | ||||
|                 const char *suffix); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -29,233 +29,204 @@ | |||
| /// POSSIBILITY OF SUCH DAMAGE.
 | ||||
| ///
 | ||||
| 
 | ||||
| 
 | ||||
| #ifndef DWT_COMMON_H | ||||
| #define	DWT_COMMON_H | ||||
| #define DWT_COMMON_H | ||||
| 
 | ||||
| 
 | ||||
| #include <cstdio> | ||||
| #include <algorithm> | ||||
| #include <cstdio> | ||||
| #include <vector> | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // compile time minimum macro
 | ||||
| #define CTMIN(a,b) (((a) < (b)) ? (a) : (b)) | ||||
| 
 | ||||
| 
 | ||||
| #define CTMIN(a, b) (((a) < (b)) ? (a) : (b)) | ||||
| 
 | ||||
| // performance testing macros
 | ||||
| #if defined(GPU_DWT_TESTING) | ||||
|   #define PERF_BEGIN  \ | ||||
|   { \ | ||||
|     dwt_cuda::CudaDWTTester PERF_TESTER; \ | ||||
|     for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \ | ||||
|     { \ | ||||
| #define PERF_BEGIN                                                             \ | ||||
|   {                                                                            \ | ||||
|     dwt_cuda::CudaDWTTester PERF_TESTER;                                       \ | ||||
|     for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) {             \ | ||||
|       PERF_TESTER.beginTestIteration(); | ||||
| 
 | ||||
|   #define PERF_END(PERF_NAME, PERF_W, PERF_H)  \ | ||||
|       PERF_TESTER.endTestIteration(); \ | ||||
|     } \ | ||||
|     PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \ | ||||
| #define PERF_END(PERF_NAME, PERF_W, PERF_H)                                    \ | ||||
|   PERF_TESTER.endTestIteration();                                              \ | ||||
|   }                                                                            \ | ||||
|   PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H);                      \ | ||||
|   } | ||||
| #else // GPU_DWT_TESTING
 | ||||
|   #define PERF_BEGIN | ||||
|   #define PERF_END(PERF_NAME, PERF_W, PERF_H) | ||||
| #define PERF_BEGIN | ||||
| #define PERF_END(PERF_NAME, PERF_W, PERF_H) | ||||
| #endif // GPU_DWT_TESTING
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
| /// Divide and round up.
 | ||||
| template <typename T> | ||||
| __device__ __host__ inline T divRndUp(const T &n, const T &d) { | ||||
|   return (n / d) + ((n % d) ? 1 : 0); | ||||
| } | ||||
| 
 | ||||
|   /// Divide and round up.
 | ||||
|   template <typename T> | ||||
|   __device__ __host__ inline T divRndUp(const T & n, const T & d) { | ||||
|     return (n / d) + ((n % d) ? 1 : 0); | ||||
| // 9/7 forward DWT lifting schema coefficients
 | ||||
| const float f97Predict1 = -1.586134342;  ///< forward 9/7 predict 1
 | ||||
| const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
 | ||||
| const float f97Predict2 = 0.8829110762;  ///< forward 9/7 predict 2
 | ||||
| const float f97Update2 = 0.4435068522;   ///< forward 9/7 update 2
 | ||||
| 
 | ||||
| // 9/7 reverse DWT lifting schema coefficients
 | ||||
| const float r97update2 = -f97Update2;   ///< undo 9/7 update 2
 | ||||
| const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
 | ||||
| const float r97update1 = -f97Update1;   ///< undo 9/7 update 1
 | ||||
| const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
 | ||||
| 
 | ||||
| // FDWT 9/7 scaling coefficients
 | ||||
| const float scale97Mul = 1.23017410491400f; | ||||
| const float scale97Div = 1.0 / scale97Mul; | ||||
| 
 | ||||
| // 5/3 forward DWT lifting schema coefficients
 | ||||
| const float forward53Predict = -0.5f; /// forward 5/3 predict
 | ||||
| const float forward53Update = 0.25f;  /// forward 5/3 update
 | ||||
| 
 | ||||
| // 5/3 forward DWT lifting schema coefficients
 | ||||
| const float reverse53Update = -forward53Update;   /// undo 5/3 update
 | ||||
| const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
 | ||||
| 
 | ||||
| /// Functor which adds scaled sum of neighbors to given central pixel.
 | ||||
| struct AddScaledSum { | ||||
|   const float scale; // scale of neighbors
 | ||||
|   __device__ AddScaledSum(const float scale) : scale(scale) {} | ||||
|   __device__ void operator()(const float p, float &c, const float n) const { | ||||
| 
 | ||||
|     // if(threadIdx.x == 0) {
 | ||||
| 
 | ||||
|     //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
 | ||||
|     //   scale * (p + n) );
 | ||||
| 
 | ||||
|     // }
 | ||||
| 
 | ||||
|     c += scale * (p + n); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Returns index ranging from 0 to num threads, such that first half
 | ||||
| /// of threads get even indices and others get odd indices. Each thread
 | ||||
| /// gets different index.
 | ||||
| /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
 | ||||
| ///                              parityIdx:   0  2  4  6  1  3  5  7
 | ||||
| /// @tparam THREADS  total count of participating threads
 | ||||
| /// @return parity-separated index of thread
 | ||||
| template <int THREADS> __device__ inline int parityIdx() { | ||||
|   return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); | ||||
| } | ||||
| 
 | ||||
| /// size of shared memory
 | ||||
| #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) | ||||
| const int SHM_SIZE = 48 * 1024; | ||||
| #else | ||||
| const int SHM_SIZE = 16 * 1024; | ||||
| #endif | ||||
| 
 | ||||
| /// Performance and return code tester.
 | ||||
 | ||||
| class CudaDWTTester { | ||||
| private: | ||||
|   static bool testRunning;  ///< true if any test is currently running
 | ||||
|   cudaEvent_t beginEvent;   ///< begin CUDA event
 | ||||
|   cudaEvent_t endEvent;     ///< end CUDA event
 | ||||
|   std::vector<float> times; ///< collected times
 | ||||
|   const bool disabled;      ///< true if this object is disabled
 | ||||
| public: | ||||
|   /// Checks CUDA related error.
 | ||||
|   /// @param status   return code to be checked
 | ||||
|   /// @param message  message to be shown if there was an error
 | ||||
|   /// @return true if there was no error, false otherwise
 | ||||
|   static bool check(const cudaError_t &status, const char *message) { | ||||
| #if defined(GPU_DWT_TESTING) | ||||
|     if ((!testRunning) && status != cudaSuccess) { | ||||
|       const char *errorString = cudaGetErrorString(status); | ||||
|       fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString); | ||||
|       fflush(stderr); | ||||
|       return false; | ||||
|     } | ||||
| #endif // GPU_DWT_TESTING
 | ||||
|     return true; | ||||
|   } | ||||
| 
 | ||||
|    | ||||
|   // 9/7 forward DWT lifting schema coefficients
 | ||||
|   const float f97Predict1 = -1.586134342;   ///< forward 9/7 predict 1
 | ||||
|   const float f97Update1 = -0.05298011854;  ///< forward 9/7 update 1
 | ||||
|   const float f97Predict2 = 0.8829110762;   ///< forward 9/7 predict 2
 | ||||
|   const float f97Update2 = 0.4435068522;    ///< forward 9/7 update 2
 | ||||
| 
 | ||||
| 
 | ||||
|   // 9/7 reverse DWT lifting schema coefficients
 | ||||
|   const float r97update2 = -f97Update2;    ///< undo 9/7 update 2
 | ||||
|   const float r97predict2 = -f97Predict2;  ///< undo 9/7 predict 2
 | ||||
|   const float r97update1 = -f97Update1;    ///< undo 9/7 update 1
 | ||||
|   const float r97Predict1 = -f97Predict1;  ///< undo 9/7 predict 1
 | ||||
|    | ||||
|   // FDWT 9/7 scaling coefficients
 | ||||
|   const float scale97Mul = 1.23017410491400f; | ||||
|   const float scale97Div = 1.0 / scale97Mul; | ||||
|    | ||||
|    | ||||
|   // 5/3 forward DWT lifting schema coefficients
 | ||||
|   const float forward53Predict = -0.5f;   /// forward 5/3 predict
 | ||||
|   const float forward53Update = 0.25f;    /// forward 5/3 update
 | ||||
|    | ||||
|   // 5/3 forward DWT lifting schema coefficients
 | ||||
|   const float reverse53Update = -forward53Update;    /// undo 5/3 update
 | ||||
|   const float reverse53Predict = -forward53Predict;  /// undo 5/3 predict
 | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Functor which adds scaled sum of neighbors to given central pixel.
 | ||||
|   struct AddScaledSum { | ||||
|     const float scale;  // scale of neighbors
 | ||||
|     __device__ AddScaledSum(const float scale) : scale(scale) {} | ||||
|     __device__ void operator()(const float p, float & c, const float n) const { | ||||
| 
 | ||||
|       // if(threadIdx.x == 0) {
 | ||||
| 
 | ||||
|       //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
 | ||||
|        | ||||
|       // }
 | ||||
|        | ||||
|       c += scale * (p + n); | ||||
|     } | ||||
|   }; | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Returns index ranging from 0 to num threads, such that first half
 | ||||
|   /// of threads get even indices and others get odd indices. Each thread
 | ||||
|   /// gets different index.
 | ||||
|   /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
 | ||||
|   ///                              parityIdx:   0  2  4  6  1  3  5  7
 | ||||
|   /// @tparam THREADS  total count of participating threads
 | ||||
|   /// @return parity-separated index of thread
 | ||||
|   template <int THREADS> | ||||
|   __device__ inline int parityIdx() { | ||||
|     return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); | ||||
|   /// Checks last kernel call for errors.
 | ||||
|   /// @param message  description of the kernel call
 | ||||
|   /// @return true if there was no error, false otherwise
 | ||||
|   static bool checkLastKernelCall(const char *message) { | ||||
| #if defined(GPU_DWT_TESTING) | ||||
|     return testRunning ? true : check(cudaThreadSynchronize(), message); | ||||
| #else  // GPU_DWT_TESTING
 | ||||
|     return true; | ||||
| #endif // GPU_DWT_TESTING
 | ||||
|   } | ||||
| 
 | ||||
|   /// Initializes DWT tester for time measurement
 | ||||
|   CudaDWTTester() : disabled(testRunning) {} | ||||
| 
 | ||||
|   /// Gets preferred number of iterations
 | ||||
 | ||||
|   int getNumIterations() { return disabled ? 1 : 31; } | ||||
| 
 | ||||
|   /// size of shared memory
 | ||||
|   #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) | ||||
|   const int SHM_SIZE = 48 * 1024; | ||||
|   #else | ||||
|   const int SHM_SIZE = 16 * 1024; | ||||
|   #endif | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Perrformance and return code tester.
 | ||||
|   class CudaDWTTester { | ||||
|   private: | ||||
|     static bool testRunning;    ///< true if any test is currently running
 | ||||
|     cudaEvent_t beginEvent;     ///< begin CUDA event
 | ||||
|     cudaEvent_t endEvent;       ///< end CUDA event
 | ||||
|     std::vector<float> times;   ///< collected times
 | ||||
|     const bool disabled;        ///< true if this object is disabled
 | ||||
|   public: | ||||
|     /// Checks CUDA related error.
 | ||||
|     /// @param status   return code to be checked
 | ||||
|     /// @param message  message to be shown if there was an error
 | ||||
|     /// @return true if there was no error, false otherwise
 | ||||
|     static bool check(const cudaError_t & status, const char * message) { | ||||
|       #if defined(GPU_DWT_TESTING) | ||||
|       if((!testRunning) && status != cudaSuccess) { | ||||
|         const char * errorString = cudaGetErrorString(status); | ||||
|         fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString); | ||||
|         fflush(stderr); | ||||
|         return false; | ||||
|       } | ||||
|       #endif // GPU_DWT_TESTING
 | ||||
|       return true; | ||||
|   /// Starts one test iteration.
 | ||||
|   void beginTestIteration() { | ||||
|     if (!disabled) { | ||||
|       cudaEventCreate(&beginEvent); | ||||
|       cudaEventCreate(&endEvent); | ||||
|       cudaEventRecord(beginEvent, 0); | ||||
|       testRunning = true; | ||||
|     } | ||||
| 
 | ||||
|     /// Checks last kernel call for errors.
 | ||||
|     /// @param message  description of the kernel call
 | ||||
|     /// @return true if there was no error, false otherwise
 | ||||
|     static bool checkLastKernelCall(const char * message) { | ||||
|       #if defined(GPU_DWT_TESTING) | ||||
|       return testRunning ? true : check(cudaThreadSynchronize(), message); | ||||
|       #else // GPU_DWT_TESTING
 | ||||
|       return true; | ||||
|       #endif // GPU_DWT_TESTING
 | ||||
|     } | ||||
|      | ||||
|     /// Initializes DWT tester for time measurement
 | ||||
|     CudaDWTTester() : disabled(testRunning) {} | ||||
|      | ||||
|     /// Gets rpefered number of iterations
 | ||||
|     int getNumIterations() { | ||||
|       return disabled ? 1 : 31; | ||||
|     } | ||||
|      | ||||
|     /// Starts one test iteration.
 | ||||
|     void beginTestIteration() { | ||||
|       if(!disabled) { | ||||
|         cudaEventCreate(&beginEvent); | ||||
|         cudaEventCreate(&endEvent); | ||||
|         cudaEventRecord(beginEvent, 0); | ||||
|         testRunning = true; | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     /// Ends on etest iteration.
 | ||||
|     void endTestIteration() { | ||||
|       if(!disabled) { | ||||
|         float time; | ||||
|         testRunning = false; | ||||
|         cudaEventRecord(endEvent, 0); | ||||
|         cudaEventSynchronize(endEvent); | ||||
|         cudaEventElapsedTime(&time, beginEvent, endEvent); | ||||
|         cudaEventDestroy(beginEvent); | ||||
|         cudaEventDestroy(endEvent); | ||||
|         times.push_back(time); | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     /// Shows brief info about all iterations.
 | ||||
|     /// @param name   name of processing method
 | ||||
|     /// @param sizeX  width of processed image
 | ||||
|     /// @param sizeY  height of processed image
 | ||||
|     void showPerformance(const char * name, const int sizeX, const int sizeY) { | ||||
|       if(!disabled) { | ||||
|         // compute mean and median
 | ||||
|         std::sort(times.begin(), times.end()); | ||||
|         double sum = 0; | ||||
|         for(int i = times.size(); i--; ) { | ||||
|           sum += times[i]; | ||||
|         } | ||||
|         const double median = (times[times.size() / 2] | ||||
|                              + times[(times.size() - 1) / 2]) * 0.5f; | ||||
|         printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  " | ||||
|                "(%d x %d)\n", name, (sum / times.size()), median,  | ||||
|                times[times.size() - 1], sizeX, sizeY); | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
|    | ||||
|    | ||||
|    | ||||
|   /// Simple cudaMemcpy wrapped in performance tester.
 | ||||
|   /// @param dest  destination bufer
 | ||||
|   /// @param src   source buffer
 | ||||
|   /// @param sx    width of copied image
 | ||||
|   /// @param sy    height of copied image
 | ||||
|   template <typename T> | ||||
|   inline void memCopy(T * const dest, const T * const src, | ||||
|                       const size_t sx, const size_t sy) { | ||||
|     cudaError_t status; | ||||
|     PERF_BEGIN | ||||
|     status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice); | ||||
|     PERF_END("        memcpy", sx, sy) | ||||
|     CudaDWTTester::check(status, "memcpy device > device"); | ||||
|   } | ||||
| 
 | ||||
|   /// Ends one test iteration.
 | ||||
 | ||||
|   void endTestIteration() { | ||||
|     if (!disabled) { | ||||
|       float time; | ||||
|       testRunning = false; | ||||
|       cudaEventRecord(endEvent, 0); | ||||
|       cudaEventSynchronize(endEvent); | ||||
|       cudaEventElapsedTime(&time, beginEvent, endEvent); | ||||
|       cudaEventDestroy(beginEvent); | ||||
|       cudaEventDestroy(endEvent); | ||||
|       times.push_back(time); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Shows brief info about all iterations.
 | ||||
|   /// @param name   name of processing method
 | ||||
|   /// @param sizeX  width of processed image
 | ||||
|   /// @param sizeY  height of processed image
 | ||||
|   void showPerformance(const char *name, const int sizeX, const int sizeY) { | ||||
|     if (!disabled) { | ||||
|       // compute mean and median
 | ||||
|       std::sort(times.begin(), times.end()); | ||||
|       double sum = 0; | ||||
|       for (int i = times.size(); i--;) { | ||||
|         sum += times[i]; | ||||
|       } | ||||
|       const double median = | ||||
|           (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f; | ||||
|       printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  " | ||||
|              "(%d x %d)\n", | ||||
|              name, (sum / times.size()), median, times[times.size() - 1], sizeX, | ||||
|              sizeY); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Simple cudaMemcpy wrapped in performance tester.
 | ||||
| /// @param dest  destination buffer
 | ||||
 | ||||
| /// @param src   source buffer
 | ||||
| /// @param sx    width of copied image
 | ||||
| /// @param sy    height of copied image
 | ||||
| template <typename T> | ||||
| inline void memCopy(T *const dest, const T *const src, const size_t sx, | ||||
|                     const size_t sy) { | ||||
|   cudaError_t status; | ||||
|   PERF_BEGIN | ||||
|   status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice); | ||||
|   PERF_END("        memcpy", sx, sy) | ||||
|   CudaDWTTester::check(status, "memcpy device > device"); | ||||
| } | ||||
| 
 | ||||
| } // end of namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// DWT_COMMON_CUDA_H
 | ||||
| 
 | ||||
| #endif // DWT_COMMON_CUDA_H
 | ||||
|  |  | |||
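The `divRndUp` helper and the `parityIdx<THREADS>` mapping in the header above are pure integer formulas, so their behaviour can be checked on the host. The sketch below re-implements the same expressions outside of `__device__` context (the host copies are mine, not part of the commit) and reproduces the 8-thread example from the comment: threadIdx.x 0..7 maps to 0 2 4 6 1 3 5 7.

```cuda
#include <cstdio>

// Host copies of the two formulas from dwt_cuda's common.h (for checking only).
static int divRndUp(int n, int d) { return (n / d) + ((n % d) ? 1 : 0); }

template <int THREADS> static int parityIdx(int tid) {
  return (tid * 2) - (THREADS - 1) * (tid / (THREADS / 2));
}

int main() {
  // divRndUp: 7 items in groups of 2 -> 4 groups.
  printf("divRndUp(7, 2) = %d\n", divRndUp(7, 2));

  // parityIdx for 8 threads: first half gets even indices, second half odd.
  for (int tid = 0; tid < 8; ++tid)
    printf("tid %d -> %d\n", tid, parityIdx<8>(tid)); // prints 0 2 4 6 1 3 5 7
  return 0;
}
```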
|  | @ -56,57 +56,48 @@ | |||
| ///
 | ||||
| 
 | ||||
| #ifndef DWT_CUDA_H | ||||
| #define	DWT_CUDA_H | ||||
| 
 | ||||
| #define DWT_CUDA_H | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
| /// Forward 5/3 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Expected to be normalized into range [-128, 127].
 | ||||
| ///                Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
|   /// Forward 5/3 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Expected to be normalized into range [-128, 127].
 | ||||
|   ///                Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels); | ||||
| /// Reverse 5/3 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
| ///                Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU - will contain original image
 | ||||
| ///                in normalized range [-128, 127].
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
| /// Forward 9/7 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Input DWT coefficients. Should be normalized (in range
 | ||||
| ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU - format specified in common rules
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
|   /// Reverse 5/3 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
|   ///                Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU - will contain original image
 | ||||
|   ///                in normalized range [-128, 127].
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels); | ||||
|    | ||||
|    | ||||
|   /// Forward 9/7 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Input DWT coefficients. Should be normalized (in range 
 | ||||
|   ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU - format specified in common rules
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels); | ||||
|    | ||||
|    | ||||
|   /// Reverse 9/7 2D DWT. See common rules (above) for more details.
 | ||||
|   /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
|   ///                Will not be preserved (will be overwritten).
 | ||||
|   /// @param out     output buffer on GPU - will contain original image
 | ||||
|   ///                in normalized range [-0.5, 0.5].
 | ||||
|   /// @param sizeX   width of input image (in pixels)
 | ||||
|   /// @param sizeY   height of input image (in pixels)
 | ||||
|   /// @param levels  number of recursive DWT levels
 | ||||
|   void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels); | ||||
|    | ||||
| /// Reverse 9/7 2D DWT. See common rules (above) for more details.
 | ||||
| /// @param in      Input DWT coefficients. Format described in common rules.
 | ||||
| ///                Will not be preserved (will be overwritten).
 | ||||
| /// @param out     output buffer on GPU - will contain original image
 | ||||
| ///                in normalized range [-0.5, 0.5].
 | ||||
| /// @param sizeX   width of input image (in pixels)
 | ||||
| /// @param sizeY   height of input image (in pixels)
 | ||||
| /// @param levels  number of recursive DWT levels
 | ||||
| void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels); | ||||
| 
 | ||||
| } // namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// DWT_CUDA_H
 | ||||
| 
 | ||||
| #endif // DWT_CUDA_H
 | ||||
|  |  | |||
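The four entry points declared in this header share one calling convention: both buffers are expected to live on the GPU, the input is overwritten, and `levels` controls how many recursive decomposition levels are applied. A rough sketch of driving `fdwt97` is shown below; the header filename, the device-resident buffers, and the host-side normalization step are my assumptions based on the comments above, not code from the commit.

```cuda
#include <cuda_runtime.h>
#include "dwt_cuda.h" // assumed filename; declares dwt_cuda::fdwt97 (hunk above)

// Sketch: forward 9/7 DWT of a sizeX x sizeY float image that the host has
// already normalized into [-0.5, 0.5].
void runForward97(const float *h_normalized, float *h_coeffs,
                  int sizeX, int sizeY, int levels) {
  const size_t bytes = size_t(sizeX) * sizeY * sizeof(float);
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, bytes);
  cudaMalloc(&d_out, bytes);

  cudaMemcpy(d_in, h_normalized, bytes, cudaMemcpyHostToDevice);
  dwt_cuda::fdwt97(d_in, d_out, sizeX, sizeY, levels); // d_in is clobbered
  cudaMemcpy(h_coeffs, d_out, bytes, cudaMemcpyDeviceToHost);

  cudaFree(d_in);
  cudaFree(d_out);
}
```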
|  | @ -30,454 +30,411 @@ | |||
| /// POSSIBILITY OF SUCH DAMAGE.
 | ||||
| ///
 | ||||
| 
 | ||||
| 
 | ||||
| #ifndef IO_H | ||||
| #define	IO_H | ||||
| 
 | ||||
| #define IO_H | ||||
| 
 | ||||
| #include "common.h" | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
|    | ||||
|   /// Base for all IO classes - manages mirroring.
 | ||||
|   class DWTIO { | ||||
|   protected: | ||||
|     /// Handles mirroring of image at edges in a DWT correct way.
 | ||||
|     /// @param d      a position in the image (will be replaced by mirrored d)
 | ||||
|     /// @param sizeD  size of the image along the dimension of 'd'
 | ||||
|     __device__ static void mirror(int & d, const int & sizeD) { | ||||
|       // TODO: enable multiple mirroring:
 | ||||
| //      if(sizeD > 1) {
 | ||||
| //        if(d < 0) {
 | ||||
| //          const int underflow = -1 - d;
 | ||||
| //          const int phase = (underflow / (sizeD - 1)) & 1;
 | ||||
| //          const int remainder = underflow % (sizeD - 1);
 | ||||
| //          if(phase == 0) {
 | ||||
| //            d = remainder + 1;
 | ||||
| //          } else {
 | ||||
| //            d = sizeD - 2 - remainder;
 | ||||
| //          }
 | ||||
| //        } else if(d >= sizeD) {
 | ||||
| //          const int overflow = d - sizeD;
 | ||||
| //          const int phase = (overflow / (sizeD - 1)) & 1;
 | ||||
| //          const int remainder = overflow % (sizeD - 1);
 | ||||
| //          if(phase == 0) {
 | ||||
| //            d = sizeD - 2 - remainder;
 | ||||
| //          } else {
 | ||||
| //            d = remainder + 1;
 | ||||
| //          }
 | ||||
| //        }
 | ||||
| //      } else {
 | ||||
| //        d = 0;
 | ||||
| //      }
 | ||||
|   //for test the mirror's use Feb 17
 | ||||
|       if(d >= sizeD) { | ||||
|         d = 2 * sizeD - 2 - d; | ||||
|       } else if(d < 0) { | ||||
|         d = -d; | ||||
|       } | ||||
| /// Base for all IO classes - manages mirroring.
 | ||||
| class DWTIO { | ||||
| protected: | ||||
|   /// Handles mirroring of image at edges in a DWT correct way.
 | ||||
|   /// @param d      a position in the image (will be replaced by mirrored d)
 | ||||
|   /// @param sizeD  size of the image along the dimension of 'd'
 | ||||
|   __device__ static void mirror(int &d, const int &sizeD) { | ||||
|     // TODO: enable multiple mirroring:
 | ||||
|     //      if(sizeD > 1) {
 | ||||
|     //        if(d < 0) {
 | ||||
|     //          const int underflow = -1 - d;
 | ||||
|     //          const int phase = (underflow / (sizeD - 1)) & 1;
 | ||||
|     //          const int remainder = underflow % (sizeD - 1);
 | ||||
|     //          if(phase == 0) {
 | ||||
|     //            d = remainder + 1;
 | ||||
|     //          } else {
 | ||||
|     //            d = sizeD - 2 - remainder;
 | ||||
|     //          }
 | ||||
|     //        } else if(d >= sizeD) {
 | ||||
|     //          const int overflow = d - sizeD;
 | ||||
|     //          const int phase = (overflow / (sizeD - 1)) & 1;
 | ||||
|     //          const int remainder = overflow % (sizeD - 1);
 | ||||
|     //          if(phase == 0) {
 | ||||
|     //            d = sizeD - 2 - remainder;
 | ||||
|     //          } else {
 | ||||
|     //            d = remainder + 1;
 | ||||
|     //          }
 | ||||
|     //        }
 | ||||
|     //      } else {
 | ||||
|     //        d = 0;
 | ||||
|     //      }
 | ||||
|     // for test the mirror's use Feb 17
 | ||||
|     if (d >= sizeD) { | ||||
|       d = 2 * sizeD - 2 - d; | ||||
|     } else if (d < 0) { | ||||
|       d = -d; | ||||
|     } | ||||
|   }; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Base class for pixel loader and writer - manages computing start index,
 | ||||
| /// stride and end of image for loading column of pixels.
 | ||||
| /// @tparam T        type of image pixels
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO { | ||||
| protected: | ||||
|   int end;    ///< index of bottom neighbor of last pixel of column
 | ||||
 | ||||
|   int stride; ///< increment of pointer to get to next pixel
 | ||||
| 
 | ||||
|   /// Base class for pixel loader and writer - manages computing start index,
 | ||||
|   /// stride and end of image for loading column of pixels.
 | ||||
|   /// @tparam T        type of image pixels
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTPixelIO : protected DWTIO { | ||||
|   protected: | ||||
|     int end;         ///< index of bottom neightbor of last pixel of column
 | ||||
|     int stride;      ///< increment of pointer to get to next pixel
 | ||||
|   /// Initializes pixel IO - sets end index and a position of first pixel.
 | ||||
|   /// @param sizeX   width of the image
 | ||||
|   /// @param sizeY   height of the image
 | ||||
|   /// @param firstX  x-coordinate of first pixel to use
 | ||||
|   /// @param firstY  y-coordinate of first pixel to use
 | ||||
|   /// @return index of pixel at position [x, y] in the image
 | ||||
|   __device__ int initialize(const int sizeX, const int sizeY, int firstX, | ||||
|                             int firstY) { | ||||
|     // initialize all pointers and stride
 | ||||
|     end = CHECKED ? (sizeY * sizeX + firstX) : 0; | ||||
|     stride = sizeX; | ||||
|     return firstX + sizeX * firstY; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|     /// Initializes pixel IO - sets end index and a position of first pixel.
 | ||||
|     /// @param sizeX   width of the image
 | ||||
|     /// @param sizeY   height of the image
 | ||||
|     /// @param firstX  x-coordinate of first pixel to use
 | ||||
|     /// @param firstY  y-coordinate of first pixel to use
 | ||||
|     /// @return index of pixel at position [x, y] in the image
 | ||||
|     __device__ int initialize(const int sizeX, const int sizeY, | ||||
|                               int firstX, int firstY) { | ||||
|       // initialize all pointers and stride
 | ||||
|       end = CHECKED ? (sizeY * sizeX + firstX) : 0; | ||||
|       stride = sizeX; | ||||
|       return firstX + sizeX * firstY; | ||||
|     } | ||||
|   }; | ||||
| /// Writes reverse transformed pixels directly into output image.
 | ||||
| /// @tparam T        type of output pixels
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> { | ||||
| private: | ||||
|   int next; // index of the next pixel to be loaded
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|   /// Writes reverse transformed pixels directly into output image.
 | ||||
|   /// @tparam T        type of output pixels
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> { | ||||
|   private: | ||||
|     int next;   // index of the next pixel to be loaded
 | ||||
| 
 | ||||
|   public: | ||||
|     /// Initializes writer - sets output buffer and a position of first pixel.
 | ||||
|     /// @param sizeX   width of the image
 | ||||
|     /// @param sizeY   height of the image
 | ||||
|     /// @param firstX  x-coordinate of first pixel to write into
 | ||||
|     /// @param firstY  y-coordinate of first pixel to write into
 | ||||
|     __device__ void init(const int sizeX, const int sizeY,  | ||||
|                          int firstX, int firstY) { | ||||
|       if(firstX < sizeX) { | ||||
|         next = this->initialize(sizeX, sizeY, firstX, firstY); | ||||
|       } else { | ||||
|         this->end = 0; | ||||
|         this->stride = 0; | ||||
|         next = 0; | ||||
|       } | ||||
|     } | ||||
| 
 | ||||
|     /// Writes given value at next position and advances internal pointer while
 | ||||
|     /// correctly handling mirroring.
 | ||||
|     /// @param output  output image to write pixel into
 | ||||
|     /// @param value   value of the pixel to be written
 | ||||
|     __device__ void writeInto(T * const output, const T & value) { | ||||
|       if((!CHECKED) || (next != this->end)) { | ||||
|         output[next] = value; | ||||
|         next += this->stride; | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
| 
 | ||||
| 
 | ||||
|    | ||||
|   /// Loads pixels from input image.
 | ||||
|   /// @tparam T        type of image input pixels
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTPixelLoader | ||||
|           : protected VerticalDWTPixelIO<const T, CHECKED> { | ||||
|   private: | ||||
|     int last;  ///< index of last loaded pixel
 | ||||
|   public: | ||||
|    | ||||
| 
 | ||||
|   //******************* FOR TEST **********************
 | ||||
|   __device__ int getlast(){ | ||||
| 		return last; | ||||
| 	} | ||||
|   __device__ int getend(){ | ||||
| 		return this->end; | ||||
| 	} | ||||
|   __device__ int getstride(){ | ||||
| 		return this->stride; | ||||
| 	} | ||||
|   __device__ void setend(int a){ | ||||
|       this->end=a; | ||||
| 	} | ||||
| 	//******************* FOR TEST **********************
 | ||||
|    | ||||
|    | ||||
|    | ||||
|     /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|     /// @param sizeX   width of the image
 | ||||
|     /// @param sizeY   height of the image
 | ||||
|     /// @param firstX  x-coordinate of first pixel to load
 | ||||
|     /// @param firstY  y-coordinate of first pixel to load
 | ||||
|     __device__ void init(const int sizeX, const int sizeY, | ||||
|                          int firstX, int firstY) { | ||||
|       // correctly mirror x coordinate
 | ||||
|       this->mirror(firstX, sizeX); | ||||
|        | ||||
|       // 'last' always points to already loaded pixel (subtract sizeX = stride)
 | ||||
|       last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX; | ||||
|       //last = (FirstX + sizeX * FirstY) - sizeX
 | ||||
|     } | ||||
|      | ||||
|     /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|     /// uninitialized stuff.
 | ||||
|     __device__ void clear() { | ||||
| public: | ||||
|   /// Initializes writer - sets output buffer and a position of first pixel.
 | ||||
|   /// @param sizeX   width of the image
 | ||||
|   /// @param sizeY   height of the image
 | ||||
|   /// @param firstX  x-coordinate of first pixel to write into
 | ||||
|   /// @param firstY  y-coordinate of first pixel to write into
 | ||||
|   __device__ void init(const int sizeX, const int sizeY, int firstX, | ||||
|                        int firstY) { | ||||
|     if (firstX < sizeX) { | ||||
|       next = this->initialize(sizeX, sizeY, firstX, firstY); | ||||
|     } else { | ||||
|       this->end = 0; | ||||
|       this->stride = 0; | ||||
|       this->last = 0; | ||||
|       next = 0; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Writes given value at next position and advances internal pointer while
 | ||||
|   /// correctly handling mirroring.
 | ||||
|   /// @param output  output image to write pixel into
 | ||||
|   /// @param value   value of the pixel to be written
 | ||||
|   __device__ void writeInto(T *const output, const T &value) { | ||||
|     if ((!CHECKED) || (next != this->end)) { | ||||
|       output[next] = value; | ||||
|       next += this->stride; | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Loads pixels from input image.
 | ||||
| /// @tparam T        type of image input pixels
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> { | ||||
| private: | ||||
|   int last; ///< index of last loaded pixel
 | ||||
| public: | ||||
|   //******************* FOR TEST **********************
 | ||||
|   __device__ int getlast() { return last; } | ||||
|   __device__ int getend() { return this->end; } | ||||
|   __device__ int getstride() { return this->stride; } | ||||
|   __device__ void setend(int a) { this->end = a; } | ||||
|   //******************* FOR TEST **********************
 | ||||
| 
 | ||||
|   /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|   /// @param sizeX   width of the image
 | ||||
|   /// @param sizeY   height of the image
 | ||||
|   /// @param firstX  x-coordinate of first pixel to load
 | ||||
|   /// @param firstY  y-coordinate of first pixel to load
 | ||||
|   __device__ void init(const int sizeX, const int sizeY, int firstX, | ||||
|                        int firstY) { | ||||
|     // correctly mirror x coordinate
 | ||||
|     this->mirror(firstX, sizeX); | ||||
| 
 | ||||
|     // 'last' always points to already loaded pixel (subtract sizeX = stride)
 | ||||
|     last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX; | ||||
|     // last = (FirstX + sizeX * FirstY) - sizeX
 | ||||
|   } | ||||
| 
 | ||||
|   /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|   /// uninitialized stuff.
 | ||||
|   __device__ void clear() { | ||||
|     this->end = 0; | ||||
|     this->stride = 0; | ||||
|     this->last = 0; | ||||
|   } | ||||
| 
 | ||||
|   /// Gets another pixel and advances internal pointer to following one.
 | ||||
 | ||||
|   /// @param input  input image to load next pixel from
 | ||||
|   /// @return next pixel from given image
 | ||||
|   __device__ T loadFrom(const T *const input) { | ||||
|     last += this->stride; | ||||
|     if (CHECKED && (last == this->end)) { | ||||
|       last -= 2 * this->stride; | ||||
|       this->stride = -this->stride; // reverse loader's direction
 | ||||
|     } | ||||
|     // avoid reading from negative indices if loader is checked
 | ||||
|     // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
 | ||||
|     // checked variant later
 | ||||
|     if (last < 0) { | ||||
|       return 0; | ||||
|     } | ||||
| 
 | ||||
|     /// Gets another pixel and advancees internal pointer to following one.
 | ||||
|     /// @param input  input image to load next pixel from
 | ||||
|     /// @return next pixel from given image
 | ||||
|     __device__ T loadFrom(const T * const input) { | ||||
|       last += this->stride; | ||||
|       if(CHECKED && (last == this->end)) { | ||||
|         last -= 2 * this->stride; | ||||
|         this->stride = -this->stride; // reverse loader's direction
 | ||||
|       } | ||||
|       // avoid reading from negative indices if loader is checked
 | ||||
|       // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
 | ||||
|       if(last < 0 ) { | ||||
|         return 0; | ||||
|       } | ||||
|     return input[last]; | ||||
|     // return this->end;
 | ||||
|     // return last;
 | ||||
|     // return this->stride;
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|       return input[last]; | ||||
|       // return this->end;
 | ||||
|       // return last;
 | ||||
|       // return this->stride;
 | ||||
|     } | ||||
|   }; | ||||
| /// Base for band write and loader. Manages computing strides and pointers
 | ||||
| /// to first and last pixels in a linearly-stored-bands correct way.
 | ||||
| /// @tparam T        type of band coefficients
 | ||||
| /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO { | ||||
| protected: | ||||
|   /// index of bottom neighbor of last pixel of loaded column
 | ||||
|   int end; | ||||
| 
 | ||||
|   /// increment of index to get from highpass band to the lowpass one
 | ||||
|   int strideHighToLow; | ||||
| 
 | ||||
|   /// increment of index to get from the lowpass band to the highpass one
 | ||||
|   int strideLowToHigh; | ||||
| 
 | ||||
|   /// Base for band write and loader. Manages computing strides and pointers
 | ||||
|   /// to first and last pixels in a linearly-stored-bands correct way.
 | ||||
|   /// @tparam T        type of band coefficients
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTBandIO : protected DWTIO { | ||||
|   protected: | ||||
|     /// index of bottom neighbor of last pixel of loaded column
 | ||||
|     int end; | ||||
|   /// Initializes IO - sets size of image and a position of first pixel.
 | ||||
|   /// @param imageSizeX   width of the image
 | ||||
|   /// @param imageSizeY   height of the image
 | ||||
|   /// @param firstX       x-coordinate of first pixel to use
 | ||||
|   ///                     (Parity determines vertically low or high band.)
 | ||||
|   /// @param firstY       y-coordinate of first pixel to use
 | ||||
|   ///                     (Parity determines horizontally low or high band.)
 | ||||
|   /// @return index of first item specified by firstX and firstY
 | ||||
|   __device__ int initialize(const int imageSizeX, const int imageSizeY, | ||||
|                             int firstX, int firstY) { | ||||
|     // index of first pixel (topmost one) of the column with index firstX
 | ||||
|     int columnOffset = firstX / 2; | ||||
| 
 | ||||
|     /// increment of index to get from highpass band to the lowpass one
 | ||||
|     int strideHighToLow; | ||||
|     // difference between indices of two vertically neighboring pixels
 | ||||
|     // in the same band
 | ||||
|     int verticalStride; | ||||
| 
 | ||||
|     /// increment of index to get from the lowpass band to the highpass one
 | ||||
|     int strideLowToHigh; | ||||
| 
 | ||||
|     /// Initializes IO - sets size of image and a position of first pixel.
 | ||||
|     /// @param imageSizeX   width of the image
 | ||||
|     /// @param imageSizeY   height of the image
 | ||||
|     /// @param firstX       x-coordinate of first pixel to use
 | ||||
|     ///                     (Parity determines vertically low or high band.)
 | ||||
|     /// @param firstY       y-coordinate of first pixel to use
 | ||||
|     ///                     (Parity determines horizontally low or high band.)
 | ||||
|     /// @return index of first item specified by firstX and firstY
 | ||||
|     __device__ int initialize(const int imageSizeX, const int imageSizeY, | ||||
|                               int firstX, int firstY) { | ||||
|       // index of first pixel (topmost one) of the column with index firstX
 | ||||
|       int columnOffset = firstX / 2; | ||||
|        | ||||
|       // difference between indices of two vertically neighboring pixels
 | ||||
|       // in the same band
 | ||||
|       int verticalStride; | ||||
|        | ||||
|       // resolve index of first pixel according to horizontal parity
 | ||||
|       if(firstX & 1) { | ||||
|         // first pixel in one of right bands
 | ||||
|         verticalStride = imageSizeX / 2; | ||||
|         columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2); | ||||
|         strideLowToHigh = (imageSizeX * imageSizeY) / 2; | ||||
|       } else { | ||||
|         // first pixel in one of left bands
 | ||||
|         verticalStride = imageSizeX / 2 + (imageSizeX & 1); | ||||
|         strideLowToHigh = divRndUp(imageSizeY, 2)  * imageSizeX; | ||||
|       } | ||||
|        | ||||
|       // set the other stride
 | ||||
|       strideHighToLow = verticalStride - strideLowToHigh; | ||||
| 
 | ||||
|       // compute index of coefficient which indicates end of image
 | ||||
|       if(CHECKED) { | ||||
|         end = columnOffset                            // right column
 | ||||
|                 + (imageSizeY / 2) * verticalStride   // right row
 | ||||
|                 + (imageSizeY & 1) * strideLowToHigh; // possibly in high band
 | ||||
|       } else { | ||||
|         end = 0; | ||||
|       } | ||||
| 
 | ||||
| 
 | ||||
| 	//***********for test**************
 | ||||
| 	//	end = CHECKED;
 | ||||
| 	//***********for test**************
 | ||||
| 	 | ||||
| 	 | ||||
|       // finally, return index of the first item
 | ||||
|       return columnOffset                        // right column
 | ||||
|               + (firstY / 2) * verticalStride    // right row
 | ||||
|               + (firstY & 1) * strideLowToHigh;  // possibly in high band
 | ||||
|     } | ||||
|   }; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|   /// Directly loads coefficients from four consecutively stored transformed
 | ||||
|   /// bands.
 | ||||
|   /// @tparam T        type of input band coefficients
 | ||||
|   /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> { | ||||
|   private: | ||||
|     int last;  ///< index of last loaded pixel
 | ||||
| 
 | ||||
|     /// Checks internal index and possibly reverses direction of loader.
 | ||||
|     /// (Handles mirroring at the bottom of the image.)
 | ||||
|     /// @param input   input image to load next coefficient from
 | ||||
|     /// @param stride  stride to use now (one of two loader's strides)
 | ||||
|     /// @return loaded coefficient
 | ||||
|     __device__ T updateAndLoad(const T * const input, const int & stride) { | ||||
|       last += stride; | ||||
|       if(CHECKED && (last == this->end)) { | ||||
|         // undo last two updates of index (to get to previous mirrored item)
 | ||||
|         last -= (this->strideLowToHigh + this->strideHighToLow); | ||||
| 
 | ||||
|         // swap and reverse strides (to move up in the loaded column now)
 | ||||
|         const int temp = this->strideLowToHigh; | ||||
|         this->strideLowToHigh = -this->strideHighToLow; | ||||
|         this->strideHighToLow = -temp; | ||||
|       } | ||||
|       if(last < 0 ) { | ||||
|         return 0; | ||||
|       } | ||||
|       // avoid reading from negative indices if loader is checked
 | ||||
|       // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
 | ||||
|       return input[last]; | ||||
|     } | ||||
|   public: | ||||
| 
 | ||||
|     /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|     /// @param imageSizeX   width of the image
 | ||||
|     /// @param imageSizeY   height of the image
 | ||||
|     /// @param firstX       x-coordinate of first pixel to load
 | ||||
|     ///                     (Parity determines vertically low or high band.)
 | ||||
|     /// @param firstY       y-coordinate of first pixel to load
 | ||||
|     ///                     (Parity determines horizontally low or high band.)
 | ||||
|     __device__ void init(const int imageSizeX, const int imageSizeY, | ||||
|                          int firstX, const int firstY) { | ||||
|       this->mirror(firstX, imageSizeX); | ||||
|       last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
|        | ||||
|       // adjust to point to previous item
 | ||||
|       last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;  | ||||
|     // resolve index of first pixel according to horizontal parity
 | ||||
|     if (firstX & 1) { | ||||
|       // first pixel in one of right bands
 | ||||
|       verticalStride = imageSizeX / 2; | ||||
|       columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2); | ||||
|       strideLowToHigh = (imageSizeX * imageSizeY) / 2; | ||||
|     } else { | ||||
|       // first pixel in one of left bands
 | ||||
|       verticalStride = imageSizeX / 2 + (imageSizeX & 1); | ||||
|       strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX; | ||||
|     } | ||||
| 
 | ||||
|     /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|     /// uninitialized stuff.
 | ||||
|     __device__ void clear() { | ||||
|       this->end = 0; | ||||
|       this->strideHighToLow = 0; | ||||
|       this->strideLowToHigh = 0; | ||||
|       this->last = 0; | ||||
|     // set the other stride
 | ||||
|     strideHighToLow = verticalStride - strideLowToHigh; | ||||
| 
 | ||||
|     // compute index of coefficient which indicates end of image
 | ||||
|     if (CHECKED) { | ||||
|       end = columnOffset                          // right column
 | ||||
|             + (imageSizeY / 2) * verticalStride   // right row
 | ||||
|             + (imageSizeY & 1) * strideLowToHigh; // possibly in high band
 | ||||
|     } else { | ||||
|       end = 0; | ||||
|     } | ||||
| 
 | ||||
|     /// Gets another coefficient from lowpass band and advances internal index.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in high band.
 | ||||
|     /// @param input   input image to load next coefficient from
 | ||||
|     /// @return next coefficient from the lowpass band of the given image
 | ||||
|     __device__ T loadLowFrom(const T * const input) { | ||||
|       return updateAndLoad(input, this->strideHighToLow); | ||||
|     //***********for test**************
 | ||||
|     //	end = CHECKED;
 | ||||
|     //***********for test**************
 | ||||
| 
 | ||||
|     // finally, return index of the first item
 | ||||
|     return columnOffset                      // right column
 | ||||
|            + (firstY / 2) * verticalStride   // right row
 | ||||
|            + (firstY & 1) * strideLowToHigh; // possibly in high band
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| /// Directly loads coefficients from four consecutively stored transformed
 | ||||
| /// bands.
 | ||||
| /// @tparam T        type of input band coefficients
 | ||||
| /// @tparam CHECKED  true = be prepared for the image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> { | ||||
| private: | ||||
|   int last; ///< index of last loaded pixel
 | ||||
| 
 | ||||
|   /// Checks internal index and possibly reverses direction of loader.
 | ||||
|   /// (Handles mirroring at the bottom of the image.)
 | ||||
|   /// @param input   input image to load next coefficient from
 | ||||
|   /// @param stride  stride to use now (one of two loader's strides)
 | ||||
|   /// @return loaded coefficient
 | ||||
|   __device__ T updateAndLoad(const T *const input, const int &stride) { | ||||
|     last += stride; | ||||
|     if (CHECKED && (last == this->end)) { | ||||
|       // undo last two updates of index (to get to previous mirrored item)
 | ||||
|       last -= (this->strideLowToHigh + this->strideHighToLow); | ||||
| 
 | ||||
|       // swap and reverse strides (to move up in the loaded column now)
 | ||||
|       const int temp = this->strideLowToHigh; | ||||
|       this->strideLowToHigh = -this->strideHighToLow; | ||||
|       this->strideHighToLow = -temp; | ||||
|     } | ||||
| 
 | ||||
|     /// Gets another coefficient from the highpass band and advances index.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in high band.
 | ||||
|     /// @param input   input image to load next coefficient from
 | ||||
|     /// @return next coefficient from the highpass band of the given image
 | ||||
|     __device__ T loadHighFrom(const T * const input) { | ||||
|       return updateAndLoad(input, this->strideLowToHigh); | ||||
|     if (last < 0) { | ||||
|       return 0; | ||||
|     } | ||||
|     // avoid reading from negative indices if loader is checked
 | ||||
|     // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
 | ||||
|     // checked variant later
 | ||||
|     return input[last]; | ||||
|   } | ||||
| 
 | ||||
|   }; | ||||
| public: | ||||
|   /// Initializes loader - sets input size and a position of first pixel.
 | ||||
|   /// @param imageSizeX   width of the image
 | ||||
|   /// @param imageSizeY   height of the image
 | ||||
|   /// @param firstX       x-coordinate of first pixel to load
 | ||||
|   ///                     (Parity determines vertically low or high band.)
 | ||||
|   /// @param firstY       y-coordinate of first pixel to load
 | ||||
|   ///                     (Parity determines horizontally low or high band.)
 | ||||
|   __device__ void init(const int imageSizeX, const int imageSizeY, int firstX, | ||||
|                        const int firstY) { | ||||
|     this->mirror(firstX, imageSizeX); | ||||
|     last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
| 
 | ||||
|     // adjust to point to previous item
 | ||||
|     last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow; | ||||
|   } | ||||
| 
 | ||||
|   /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|   /// uninitialized stuff.
 | ||||
|   __device__ void clear() { | ||||
|     this->end = 0; | ||||
|     this->strideHighToLow = 0; | ||||
|     this->strideLowToHigh = 0; | ||||
|     this->last = 0; | ||||
|   } | ||||
| 
 | ||||
|   /// Gets another coefficient from lowpass band and advances internal index.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in high band.
 | ||||
|   /// @param input   input image to load next coefficient from
 | ||||
|   /// @return next coefficient from the lowpass band of the given image
 | ||||
|   __device__ T loadLowFrom(const T *const input) { | ||||
|     return updateAndLoad(input, this->strideHighToLow); | ||||
|   } | ||||
| 
 | ||||
|   /// Directly saves coefficients into four transformed bands.
 | ||||
|   /// @tparam T        type of output band coefficients
 | ||||
|   /// @tparam CHECKED  true = be prepared for the image boundary, false = don't care
 | ||||
|   template <typename T, bool CHECKED> | ||||
|   class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> { | ||||
|   private: | ||||
|     int next;  ///< index of last loaded pixel
 | ||||
|   /// Gets another coefficient from the highpass band and advances index.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in high band.
 | ||||
|   /// @param input   input image to load next coefficient from
 | ||||
|   /// @return next coefficient from the highpass band of the given image
 | ||||
|   __device__ T loadHighFrom(const T *const input) { | ||||
|     return updateAndLoad(input, this->strideLowToHigh); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
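Editor's note: the band loader declared above is easier to follow with a small usage sketch. The following is illustrative only and not part of this commit; the helper name loadColumnSketch, the destination buffer, the starting row, and the call order are assumptions, while the init/loadLowFrom/loadHighFrom interface is taken from the declarations above.

// Illustrative sketch only -- assumes the dwt_cuda namespace from io.h above.
// Column coordinate, destination buffer and starting row are placeholders; the
// loader derives low/high band membership from the parity of the coordinates
// passed to init(), as documented in the comments above.
template <typename T>
__device__ void loadColumnSketch(const T *input, int sizeX, int sizeY,
                                 int colX, T *dest, int numPairs) {
  dwt_cuda::VerticalDWTBandLoader<T, true> loader;  // CHECKED = true
  loader.init(sizeX, sizeY, colX, 0);               // start at row 0
  for (int i = 0; i < numPairs; ++i) {
    // alternate between the two bands while walking down the column; which
    // call comes first depends on the vertical parity of the first pixel
    dest[2 * i]     = loader.loadLowFrom(input);
    dest[2 * i + 1] = loader.loadHighFrom(input);
  }
}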
|     /// Checks internal index and possibly stops the writer.
 | ||||
|     /// (Handles mirroring at edges of the image.)
 | ||||
|     /// @param output  output buffer
 | ||||
|     /// @param item    item to put into the output
 | ||||
|     /// @param stride  increment of the pointer to get to next output index
 | ||||
|     __device__ int saveAndUpdate(T * const output, const T & item, | ||||
|                                   const int & stride) { | ||||
| //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){		//test, Mar 20					  
 | ||||
|       if((!CHECKED) || (next != this->end)) { | ||||
|         // if(next == 4) {
 | ||||
|         //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
 | ||||
|         // }
 | ||||
|         output[next] = item; | ||||
|         next += stride; | ||||
|       }  | ||||
| //	}
 | ||||
|       // if((!CHECKED) || (next != this->end)) { //the real one
 | ||||
|         // output[next] = item;
 | ||||
|         // next += stride;  //stride has been test
 | ||||
| /// Directly saves coefficients into four transformed bands.
 | ||||
| /// @tparam T        type of output band coefficients
 | ||||
| /// @tparam CHECKED  true = be prepared for the image boundary, false = don't care
 | ||||
| template <typename T, bool CHECKED> | ||||
| class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> { | ||||
| private: | ||||
|   int next; ///< index of last loaded pixel
 | ||||
| 
 | ||||
|   /// Checks internal index and possibly stops the writer.
 | ||||
|   /// (Handles mirroring at edges of the image.)
 | ||||
|   /// @param output  output buffer
 | ||||
|   /// @param item    item to put into the output
 | ||||
|   /// @param stride  increment of the pointer to get to next output index
 | ||||
|   __device__ int saveAndUpdate(T *const output, const T &item, | ||||
|                                const int &stride) { | ||||
|     //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
 | ||||
|     ////test, Mar 20
 | ||||
|     if ((!CHECKED) || (next != this->end)) { | ||||
|       // if(next == 4) {
 | ||||
|       //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
 | ||||
|       // }
 | ||||
| 	return next; | ||||
|       output[next] = item; | ||||
|       next += stride; | ||||
|     } | ||||
|   public: | ||||
|     //	}
 | ||||
|     // if((!CHECKED) || (next != this->end)) { //the real one
 | ||||
|     // output[next] = item;
 | ||||
|     // next += stride;  //stride has been test
 | ||||
|     // }
 | ||||
|     return next; | ||||
|   } | ||||
| 
 | ||||
|     /// Initializes writer - sets output size and a position of first pixel.
 | ||||
|     /// @param output       output image
 | ||||
|     /// @param imageSizeX   width of the image
 | ||||
|     /// @param imageSizeY   height of the image
 | ||||
|     /// @param firstX       x-coordinate of first pixel to write
 | ||||
|     ///                     (Parity determines vertically low or high band.)
 | ||||
|     /// @param firstY       y-coordinate of first pixel to write
 | ||||
|     ///                     (Parity determines horizontally low or high band.)
 | ||||
|     __device__ void init(const int imageSizeX, const int imageSizeY, | ||||
|                          const int firstX, const int firstY) { | ||||
|       if (firstX < imageSizeX) { | ||||
|         next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
|       } else { | ||||
|         clear(); | ||||
|       } | ||||
| public: | ||||
|   /// Initializes writer - sets output size and a position of first pixel.
 | ||||
|   /// @param output       output image
 | ||||
|   /// @param imageSizeX   width of the image
 | ||||
|   /// @param imageSizeY   height of the image
 | ||||
|   /// @param firstX       x-coordinate of first pixel to write
 | ||||
|   ///                     (Parity determines vertically low or high band.)
 | ||||
|   /// @param firstY       y-coordinate of first pixel to write
 | ||||
|   ///                     (Parity determines horizontally low or high band.)
 | ||||
|   __device__ void init(const int imageSizeX, const int imageSizeY, | ||||
|                        const int firstX, const int firstY) { | ||||
|     if (firstX < imageSizeX) { | ||||
|       next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); | ||||
|     } else { | ||||
|       clear(); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|     /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|     /// uninitialized stuff.
 | ||||
|     __device__ void clear() { | ||||
|       this->end = 0; | ||||
|       this->strideHighToLow = 0; | ||||
|       this->strideLowToHigh = 0; | ||||
|       this->next = 0; | ||||
|     } | ||||
|   /// Sets all fields to zeros, for compiler not to complain about
 | ||||
|   /// uninitialized stuff.
 | ||||
|   __device__ void clear() { | ||||
|     this->end = 0; | ||||
|     this->strideHighToLow = 0; | ||||
|     this->strideLowToHigh = 0; | ||||
|     this->next = 0; | ||||
|   } | ||||
| 
 | ||||
|     /// Writes another coefficient into the band which was specified using
 | ||||
|     /// init's firstX and firstY parameters and advances internal pointer.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in lowpass band.
 | ||||
|     /// @param output  output image
 | ||||
|     /// @param low     lowpass coefficient to save into the lowpass band
 | ||||
|     __device__ int writeLowInto(T * const output, const T & primary) { | ||||
|       return saveAndUpdate(output, primary, this->strideLowToHigh); | ||||
|     } | ||||
|   /// Writes another coefficient into the band which was specified using
 | ||||
|   /// init's firstX and firstY parameters and advances internal pointer.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in lowpass band.
 | ||||
|   /// @param output  output image
 | ||||
|   /// @param low     lowpass coefficient to save into the lowpass band
 | ||||
|   __device__ int writeLowInto(T *const output, const T &primary) { | ||||
|     return saveAndUpdate(output, primary, this->strideLowToHigh); | ||||
|   } | ||||
| 
 | ||||
|     /// Writes another coefficient from the other band and advances pointer.
 | ||||
|     /// Call this method first if position of first pixel passed to init
 | ||||
|     /// was in highpass band.
 | ||||
|     /// @param output  output image
 | ||||
|     /// @param high    highpass coefficient to save into the highpass band
 | ||||
|     __device__ int writeHighInto(T * const output, const T & other) { | ||||
|       return saveAndUpdate(output, other, this->strideHighToLow); | ||||
|     } | ||||
|   /// Writes another coefficient from the other band and advances pointer.
 | ||||
|   /// Call this method first if position of first pixel passed to init
 | ||||
|   /// was in highpass band.
 | ||||
|   /// @param output  output image
 | ||||
|   /// @param high    highpass coefficient to save into the highpass band
 | ||||
|   __device__ int writeHighInto(T *const output, const T &other) { | ||||
|     return saveAndUpdate(output, other, this->strideHighToLow); | ||||
|   } | ||||
| 
 | ||||
| 	//*******Add three functions to get private values*******
 | ||||
| 	__device__ int getnext(){ | ||||
| 		return next; | ||||
| 	} | ||||
|   //*******Add three functions to get private values*******
 | ||||
|   __device__ int getnext() { return next; } | ||||
| 
 | ||||
| 	__device__ int getend(){ | ||||
| 		return this->end; | ||||
| 	} | ||||
|   __device__ int getend() { return this->end; } | ||||
| 
 | ||||
| 	__device__ int getstrideHighToLow(){ | ||||
| 		return this->strideHighToLow; | ||||
| 	} | ||||
| 	 | ||||
| 	__device__ int getstrideLowToHigh(){ | ||||
| 		return this->strideLowToHigh; | ||||
| 	} | ||||
| 	 | ||||
| 	//*******Add three functions to get private values*******
 | ||||
|   }; | ||||
|   __device__ int getstrideHighToLow() { return this->strideHighToLow; } | ||||
| 
 | ||||
|   __device__ int getstrideLowToHigh() { return this->strideLowToHigh; } | ||||
| 
 | ||||
|   //*******Add three functions to get private values*******
 | ||||
| }; | ||||
| 
 | ||||
| } // namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// IO_H
 | ||||
| 
 | ||||
| #endif // IO_H
 | ||||
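Editor's note: for symmetry with the loader sketch earlier, here is a hedged sketch of the writer's interface declared in io.h above. It is illustrative only; writeColumnSketch, src, and numPairs are invented names, while init/writeLowInto/writeHighInto and the clear() fallback come from the declarations in this file.

// Illustrative sketch only -- mirrors the loader example and relies on the
// VerticalDWTBandWriter declared above. writeLowInto/writeHighInto return the
// index that was written, which this sketch ignores.
template <typename T>
__device__ void writeColumnSketch(T *output, int sizeX, int sizeY,
                                  int colX, const T *src, int numPairs) {
  dwt_cuda::VerticalDWTBandWriter<T, true> writer;  // CHECKED = true
  writer.init(sizeX, sizeY, colX, 0);  // falls back to clear() if colX >= sizeX
  for (int i = 0; i < numPairs; ++i) {
    writer.writeLowInto(output, src[2 * i]);       // lowpass coefficient
    writer.writeHighInto(output, src[2 * i + 1]);  // highpass coefficient
  }
}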
|  |  | |||
|  | @ -30,344 +30,309 @@ | |||
| /// POSSIBILITY OF SUCH DAMAGE.
 | ||||
| ///
 | ||||
| 
 | ||||
| 
 | ||||
| #ifndef TRANSFORM_BUFFER_H | ||||
| #define	TRANSFORM_BUFFER_H | ||||
| 
 | ||||
| #define TRANSFORM_BUFFER_H | ||||
| 
 | ||||
| namespace dwt_cuda { | ||||
| 
 | ||||
| /// Buffer (in shared memory of GPU) where block of input image is stored,
 | ||||
| /// but odd and even lines are separated. (Generates fewer bank conflicts when
 | ||||
| /// using the lifting scheme.) All operations expect SIZE_X threads.
 | ||||
| /// Also implements basic building blocks of the lifting scheme.
 | ||||
| /// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
 | ||||
| ///                     a number of threads participating on all operations.)
 | ||||
| ///                     Must be divisible by 4.
 | ||||
| /// @tparam SIZE_Y      height of buffer (total number of lines)
 | ||||
| /// @tparam BOUNDARY_X  number of extra pixels at the left and right side
 | ||||
| ///                     boundary is expected to be smaller than half SIZE_X
 | ||||
| ///                     Must be divisible by 2.
 | ||||
| template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X> | ||||
| class TransformBuffer { | ||||
| public: | ||||
|   enum { | ||||
|     /// difference between pointers to two vertical neighbors
 | ||||
|     VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2) | ||||
|   }; | ||||
| 
 | ||||
|   /// Buffer (in shared memory of GPU) where block of input image is stored,
 | ||||
|   /// but odd and even lines are separated. (Generates fewer bank conflicts when 
 | ||||
|   /// using the lifting scheme.) All operations expect SIZE_X threads.
 | ||||
|   /// Also implements basic building blocks of the lifting scheme.
 | ||||
|   /// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
 | ||||
|   ///                     a number of threads participating on all operations.)
 | ||||
|   ///                     Must be divisible by 4.
 | ||||
|   /// @tparam SIZE_Y      height of buffer (total number of lines)
 | ||||
|   /// @tparam BOUNDARY_X  number of extra pixels at the left and right side
 | ||||
|   ///                     boundary is expected to be smaller than half SIZE_X
 | ||||
|   ///                     Must be divisible by 2.
 | ||||
|   template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X> | ||||
|   class TransformBuffer { | ||||
|   public: | ||||
|     enum { | ||||
|       /// difference between pointers to two vertical neighbors
 | ||||
|       VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2) | ||||
|     }; | ||||
| private: | ||||
|   enum { | ||||
| /// number of shared memory banks - needed for correct padding
 | ||||
| #ifdef __CUDA_ARCH__ | ||||
|     SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16), | ||||
| #else | ||||
|     SHM_BANKS = 16, // for host code only - can be anything, won't be used
 | ||||
| #endif | ||||
| 
 | ||||
|   private: | ||||
|     enum { | ||||
|       /// number of shared memory banks - needed for correct padding
 | ||||
|       #ifdef __CUDA_ARCH__ | ||||
|       SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16), | ||||
|       #else | ||||
|       SHM_BANKS = 16,  // for host code only - can be anything, won't be used
 | ||||
|       #endif | ||||
|     /// size of one of two buffers (odd or even)
 | ||||
|     BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y, | ||||
| 
 | ||||
|       /// size of one of two buffers (odd or even)
 | ||||
|       BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y, | ||||
|     /// unused space between two buffers
 | ||||
|     PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS), | ||||
| 
 | ||||
|       /// unused space between two buffers
 | ||||
|       PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS), | ||||
|     /// offset of the odd columns buffer from the beginning of data buffer
 | ||||
|     ODD_OFFSET = BUFFER_SIZE + PADDING, | ||||
|   }; | ||||
| 
 | ||||
|       /// offset of the odd columns buffer from the beginning of data buffer
 | ||||
|       ODD_OFFSET = BUFFER_SIZE + PADDING, | ||||
|     }; | ||||
|   /// buffer for both even and odd columns
 | ||||
|   T data[2 * BUFFER_SIZE + PADDING]; | ||||
| 
 | ||||
|     /// buffer for both even and odd columns
 | ||||
|     T data[2 * BUFFER_SIZE + PADDING]; | ||||
|   /// Applies specified function to all central elements while also passing
 | ||||
|   /// previous and next elements as parameters.
 | ||||
|   /// @param count         count of central elements to apply function to
 | ||||
|   /// @param prevOffset    offset of first central element
 | ||||
|   /// @param midOffset     offset of first central element's predecessor
 | ||||
|   /// @param nextOffset    offset of first central element's successor
 | ||||
|   /// @param function      the function itself
 | ||||
|   template <typename FUNC> | ||||
|   __device__ void horizontalStep(const int count, const int prevOffset, | ||||
|                                  const int midOffset, const int nextOffset, | ||||
|                                  const FUNC &function) { | ||||
|     // number of unchecked iterations
 | ||||
|     const int STEPS = count / SIZE_X; | ||||
| 
 | ||||
|     // items remaining after last unchecked iteration
 | ||||
|     const int finalCount = count % SIZE_X; | ||||
| 
 | ||||
|     // offset of items processed in last (checked) iteration
 | ||||
|     const int finalOffset = count - finalCount; | ||||
| 
 | ||||
|     /// Applies specified function to all central elements while also passing
 | ||||
|     /// previous and next elements as parameters.
 | ||||
|     /// @param count         count of central elements to apply function to
 | ||||
|     /// @param prevOffset    offset of first central element
 | ||||
|     /// @param midOffset     offset of first central element's predecessor
 | ||||
|     /// @param nextOffset    offset of first central element's successor
 | ||||
|     /// @param function      the function itself
 | ||||
|     template <typename FUNC> | ||||
|     __device__ void horizontalStep(const int count, const int prevOffset,  | ||||
|                                    const int midOffset, const int nextOffset, | ||||
|                                    const FUNC & function) { | ||||
|       // number of unchecked iterations
 | ||||
|       const int STEPS = count / SIZE_X; | ||||
|        | ||||
|       // items remaining after last unchecked iteration
 | ||||
|       const int finalCount = count % SIZE_X;  | ||||
|        | ||||
|       // offset of items processed in last (checked) iteration
 | ||||
|       const int finalOffset = count - finalCount;   | ||||
|        | ||||
|       // all threads perform fixed number of iterations ...
 | ||||
|       for(int i = 0; i < STEPS; i++) { | ||||
|     // all threads perform fixed number of iterations ...
 | ||||
|     for (int i = 0; i < STEPS; i++) { | ||||
|       // for(int i = 0; i < 3; i++) {
 | ||||
|         const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; | ||||
|         const T next     = data[nextOffset + i * SIZE_X + threadIdx.x]; | ||||
|         T & center       = data[midOffset  + i * SIZE_X + threadIdx.x]; | ||||
|         // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
 | ||||
|         function(previous, center, next);// the real one
 | ||||
|       } | ||||
|        | ||||
|       // ... but not all threads participate on final iteration
 | ||||
|       if(threadIdx.x < finalCount) { | ||||
|         const T previous = data[prevOffset + finalOffset + threadIdx.x]; | ||||
|         const T next     = data[nextOffset + finalOffset + threadIdx.x]; | ||||
|         T & center = data[midOffset + finalOffset + threadIdx.x]; | ||||
|         // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
 | ||||
|         // kaixi
 | ||||
|         function(previous, center, next);//the real one
 | ||||
|       } | ||||
|       const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; | ||||
|       const T next = data[nextOffset + i * SIZE_X + threadIdx.x]; | ||||
|       T &center = data[midOffset + i * SIZE_X + threadIdx.x]; | ||||
|       // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
 | ||||
|       function(previous, center, next); // the real one
 | ||||
|     } | ||||
| 
 | ||||
|   public: | ||||
| 
 | ||||
|     __device__ void getPrintData() { | ||||
|       //
 | ||||
|       for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {      | ||||
|           printf(" index: %d  data: %f \n ", i ,data[i]);   | ||||
|       } | ||||
| 
 | ||||
|    } | ||||
| 
 | ||||
|      | ||||
|     /// Gets offset of the column with given index. Central columns have 
 | ||||
|     /// indices from 0 to NUM_LINES - 1, left boundary columns have negative 
 | ||||
|     /// indices and right boundary columns indices start with NUM_LINES.
 | ||||
|     /// @param columnIndex  index of column to get pointer to
 | ||||
|     /// @return  offset of the first item of column with specified index
 | ||||
|     __device__ int getColumnOffset(int columnIndex) { | ||||
|       columnIndex += BOUNDARY_X;             // skip boundary
 | ||||
|       return columnIndex / 2                 // select right column
 | ||||
|           + (columnIndex & 1) * ODD_OFFSET;  // select odd or even buffer
 | ||||
|     // ... but not all threads participate on final iteration
 | ||||
|     if (threadIdx.x < finalCount) { | ||||
|       const T previous = data[prevOffset + finalOffset + threadIdx.x]; | ||||
|       const T next = data[nextOffset + finalOffset + threadIdx.x]; | ||||
|       T &center = data[midOffset + finalOffset + threadIdx.x]; | ||||
|       // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
 | ||||
|       // kaixi
 | ||||
|       function(previous, center, next); // the real one
 | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|      | ||||
|     /// Provides access to data of the transform buffer.
 | ||||
|     /// @param index  index of the item to work with
 | ||||
|     /// @return reference to item at given index
 | ||||
|     __device__ T & operator[] (const int index) { | ||||
|       return data[index]; | ||||
| public: | ||||
|   __device__ void getPrintData() { | ||||
|     //
 | ||||
|     for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) { | ||||
|       printf(" index: %d  data: %f \n ", i, data[i]); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Gets offset of the column with given index. Central columns have
 | ||||
|   /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
 | ||||
|   /// indices and right boundary columns indices start with NUM_LINES.
 | ||||
|   /// @param columnIndex  index of column to get pointer to
 | ||||
|   /// @return  offset of the first item of column with specified index
 | ||||
|   __device__ int getColumnOffset(int columnIndex) { | ||||
|     columnIndex += BOUNDARY_X;               // skip boundary
 | ||||
|     return columnIndex / 2                   // select right column
 | ||||
|            + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
 | ||||
|   } | ||||
| 
 | ||||
|     /// Applies specified function to all horizontally even elements in 
 | ||||
|     /// specified lines. (Including even elements in boundaries except 
 | ||||
|     /// first even element in first left boundary.) SIZE_X threads participate 
 | ||||
|     /// and synchronization is needed before result can be used.
 | ||||
|     /// @param firstLine  index of first line
 | ||||
|     /// @param numLines   count of lines
 | ||||
|     /// @param func       function to be applied on all even elements
 | ||||
|     ///                   parameters: previous (odd) element, the even
 | ||||
|     ///                   element itself and finally next (odd) element
 | ||||
|     template <typename FUNC> | ||||
|     __device__ void forEachHorizontalEven(const int firstLine, | ||||
|                                           const int numLines, | ||||
|                                           const FUNC & func) { | ||||
|       // number of even elements to apply function to
 | ||||
|       const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|       // offset of first even element
 | ||||
|       const int centerOffset = firstLine * VERTICAL_STRIDE + 1; | ||||
|       // offset of odd predecessor of first even element
 | ||||
|       const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET; | ||||
|       // offset of odd successor of first even element
 | ||||
|       const int nextOffset = prevOffset + 1; | ||||
|   /// Provides access to data of the transform buffer.
 | ||||
|   /// @param index  index of the item to work with
 | ||||
|   /// @return reference to item at given index
 | ||||
|   __device__ T &operator[](const int index) { return data[index]; } | ||||
| 
 | ||||
|       // if(threadIdx.x == 0) {
 | ||||
|   /// Applies specified function to all horizontally even elements in
 | ||||
|   /// specified lines. (Including even elements in boundaries except
 | ||||
|   /// first even element in first left boundary.) SIZE_X threads participate
 | ||||
|   /// and synchronization is needed before result can be used.
 | ||||
|   /// @param firstLine  index of first line
 | ||||
|   /// @param numLines   count of lines
 | ||||
|   /// @param func       function to be applied on all even elements
 | ||||
|   ///                   parameters: previous (odd) element, the even
 | ||||
|   ///                   element itself and finally next (odd) element
 | ||||
|   template <typename FUNC> | ||||
|   __device__ void forEachHorizontalEven(const int firstLine, const int numLines, | ||||
|                                         const FUNC &func) { | ||||
|     // number of even elements to apply function to
 | ||||
|     const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|     // offset of first even element
 | ||||
|     const int centerOffset = firstLine * VERTICAL_STRIDE + 1; | ||||
|     // offset of odd predecessor of first even element
 | ||||
|     const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET; | ||||
|     // offset of odd successor of first even element
 | ||||
|     const int nextOffset = prevOffset + 1; | ||||
| 
 | ||||
|       //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|       // }
 | ||||
|     // if(threadIdx.x == 0) {
 | ||||
| 
 | ||||
|       // call generic horizontal step function
 | ||||
|       horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|     } | ||||
|     //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d
 | ||||
|     //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|     // }
 | ||||
| 
 | ||||
|     // call generic horizontal step function
 | ||||
|     horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|   } | ||||
| 
 | ||||
|     /// Applies given function to all horizontally odd elements in specified
 | ||||
|     /// lines. (Including odd elements in boundaries except last odd element
 | ||||
|     /// in last right boundary.) SIZE_X threads participate and synchronization
 | ||||
|     /// is needed before result can be used.
 | ||||
|     /// @param firstLine  index of first line
 | ||||
|     /// @param numLines   count of lines
 | ||||
|     /// @param func       function to be applied on all odd elements
 | ||||
|     ///                   parameters: previous (even) element, the odd
 | ||||
|     ///                   element itself and finally next (even) element
 | ||||
|     template <typename FUNC> | ||||
|     __device__ void forEachHorizontalOdd(const int firstLine, | ||||
|                                          const int numLines, | ||||
|                                          const FUNC & func) { | ||||
|       // number of odd elements to apply function to
 | ||||
|       const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|       // offset of even predecessor of first odd element
 | ||||
|       const int prevOffset = firstLine * VERTICAL_STRIDE; | ||||
|       // offset of first odd element
 | ||||
|       const int centerOffset = prevOffset + ODD_OFFSET; | ||||
|       // offset of even successor of first odd element
 | ||||
|       const int nextOffset = prevOffset + 1; | ||||
|   /// Applies given function to all horizontally odd elements in specified
 | ||||
|   /// lines. (Including odd elements in boundaries except last odd element
 | ||||
|   /// in last right boundary.) SIZE_X threads participate and synchronization
 | ||||
|   /// is needed before result can be used.
 | ||||
|   /// @param firstLine  index of first line
 | ||||
|   /// @param numLines   count of lines
 | ||||
|   /// @param func       function to be applied on all odd elements
 | ||||
|   ///                   parameters: previous (even) element, the odd
 | ||||
|   ///                   element itself and finally next (even) element
 | ||||
|   template <typename FUNC> | ||||
|   __device__ void forEachHorizontalOdd(const int firstLine, const int numLines, | ||||
|                                        const FUNC &func) { | ||||
|     // number of odd elements to apply function to
 | ||||
|     const int count = numLines * VERTICAL_STRIDE - 1; | ||||
|     // offset of even predecessor of first odd element
 | ||||
|     const int prevOffset = firstLine * VERTICAL_STRIDE; | ||||
|     // offset of first odd element
 | ||||
|     const int centerOffset = prevOffset + ODD_OFFSET; | ||||
|     // offset of even successor of first odd element
 | ||||
|     const int nextOffset = prevOffset + 1; | ||||
| 
 | ||||
|       //  if(threadIdx.x == 0) {
 | ||||
|       //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|       // }
 | ||||
|     //  if(threadIdx.x == 0) {
 | ||||
|     //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d
 | ||||
|     //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
 | ||||
|     // }
 | ||||
| 
 | ||||
|     // call generic horizontal step function
 | ||||
|     horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|   } | ||||
| 
 | ||||
|       // call generic horizontal step function
 | ||||
|       horizontalStep(count, prevOffset, centerOffset, nextOffset, func); | ||||
|     } | ||||
|      | ||||
|      | ||||
|     /// Applies specified function to all even elements (except element #0)
 | ||||
|     /// of given column. Each thread takes care of one column, so there's 
 | ||||
|     /// no need for synchronization.
 | ||||
|     /// @param columnOffset  offset of thread's column
 | ||||
|     /// @param f             function to be applied on all even elements
 | ||||
|     ///                      parameters: previous (odd) element, the even
 | ||||
|     ///                      element itself and finally next (odd) element
 | ||||
|     template <typename F> | ||||
|     __device__ void forEachVerticalEven(const int columnOffset, const F & f) { | ||||
|       if(SIZE_Y > 3) { // makes no sense otherwise
 | ||||
|         const int steps = SIZE_Y / 2 - 1; | ||||
|         for(int i = 0; i < steps; i++) { | ||||
|           const int row = 2 + i * 2; | ||||
|           const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; | ||||
|           const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; | ||||
|           f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next); | ||||
| 		   | ||||
| 		  //--------------- FOR TEST -----------------
 | ||||
| /*		__syncthreads();
 | ||||
| 		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
| 			diffOut[2500]++; | ||||
| 			diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
 | ||||
| 		}	 | ||||
| 		__syncthreads(); | ||||
| */		  //--------------- FOR TEST -----------------
 | ||||
| 		   | ||||
| 		   | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|      | ||||
|      | ||||
|     /// Applies specified function to all odd elements of given column.
 | ||||
|     /// Each thread takes care of one column, so there's no need for
 | ||||
|     /// synchronization.
 | ||||
|     /// @param columnOffset  offset of thread's column
 | ||||
|     /// @param f             function to be applied on all odd elements
 | ||||
|     ///                      parameters: previous (even) element, the odd
 | ||||
|     ///                      element itself and finally next (even) element
 | ||||
|     template <typename F> | ||||
|     __device__ void forEachVerticalOdd(const int columnOffset, const F & f) { | ||||
|       const int steps = (SIZE_Y - 1) / 2; | ||||
|       for(int i = 0; i < steps; i++) { | ||||
|         const int row = i * 2 + 1; | ||||
|   /// Applies specified function to all even elements (except element #0)
 | ||||
|   /// of given column. Each thread takes care of one column, so there's
 | ||||
|   /// no need for synchronization.
 | ||||
|   /// @param columnOffset  offset of thread's column
 | ||||
|   /// @param f             function to be applied on all even elements
 | ||||
|   ///                      parameters: previous (odd) element, the even
 | ||||
|   ///                      element itself and finally next (odd) element
 | ||||
|   template <typename F> | ||||
|   __device__ void forEachVerticalEven(const int columnOffset, const F &f) { | ||||
|     if (SIZE_Y > 3) { // makes no sense otherwise
 | ||||
|       const int steps = SIZE_Y / 2 - 1; | ||||
|       for (int i = 0; i < steps; i++) { | ||||
|         const int row = 2 + i * 2; | ||||
|         const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; | ||||
|         const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; | ||||
|         f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); | ||||
| 
 | ||||
| 		f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); | ||||
| 		 | ||||
| 		 | ||||
| 		  //--------------- FOR TEST -----------------
 | ||||
| /*		__syncthreads();
 | ||||
| 		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
| 			diffOut[2500]++; | ||||
| 			diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
 | ||||
| 		} | ||||
| 
 | ||||
| 		__syncthreads(); | ||||
| */		  //--------------- FOR TEST -----------------
 | ||||
|         //--------------- FOR TEST -----------------
 | ||||
|         /*		__syncthreads();
 | ||||
|                         if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
|                                 diffOut[2500]++; | ||||
|                                 diffOut[diffOut[2500]] = 2;//data[columnOffset +
 | ||||
|            row * VERTICAL_STRIDE]; | ||||
|                         } | ||||
|                         __syncthreads(); | ||||
|         */		  //--------------- FOR TEST -----------------
 | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   /// Applies specified function to all odd elements of given column.
 | ||||
|   /// Each thread takes care of one column, so there's no need for
 | ||||
|   /// synchronization.
 | ||||
|   /// @param columnOffset  offset of thread's column
 | ||||
|   /// @param f             function to be applied on all odd elements
 | ||||
|   ///                      parameters: previous (even) element, the odd
 | ||||
|   ///                      element itself and finally next (even) element
 | ||||
|   template <typename F> | ||||
|   __device__ void forEachVerticalOdd(const int columnOffset, const F &f) { | ||||
|     const int steps = (SIZE_Y - 1) / 2; | ||||
|     for (int i = 0; i < steps; i++) { | ||||
|       const int row = i * 2 + 1; | ||||
|       const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; | ||||
|       const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; | ||||
| 
 | ||||
|       f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); | ||||
| 
 | ||||
|     /// Scales elements at specified lines.
 | ||||
|     /// @param evenScale  scaling factor for horizontally even elements
 | ||||
|     /// @param oddScale   scaling factor for horizontally odd elements
 | ||||
|     /// @param numLines   number of lines, whose elements should be scaled
 | ||||
|     /// @param firstLine  index of first line to scale elements in
 | ||||
|     __device__ void scaleHorizontal(const T evenScale, const T oddScale, | ||||
|                                     const int firstLine, const int numLines) { | ||||
|       const int offset = firstLine * VERTICAL_STRIDE; | ||||
|       const int count = numLines * VERTICAL_STRIDE; | ||||
|       const int steps = count / SIZE_X; | ||||
|       const int finalCount = count % SIZE_X; | ||||
|       const int finalOffset = count - finalCount; | ||||
|       //--------------- FOR TEST -----------------
 | ||||
|       /*		__syncthreads();
 | ||||
|                       if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ | ||||
|                               diffOut[2500]++; | ||||
|                               diffOut[diffOut[2500]] = 1; //data[columnOffset +
 | ||||
|          row * VERTICAL_STRIDE]; | ||||
|                       } | ||||
| 
 | ||||
|       // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
 | ||||
|                       __syncthreads(); | ||||
|       */		  //--------------- FOR TEST -----------------
 | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|       // run iterations, where all threads participate
 | ||||
|       for(int i = 0; i < steps; i++) { | ||||
|         data[threadIdx.x + i * SIZE_X + offset] *= evenScale; | ||||
|         // if(threadIdx.x + i * SIZE_X + offset == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; | ||||
|       } | ||||
|   /// Scales elements at specified lines.
 | ||||
|   /// @param evenScale  scaling factor for horizontally even elements
 | ||||
|   /// @param oddScale   scaling factor for horizontally odd elements
 | ||||
|   /// @param numLines   number of lines, whose elements should be scaled
 | ||||
|   /// @param firstLine  index of first line to scale elements in
 | ||||
|   __device__ void scaleHorizontal(const T evenScale, const T oddScale, | ||||
|                                   const int firstLine, const int numLines) { | ||||
|     const int offset = firstLine * VERTICAL_STRIDE; | ||||
|     const int count = numLines * VERTICAL_STRIDE; | ||||
|     const int steps = count / SIZE_X; | ||||
|     const int finalCount = count % SIZE_X; | ||||
|     const int finalOffset = count - finalCount; | ||||
| 
 | ||||
|       // some threads also finish remaining unscaled items
 | ||||
|       if(threadIdx.x < finalCount) { | ||||
|         data[threadIdx.x + finalOffset + offset] *= evenScale; | ||||
|         // if(threadIdx.x + finalOffset + offset == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
 | ||||
|         //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|         // }
 | ||||
|         data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; | ||||
|       } | ||||
|     // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d,
 | ||||
|     // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
 | ||||
|     // finalCount, finalOffset);
 | ||||
| 
 | ||||
|     // run iterations, where all threads participate
 | ||||
|     for (int i = 0; i < steps; i++) { | ||||
|       data[threadIdx.x + i * SIZE_X + offset] *= evenScale; | ||||
|       // if(threadIdx.x + i * SIZE_X + offset == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; | ||||
|     } | ||||
| 
 | ||||
|     // some threads also finish remaining unscaled items
 | ||||
|     if (threadIdx.x < finalCount) { | ||||
|       data[threadIdx.x + finalOffset + offset] *= evenScale; | ||||
|       // if(threadIdx.x + finalOffset + offset == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
 | ||||
|       //   printf("threadidx 531: %d \n", threadIdx.x);
 | ||||
|       // }
 | ||||
|       data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|     /// Scales elements in specified column.
 | ||||
|     /// @param evenScale     scaling factor for vertically even elements
 | ||||
|     /// @param oddScale      scaling factor for vertically odd elements
 | ||||
|     /// @param columnOffset  offset of the column to work with
 | ||||
|     /// @param numLines      number of lines, whose elements should be scaled
 | ||||
|     /// @param firstLine     index of first line to scale elements in
 | ||||
|     __device__ void scaleVertical(const T evenScale, const T oddScale, | ||||
|                                   const int columnOffset, const int numLines, | ||||
|                                   const int firstLine) { | ||||
|       for(int i = firstLine; i < (numLines + firstLine); i++) { | ||||
|         if(i & 1) { | ||||
|           data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; | ||||
|         } else { | ||||
|           data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; | ||||
|         } | ||||
|   /// Scales elements in specified column.
 | ||||
|   /// @param evenScale     scaling factor for vertically even elements
 | ||||
|   /// @param oddScale      scaling factor for vertically odd elements
 | ||||
|   /// @param columnOffset  offset of the column to work with
 | ||||
|   /// @param numLines      number of lines, whose elements should be scaled
 | ||||
|   /// @param firstLine     index of first line to scale elements in
 | ||||
|   __device__ void scaleVertical(const T evenScale, const T oddScale, | ||||
|                                 const int columnOffset, const int numLines, | ||||
|                                 const int firstLine) { | ||||
|     for (int i = firstLine; i < (numLines + firstLine); i++) { | ||||
|       if (i & 1) { | ||||
|         data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; | ||||
|       } else { | ||||
|         data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   //****************For Test(Feb23), test inter parameters*************
 | ||||
|   __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; } | ||||
|   __device__ int getSHM_BANKS() { return SHM_BANKS; } | ||||
|   __device__ int getBuffersize() { return BUFFER_SIZE; } | ||||
|   __device__ int getPADDING() { return PADDING; } | ||||
|   __device__ int getODD_OFFSET() { return ODD_OFFSET; } | ||||
| 
 | ||||
| 	//****************For Test(Feb23), test inter parameters*************
 | ||||
| 	__device__ int getVERTICAL_STRIDE(){ | ||||
| 		return VERTICAL_STRIDE; | ||||
| 	} | ||||
| 	__device__ int getSHM_BANKS(){ | ||||
| 		return SHM_BANKS; | ||||
| 	} | ||||
| 	__device__ int  getBuffersize(){		 | ||||
| 		return BUFFER_SIZE; | ||||
| 	} | ||||
| 	__device__ int getPADDING(){ | ||||
| 		return PADDING; | ||||
| 	} | ||||
| 	__device__ int getODD_OFFSET(){ | ||||
| 		return ODD_OFFSET; | ||||
| 	} | ||||
| 
 | ||||
| 
 | ||||
|     //****************For Test(Feb23), test inter parameters*************
 | ||||
| 	 | ||||
| 	 | ||||
|   };  // end of class TransformBuffer
 | ||||
|   //****************For Test(Feb23), test inter parameters*************
 | ||||
| 
 | ||||
| }; // end of class TransformBuffer
 | ||||
| 
 | ||||
| } // namespace dwt_cuda
 | ||||
| 
 | ||||
| 
 | ||||
| #endif	// TRANSFORM_BUFFER_H
 | ||||
| 
 | ||||
| #endif // TRANSFORM_BUFFER_H
 | ||||
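Editor's note: to make the lifting building blocks of TransformBuffer concrete, here is a minimal sketch of one vertical CDF 5/3 lifting pass. It is an assumption for illustration only: the template parameters (32, 8, 2), the functor names, and the use of the classic 5/3 predict/update factors (-0.5 and +0.25) are not taken from this commit; the real lifting steps live in fdwt53.cu and rdwt53.cu.

// Sketch only -- assumes the dwt_cuda::TransformBuffer declared above.
namespace sketch {

// "Predict" step of the 5/3 lifting scheme: odd rows become highpass values.
struct Predict53 {
  __device__ void operator()(const float prev, float &center,
                             const float next) const {
    center -= 0.5f * (prev + next);
  }
};

// "Update" step: even rows become lowpass values.
struct Update53 {
  __device__ void operator()(const float prev, float &center,
                             const float next) const {
    center += 0.25f * (prev + next);
  }
};

// One vertical lifting pass over the calling thread's column. The template
// arguments (SIZE_X = 32, SIZE_Y = 8, BOUNDARY_X = 2) are placeholders.
__device__ void verticalLiftSketch(
    dwt_cuda::TransformBuffer<float, 32, 8, 2> &buf) {
  const int col = buf.getColumnOffset(threadIdx.x);  // this thread's column
  buf.forEachVerticalOdd(col, Predict53());          // predict highpass rows
  buf.forEachVerticalEven(col, Update53());          // update lowpass rows
}

} // namespace sketch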
|  |  | |||
|  | @ -5,4 +5,3 @@ | |||
| ./dwt2d 4.bmp  -d 4x4 -r -5 -l 3 | ||||
| # ./dwt2d 4.bmp  -d 4x4 -r -9 -l 3 | ||||
| # ./dwt2d 8.bmp  -d 8x8 -f -9 -l 3 | ||||
| 
 | ||||
|  |  | |||
|  | @ -7,12 +7,3 @@ | |||
| /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o | ||||
| /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o | ||||
| g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,38 +1,36 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(int n, float a, float *x, float *y) | ||||
| { | ||||
|   int i = blockIdx.x*blockDim.x + threadIdx.x; | ||||
|   if (i < n) y[i] = a*x[i] + y[i]; | ||||
| __global__ void saxpy(int n, float a, float *x, float *y) { | ||||
|   int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|   if (i < n) | ||||
|     y[i] = a * x[i] + y[i]; | ||||
| } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|   // saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
 | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -1,38 +1,35 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(void) | ||||
| { | ||||
|   int i = blockIdx.x*blockDim.x + threadIdx.x; | ||||
| __global__ void saxpy(void) { | ||||
|   int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|   printf("block_id:%d thread_id:%d \n", i) | ||||
| } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|     saxpy<<<(1,1)>>>; | ||||
|   saxpy<<<(1, 1)>>>; | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -1,37 +1,32 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(int N) | ||||
| { | ||||
| printf("hello!: %d\n", N); | ||||
| } | ||||
| __global__ void saxpy(int N) { printf("hello!: %d\n", N); } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|     saxpy<<<(1,1)>>>(N); | ||||
|   saxpy<<<(1, 1)>>>(N); | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -1,37 +1,32 @@ | |||
| #include <stdio.h> | ||||
| 
 | ||||
| __global__ | ||||
| void saxpy(void) | ||||
| { | ||||
| printf("hello!\n"); | ||||
| } | ||||
| __global__ void saxpy(void) { printf("hello!\n"); } | ||||
| 
 | ||||
| int main(void) | ||||
| { | ||||
|   int N = 1<<20; | ||||
| int main(void) { | ||||
|   int N = 1 << 20; | ||||
|   float *x, *y, *d_x, *d_y; | ||||
|   x = (float*)malloc(N*sizeof(float)); | ||||
|   y = (float*)malloc(N*sizeof(float)); | ||||
|   x = (float *)malloc(N * sizeof(float)); | ||||
|   y = (float *)malloc(N * sizeof(float)); | ||||
| 
 | ||||
|   cudaMalloc(&d_x, N*sizeof(float));  | ||||
|   cudaMalloc(&d_y, N*sizeof(float)); | ||||
|   cudaMalloc(&d_x, N * sizeof(float)); | ||||
|   cudaMalloc(&d_y, N * sizeof(float)); | ||||
| 
 | ||||
|   for (int i = 0; i < N; i++) { | ||||
|     x[i] = 1.0f; | ||||
|     y[i] = 2.0f; | ||||
|   } | ||||
| 
 | ||||
|   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
|   cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); | ||||
| 
 | ||||
|   // Perform SAXPY on 1M elements
 | ||||
|     saxpy<<<(1,1)>>>; | ||||
|   saxpy<<<(1, 1)>>>; | ||||
| 
 | ||||
|   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); | ||||
|   cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); | ||||
| 
 | ||||
|   float maxError = 0.0f; | ||||
|   for (int i = 0; i < N; i++) | ||||
|     maxError = max(maxError, abs(y[i]-4.0f)); | ||||
|     maxError = max(maxError, abs(y[i] - 4.0f)); | ||||
|   printf("Max error: %f\n", maxError); | ||||
| 
 | ||||
|   cudaFree(d_x); | ||||
|  |  | |||
|  | @ -43,7 +43,7 @@ cudaError_t cudaMallocHost(void **devPtr, size_t size) { | |||
|   *devPtr = malloc(size); | ||||
|   if (*devPtr == NULL) | ||||
|     return cudaErrorMemoryAllocation; | ||||
|  return cudaSuccess; | ||||
|   return cudaSuccess; | ||||
| } | ||||
| cudaError_t cudaMemset(void *devPtr, int value, size_t count) { | ||||
|   memset(devPtr, value, count); | ||||
|  |  | |||
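Editor's note: a short host-side sketch of how the shims above behave on the CPU backend. The buffer size and values are arbitrary, the function name check_shims is invented, and the CUDA/CuPBoP runtime declarations (cudaError_t, cudaSuccess, the shims themselves) are assumed to be in scope via the project's runtime header.

// Sketch only: exercises the cudaMallocHost/cudaMemset shims shown above.
// On this CPU backend cudaMallocHost is a plain malloc (see above), so free()
// is used for cleanup; a real CUDA runtime would require cudaFreeHost instead.
#include <stdio.h>
#include <stdlib.h>

int check_shims(void) {
  float *buf = NULL;
  if (cudaMallocHost((void **)&buf, 256 * sizeof(float)) != cudaSuccess) {
    fprintf(stderr, "cudaMallocHost failed\n");
    return 1;
  }
  cudaMemset(buf, 0, 256 * sizeof(float));  // plain memset on the host buffer
  printf("buf[0] = %f\n", buf[0]);          // prints 0.000000
  free(buf);
  return 0;
}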