Merge branch 'cupbop:master' into master

2022-05-24 21:16:37 -04:00 · 2022-05-24 21:16:37 -04:00 · 21f298524e
parent 01fcd6e0cc 197abc867d
commit 21f298524e
27 changed files with 1246 additions and 1384 deletions
--- a/README.md
+++ b/README.md
@ -27,10 +27,10 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS
   export CuPBoP_PATH=`pwd`
   export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH
   ```
 If you are using boson, you can pre-installed llvm 10.0.0 
- LLVM_PATH=/opt/llvm-10.0.0
+   If you are using boson, you can use the pre-installed LLVM 10.0.0\
- export PATH=$LLVM_PATH/bin:$PATH
+   `LLVM_PATH=/opt/llvm-10.0.0`\
   `export PATH=$LLVM_PATH/bin:$PATH`
 2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file
--- a/compilation/KernelTranslation/lib/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
@ -272,7 +272,6 @@ void AddContextSaveRestore(llvm::Instruction *instruction,
  std::vector<Instruction *> uses;
  Function *f2 = instruction->getParent()->getParent();
  for (Instruction::use_iterator ui = instruction->use_begin(),
                                 ue = instruction->use_end();
       ui != ue; ++ui) {
--- a/compilation/KernelTranslation/lib/memory_hierarchy.cpp
+++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp
@ -89,11 +89,12 @@ void mem_share2global(llvm::Module *M) {
          } else if (element_type->isStructTy()) {
            auto undef = llvm::UndefValue::get(element_type);
            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
-                *M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef,
+                *M, element_type, false, llvm::GlobalValue::ExternalLinkage,
-                new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
+                undef, new_name, NULL,
-                false);
+                llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
            global_memory->setDSOLocal(true);
-            Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName()));
+            Comdat *comdat =
                M->getOrInsertComdat(StringRef(share_memory->getName()));
            comdat->setSelectionKind(Comdat::SelectionKind::Any);
            global_memory->setComdat(comdat);
            global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
@ -103,7 +104,6 @@ void mem_share2global(llvm::Module *M) {
                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                              global_memory));
          } else {
            assert(0 && "The required Share Memory Type is not supported\n");
          }
--- a/examples/dwt2d/common.h
+++ b/examples/dwt2d/common.h
@ -45,18 +45,20 @@
 // divide and round up macro
 #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
-#  define cudaCheckError( msg ) {                                            \
+#define cudaCheckError(msg)                                                    \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (cudaSuccess != err) {                                                  \
-        fprintf(stderr, "%s: %i: %s: %s.\n",                                 \
+      fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg,            \
-                __FILE__, __LINE__, msg, cudaGetErrorString( err) );         \
+              cudaGetErrorString(err));                                        \
      exit(-1);                                                                \
-    } }
+    }                                                                          \
  }
-#  define cudaCheckAsyncError( msg ) {                                       \
+#define cudaCheckAsyncError(msg)                                               \
  {                                                                            \
    cudaThreadSynchronize();                                                   \
    cudaCheckError(msg);                                                       \
  }
 #endif
--- a/examples/dwt2d/components.h
+++ b/examples/dwt2d/components.h
@ -29,7 +29,8 @@
 /* Separate components of source 8bit RGB image */
 template <typename T>
-void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height);
+void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
                     int height);
 /* Copy 8bit source image data into a color component of type T */
 template <typename T>
--- a/examples/dwt2d/dwt.h
+++ b/examples/dwt2d/dwt.h
@ -28,13 +28,14 @@
 #define _DWT_H
 template <typename T>
-int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
+int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
                int stages, bool forward);
 template <typename T>
-int writeNStage2DDWT(T *component_cuda, int width, int height, 
+int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
                     int stages, const char * filename, const char * suffix);
 template<typename T>
 int writeLinear(T *component_cuda, int width, int height, 
                     const char *filename, const char *suffix);
 template <typename T>
 int writeLinear(T *component_cuda, int width, int height, const char *filename,
                const char *suffix);
 #endif
--- a/examples/dwt2d/dwt_cuda/common.h
+++ b/examples/dwt2d/dwt_cuda/common.h
@ -29,29 +29,22 @@
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef DWT_COMMON_H
 #define DWT_COMMON_H
 #include <cstdio>
 #include <algorithm>
 #include <cstdio>
 #include <vector>
 // compile time minimum macro
 #define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
 // performance testing macros
 #if defined(GPU_DWT_TESTING)
 #define PERF_BEGIN                                                             \
  {                                                                            \
    dwt_cuda::CudaDWTTester PERF_TESTER;                                       \
-    for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \
+    for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) {             \
    { \
      PERF_TESTER.beginTestIteration();
 #define PERF_END(PERF_NAME, PERF_W, PERF_H)                                    \
@ -64,25 +57,20 @@
 #define PERF_END(PERF_NAME, PERF_W, PERF_H)
 #endif // GPU_DWT_TESTING
 namespace dwt_cuda {
 /// Divide and round up.
 template <typename T>
 __device__ __host__ inline T divRndUp(const T &n, const T &d) {
  return (n / d) + ((n % d) ? 1 : 0);
 }
 // 9/7 forward DWT lifting schema coefficients
 const float f97Predict1 = -1.586134342;  ///< forward 9/7 predict 1
 const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
 const float f97Predict2 = 0.8829110762;  ///< forward 9/7 predict 2
 const float f97Update2 = 0.4435068522;   ///< forward 9/7 update 2
 // 9/7 reverse DWT lifting schema coefficients
 const float r97update2 = -f97Update2;   ///< undo 9/7 update 2
 const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
@ -93,7 +81,6 @@ namespace dwt_cuda {
 const float scale97Mul = 1.23017410491400f;
 const float scale97Div = 1.0 / scale97Mul;
 // 5/3 forward DWT lifting schema coefficients
 const float forward53Predict = -0.5f; /// forward 5/3 predict
 const float forward53Update = 0.25f;  /// forward 5/3 update
@ -102,8 +89,6 @@ namespace dwt_cuda {
 const float reverse53Update = -forward53Update;   /// undo 5/3 update
 const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
 /// Functor which adds scaled sum of neighbors to given central pixel.
 struct AddScaledSum {
  const float scale; // scale of neighbors
@ -112,7 +97,8 @@ namespace dwt_cuda {
    // if(threadIdx.x == 0) {
-      //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
+    //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
    //   scale * (p + n) );
    // }
@ -120,8 +106,6 @@ namespace dwt_cuda {
  }
 };
 /// Returns index ranging from 0 to num threads, such that first half
 /// of threads get even indices and others get odd indices. Each thread
 /// gets different index.
@ -129,13 +113,10 @@ namespace dwt_cuda {
 ///                              parityIdx:   0  2  4  6  1  3  5  7
 /// @tparam THREADS  total count of participating threads
 /// @return parity-separated index of thread
-  template <int THREADS>
+template <int THREADS> __device__ inline int parityIdx() {
  __device__ inline int parityIdx() {
  return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
 }
 /// size of shared memory
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
 const int SHM_SIZE = 48 * 1024;
@ -143,8 +124,6 @@ namespace dwt_cuda {
 const int SHM_SIZE = 16 * 1024;
 #endif
 /// Performance and return code tester.
 class CudaDWTTester {
 private:
@ -185,9 +164,7 @@ namespace dwt_cuda {
  CudaDWTTester() : disabled(testRunning) {}
  /// Gets preferred number of iterations
-    int getNumIterations() {
+  int getNumIterations() { return disabled ? 1 : 31; }
      return disabled ? 1 : 31;
    }
  /// Starts one test iteration.
  void beginTestIteration() {
@ -225,25 +202,24 @@ namespace dwt_cuda {
      for (int i = times.size(); i--;) {
        sum += times[i];
      }
-        const double median = (times[times.size() / 2]
+      const double median =
-                             + times[(times.size() - 1) / 2]) * 0.5f;
+          (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
      printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  "
-               "(%d x %d)\n", name, (sum / times.size()), median, 
+             "(%d x %d)\n",
-               times[times.size() - 1], sizeX, sizeY);
+             name, (sum / times.size()), median, times[times.size() - 1], sizeX,
             sizeY);
    }
  }
 };
 /// Simple cudaMemcpy wrapped in performance tester.
 /// @param dest  destination buffer
 /// @param src   source buffer
 /// @param sx    width of copied image
 /// @param sy    height of copied image
 template <typename T>
-  inline void memCopy(T * const dest, const T * const src,
+inline void memCopy(T *const dest, const T *const src, const size_t sx,
-                      const size_t sx, const size_t sy) {
+                    const size_t sy) {
  cudaError_t status;
  PERF_BEGIN
  status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice);
@ -251,11 +227,6 @@ namespace dwt_cuda {
  CudaDWTTester::check(status, "memcpy device > device");
 }
 } // end of namespace dwt_cuda
 #endif // DWT_COMMON_H
--- a/examples/dwt2d/dwt_cuda/dwt.h
+++ b/examples/dwt2d/dwt_cuda/dwt.h
@ -58,10 +58,8 @@
 #ifndef DWT_CUDA_H
 #define DWT_CUDA_H
 namespace dwt_cuda {
 /// Forward 5/3 2D DWT. See common rules (above) for more details.
 /// @param in      Expected to be normalized into range [-128, 127].
 ///                Will not be preserved (will be overwritten).
@ -71,7 +69,6 @@ namespace dwt_cuda {
 /// @param levels  number of recursive DWT levels
 void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
 /// Reverse 5/3 2D DWT. See common rules (above) for more details.
 /// @param in      Input DWT coefficients. Format described in common rules.
 ///                Will not be preserved (will be overwritten).
@ -82,7 +79,6 @@ namespace dwt_cuda {
 /// @param levels  number of recursive DWT levels
 void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
 /// Forward 9/7 2D DWT. See common rules (above) for more details.
 /// @param in      Input DWT coefficients. Should be normalized (in range
 ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
@ -92,7 +88,6 @@ namespace dwt_cuda {
 /// @param levels  number of recursive DWT levels
 void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
 /// Reverse 9/7 2D DWT. See common rules (above) for more details.
 /// @param in      Input DWT coefficients. Format described in common rules.
 ///                Will not be preserved (will be overwritten).
@ -103,10 +98,6 @@ namespace dwt_cuda {
 /// @param levels  number of recursive DWT levels
 void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
 } // namespace dwt_cuda
 #endif // DWT_CUDA_H
--- a/examples/dwt2d/dwt_cuda/io.h
+++ b/examples/dwt2d/dwt_cuda/io.h
@ -30,16 +30,13 @@
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef IO_H
 #define IO_H
 #include "common.h"
 namespace dwt_cuda {
 /// Base for all IO classes - manages mirroring.
 class DWTIO {
 protected:
@ -80,13 +77,11 @@ namespace dwt_cuda {
  }
 };
 /// Base class for pixel loader and writer - manages computing start index,
 /// stride and end of image for loading column of pixels.
 /// @tparam T        type of image pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-  template <typename T, bool CHECKED>
+template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
  class VerticalDWTPixelIO : protected DWTIO {
 protected:
  int end;    ///< index of bottom neighbor of last pixel of column
  int stride; ///< increment of pointer to get to next pixel
@ -97,8 +92,8 @@ namespace dwt_cuda {
  /// @param firstX  x-coordinate of first pixel to use
  /// @param firstY  y-coordinate of first pixel to use
  /// @return index of pixel at position [x, y] in the image
-    __device__ int initialize(const int sizeX, const int sizeY,
+  __device__ int initialize(const int sizeX, const int sizeY, int firstX,
-                              int firstX, int firstY) {
+                            int firstY) {
    // initialize all pointers and stride
    end = CHECKED ? (sizeY * sizeX + firstX) : 0;
    stride = sizeX;
@ -106,8 +101,6 @@ namespace dwt_cuda {
  }
 };
 /// Writes reverse transformed pixels directly into output image.
 /// @tparam T        type of output pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
@ -122,8 +115,8 @@ namespace dwt_cuda {
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to write into
  /// @param firstY  y-coordinate of first pixel to write into
-    __device__ void init(const int sizeX, const int sizeY, 
+  __device__ void init(const int sizeX, const int sizeY, int firstX,
-                         int firstX, int firstY) {
+                       int firstY) {
    if (firstX < sizeX) {
      next = this->initialize(sizeX, sizeY, firstX, firstY);
    } else {
@ -145,43 +138,28 @@ namespace dwt_cuda {
  }
 };
 /// Loads pixels from input image.
 /// @tparam T        type of image input pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
-  class VerticalDWTPixelLoader
+class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
          : protected VerticalDWTPixelIO<const T, CHECKED> {
 private:
  int last; ///< index of last loaded pixel
 public:
  //******************* FOR TEST **********************
-  __device__ int getlast(){
+  __device__ int getlast() { return last; }
-		return last;
+  __device__ int getend() { return this->end; }
-	}
+  __device__ int getstride() { return this->stride; }
-  __device__ int getend(){
+  __device__ void setend(int a) { this->end = a; }
 		return this->end;
 	}
  __device__ int getstride(){
 		return this->stride;
 	}
  __device__ void setend(int a){
      this->end=a;
 	}
  //******************* FOR TEST **********************
  /// Initializes loader - sets input size and a position of first pixel.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to load
  /// @param firstY  y-coordinate of first pixel to load
-    __device__ void init(const int sizeX, const int sizeY,
+  __device__ void init(const int sizeX, const int sizeY, int firstX,
-                         int firstX, int firstY) {
+                       int firstY) {
    // correctly mirror x coordinate
    this->mirror(firstX, sizeX);
@ -208,7 +186,8 @@ namespace dwt_cuda {
      this->stride = -this->stride; // reverse loader's direction
    }
    // avoid reading from negative indices if loader is checked
-      // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
+    // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
    // checked variant later
    if (last < 0) {
      return 0;
    }
@ -220,14 +199,11 @@ namespace dwt_cuda {
  }
 };
 /// Base for band write and loader. Manages computing strides and pointers
 /// to first and last pixels in a linearly-stored-bands correct way.
 /// @tparam T        type of band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-  template <typename T, bool CHECKED>
+template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
  class VerticalDWTBandIO : protected DWTIO {
 protected:
  /// index of bottom neighbor of last pixel of loaded column
  int end;
@ -279,12 +255,10 @@ namespace dwt_cuda {
      end = 0;
    }
    //***********for test**************
    //	end = CHECKED;
    //***********for test**************
    // finally, return index of the first item
    return columnOffset                      // right column
           + (firstY / 2) * verticalStride   // right row
@ -292,9 +266,6 @@ namespace dwt_cuda {
  }
 };
 /// Directly loads coefficients from four consecutively stored transformed
 /// bands.
 /// @tparam T        type of input band coefficients
@ -324,11 +295,12 @@ namespace dwt_cuda {
      return 0;
    }
    // avoid reading from negative indices if loader is checked
-      // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
+    // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
    // checked variant later
    return input[last];
  }
  public:
 public:
  /// Initializes loader - sets input size and a position of first pixel.
  /// @param imageSizeX   width of the image
  /// @param imageSizeY   height of the image
@ -336,8 +308,8 @@ namespace dwt_cuda {
  ///                     (Parity determines vertically low or high band.)
  /// @param firstY       y-coordinate of first pixel to load
  ///                     (Parity determines horizontally low or high band.)
-    __device__ void init(const int imageSizeX, const int imageSizeY,
+  __device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
-                         int firstX, const int firstY) {
+                       const int firstY) {
    this->mirror(firstX, imageSizeX);
    last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
@ -371,12 +343,8 @@ namespace dwt_cuda {
  __device__ T loadHighFrom(const T *const input) {
    return updateAndLoad(input, this->strideLowToHigh);
  }
 };
 /// Directly saves coefficients into four transformed bands.
 /// @tparam T        type of output band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
@ -392,7 +360,8 @@ namespace dwt_cuda {
  /// @param stride  increment of the pointer to get to next output index
  __device__ int saveAndUpdate(T *const output, const T &item,
                               const int &stride) {
-//	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){		//test, Mar 20					  
+    //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
    ////test, Mar 20
    if ((!CHECKED) || (next != this->end)) {
      // if(next == 4) {
      //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
@ -407,8 +376,8 @@ namespace dwt_cuda {
    // }
    return next;
  }
  public:
 public:
  /// Initializes writer - sets output size and a position of first pixel.
  /// @param output       output image
  /// @param imageSizeX   width of the image
@ -455,29 +424,17 @@ namespace dwt_cuda {
  }
  //*******Add four functions to get private values*******
-	__device__ int getnext(){
+  __device__ int getnext() { return next; }
 		return next;
 	}
-	__device__ int getend(){
+  __device__ int getend() { return this->end; }
 		return this->end;
 	}
-	__device__ int getstrideHighToLow(){
+  __device__ int getstrideHighToLow() { return this->strideHighToLow; }
 		return this->strideHighToLow;
 	}
-	__device__ int getstrideLowToHigh(){
+  __device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
 		return this->strideLowToHigh;
 	}
  //*******Add four functions to get private values*******
 };
 } // namespace dwt_cuda
 #endif // IO_H
--- a/examples/dwt2d/dwt_cuda/transform_buffer.h
+++ b/examples/dwt2d/dwt_cuda/transform_buffer.h
@ -30,14 +30,11 @@
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef TRANSFORM_BUFFER_H
 #define TRANSFORM_BUFFER_H
 namespace dwt_cuda {
 /// Buffer (in shared memory of GPU) where block of input image is stored,
 /// but odd and even lines are separated. (Generates less bank conflicts when
 /// using lifting schema.) All operations expect SIZE_X threads.
@ -79,8 +76,6 @@ namespace dwt_cuda {
  /// buffer for both even and odd columns
  T data[2 * BUFFER_SIZE + PADDING];
  /// Applies specified function to all central elements while also passing
  /// previous and next elements as parameters.
  /// @param count         count of central elements to apply function to
@ -123,16 +118,13 @@ namespace dwt_cuda {
  }
 public:
  __device__ void getPrintData() {
    //
    for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
      printf(" index: %d  data: %f \n ", i, data[i]);
    }
  }
  /// Gets offset of the column with given index. Central columns have
  /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
  /// indices and right boundary columns indices start with NUM_LINES.
@ -144,14 +136,10 @@ namespace dwt_cuda {
           + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
  }
  /// Provides access to data of the transform buffer.
  /// @param index  index of the item to work with
  /// @return reference to item at given index
-    __device__ T & operator[] (const int index) {
+  __device__ T &operator[](const int index) { return data[index]; }
      return data[index];
    }
  /// Applies specified function to all horizontally even elements in
  /// specified lines. (Including even elements in boundaries except
@ -163,8 +151,7 @@ namespace dwt_cuda {
  ///                   parameters: previous (odd) element, the even
  ///                   element itself and finally next (odd) element
  template <typename FUNC>
-    __device__ void forEachHorizontalEven(const int firstLine,
+  __device__ void forEachHorizontalEven(const int firstLine, const int numLines,
                                          const int numLines,
                                        const FUNC &func) {
    // number of even elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
@ -177,14 +164,14 @@ namespace dwt_cuda {
    // if(threadIdx.x == 0) {
-      //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
+    //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d
    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
    // }
    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }
  /// Applies given function to all horizontally odd elements in specified
  /// lines. (Including odd elements in boundaries except last odd element
  /// in last right boundary.) SIZE_X threads participate and synchronization
@ -195,8 +182,7 @@ namespace dwt_cuda {
  ///                   parameters: previous (even) element, the odd
  ///                   element itself and finally next (even) element
  template <typename FUNC>
-    __device__ void forEachHorizontalOdd(const int firstLine,
+  __device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
                                         const int numLines,
                                       const FUNC &func) {
    // number of odd elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
@ -208,15 +194,14 @@ namespace dwt_cuda {
    const int nextOffset = prevOffset + 1;
    //  if(threadIdx.x == 0) {
-      //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
+    //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d
    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
    // }
    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }
  /// Applies specified function to all even elements (except element #0)
  /// of given column. Each thread takes care of one column, so there's
  /// no need for synchronization.
@ -238,17 +223,15 @@ namespace dwt_cuda {
        /*		__syncthreads();
                        if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
                                diffOut[2500]++;
-			diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
+                                diffOut[diffOut[2500]] = 2;//data[columnOffset +
           row * VERTICAL_STRIDE];
                        }
                        __syncthreads();
        */		  //--------------- FOR TEST -----------------
      }
    }
  }
  /// Applies specified function to all odd elements of given column.
  /// Each thread takes care of one column, so there's no need for
  /// synchronization.
@ -266,12 +249,12 @@ namespace dwt_cuda {
      f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
      //--------------- FOR TEST -----------------
      /*		__syncthreads();
                      if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
                              diffOut[2500]++;
-			diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
+                              diffOut[diffOut[2500]] = 1; //data[columnOffset +
         row * VERTICAL_STRIDE];
                      }
                      __syncthreads();
@ -279,8 +262,6 @@ namespace dwt_cuda {
    }
  }
  /// Scales elements at specified lines.
  /// @param evenScale  scaling factor for horizontally even elements
  /// @param oddScale   scaling factor for horizontally odd elements
@ -294,7 +275,9 @@ namespace dwt_cuda {
    const int finalCount = count % SIZE_X;
    const int finalOffset = count - finalCount;
-      // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
+    // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d,
    // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
    // finalCount, finalOffset);
    // run iterations, where all threads participate
    for (int i = 0; i < steps; i++) {
@ -319,10 +302,8 @@ namespace dwt_cuda {
      // }
      data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
    }
  }
  /// Scales elements in specified column.
  /// @param evenScale     scaling factor for vertically even elements
  /// @param oddScale      scaling factor for vertically odd elements
@ -341,33 +322,17 @@ namespace dwt_cuda {
    }
  }
  //****************For Test(Feb23), test inter parameters*************
  __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
  __device__ int getSHM_BANKS() { return SHM_BANKS; }
  __device__ int getBuffersize() { return BUFFER_SIZE; }
  __device__ int getPADDING() { return PADDING; }
  __device__ int getODD_OFFSET() { return ODD_OFFSET; }
  //****************For Test(Feb23), test inter parameters*************
 	__device__ int getVERTICAL_STRIDE(){
 		return VERTICAL_STRIDE;
 	}
 	__device__ int getSHM_BANKS(){
 		return SHM_BANKS;
 	}
 	__device__ int  getBuffersize(){		
 		return BUFFER_SIZE;
 	}
 	__device__ int getPADDING(){
 		return PADDING;
 	}
 	__device__ int getODD_OFFSET(){
 		return ODD_OFFSET;
 	}
    //****************For Test(Feb23), test inter parameters*************
 }; // end of class TransformBuffer
 } // namespace dwt_cuda
 #endif // TRANSFORM_BUFFER_H
--- a/examples/dwt2d/run_cpu.sh
+++ b/examples/dwt2d/run_cpu.sh
@ -5,4 +5,3 @@
 ./dwt2d 4.bmp  -d 4x4 -r -5 -l 3
 # ./dwt2d 4.bmp  -d 4x4 -r -9 -l 3
 # ./dwt2d 8.bmp  -d 8x8 -f -9 -l 3
--- a/examples/dwt2d/test_compile_nvcc.sh
+++ b/examples/dwt2d/test_compile_nvcc.sh
@ -7,12 +7,3 @@
 /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
 /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
 g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
--- a/examples/microbench/cudamemcpy_test.cc
+++ b/examples/microbench/cudamemcpy_test.cc
@ -1,14 +1,12 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(int n, float a, float *x, float *y) {
 void saxpy(int n, float a, float *x, float *y)
 {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < n) y[i] = a*x[i] + y[i];
+  if (i < n)
    y[i] = a * x[i] + y[i];
 }
-int main(void)
+int main(void) {
 {
  int N = 1 << 20;
  float *x, *y, *d_x, *d_y;
  x = (float *)malloc(N * sizeof(float));
--- a/examples/microbench/dummy_kernel.cc
+++ b/examples/microbench/dummy_kernel.cc
@ -1,14 +1,11 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(void) {
 void saxpy(void)
 {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  printf("block_id:%d thread_id:%d \n", i)
 }
-int main(void)
+int main(void) {
 {
  int N = 1 << 20;
  float *x, *y, *d_x, *d_y;
  x = (float *)malloc(N * sizeof(float));
--- a/examples/microbench/kerne_arg.cc
+++ b/examples/microbench/kerne_arg.cc
@ -1,13 +1,8 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(int N) { printf("hello!: %d\n", N); }
 void saxpy(int N)
 {
 printf("hello!: %d\n", N);
 }
-int main(void)
+int main(void) {
 {
  int N = 1 << 20;
  float *x, *y, *d_x, *d_y;
  x = (float *)malloc(N * sizeof(float));
--- a/examples/microbench/one_thread_kernel.cc
+++ b/examples/microbench/one_thread_kernel.cc
@ -1,13 +1,8 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(void) { printf("hello!\n"); }
 void saxpy(void)
 {
 printf("hello!\n");
 }
-int main(void)
+int main(void) {
 {
  int N = 1 << 20;
  float *x, *y, *d_x, *d_y;
  x = (float *)malloc(N * sizeof(float));