Merge pull request #10 from unahb/master

fix linting issues
2022-05-25 01:12:33 +00:00 · 2022-05-25 01:12:33 +00:00 · 197abc867d
parent d834f31626 308e9284cb
commit 197abc867d
27 changed files with 1246 additions and 1384 deletions
--- a/.gitignore
+++ b/.gitignore
@ -44,4 +44,4 @@ CMakeCache.txt
 # OS generated files
 .DS_Store
-.DS_Store?
+.DS_Store?
--- a/README.md
+++ b/README.md
@ -27,11 +27,11 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS
   export CuPBoP_PATH=`pwd`
   export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH
   ```
 If you are using boson, you can pre-installed llvm 10.0.0 
- LLVM_PATH=/opt/llvm-10.0.0
+   If you are using boson, you can pre-installed llvm 10.0.0\
- export PATH=$LLVM_PATH/bin:$PATH
+   `LLVM_PATH=/opt/llvm-10.0.0`\
- 
+   `export PATH=$LLVM_PATH/bin:$PATH`
 2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file
   ```bash
--- a/compilation/KernelTranslation/lib/init.cpp
+++ b/compilation/KernelTranslation/lib/init.cpp
@ -396,7 +396,7 @@ void init_block(llvm::Module *M, std::ofstream &fout) {
  replace_asm_call(M);
  // replace dynamic shared memory
  auto dynamic_shared_memory_addr =
-        M->getGlobalVariable("dynamic_shared_memory");
+      M->getGlobalVariable("dynamic_shared_memory");
  if (dynamic_shared_memory_addr) {
    replace_dynamic_shared_memory(M);
  }
--- a/compilation/KernelTranslation/lib/insert_warp_loop.cpp
+++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp
@ -272,13 +272,12 @@ void AddContextSaveRestore(llvm::Instruction *instruction,
  std::vector<Instruction *> uses;
  Function *f2 = instruction->getParent()->getParent();
  for (Instruction::use_iterator ui = instruction->use_begin(),
                                 ue = instruction->use_end();
       ui != ue; ++ui) {
    llvm::Instruction *user = cast<Instruction>(ui->getUser());
    Function *f1 = user->getParent()->getParent();
-    if(f2->getName() != f1->getName()) {
+    if (f2->getName() != f1->getName()) {
      continue;
    }
    if (user == NULL)
--- a/compilation/KernelTranslation/lib/memory_hierarchy.cpp
+++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp
@ -89,20 +89,20 @@ void mem_share2global(llvm::Module *M) {
          } else if (element_type->isStructTy()) {
            auto undef = llvm::UndefValue::get(element_type);
            llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
-                *M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef,
+                *M, element_type, false, llvm::GlobalValue::ExternalLinkage,
-                new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
+                undef, new_name, NULL,
-                false);
+                llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
            global_memory->setDSOLocal(true);
-            Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName()));
+            Comdat *comdat =
                M->getOrInsertComdat(StringRef(share_memory->getName()));
            comdat->setSelectionKind(Comdat::SelectionKind::Any);
            global_memory->setComdat(comdat);
            global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
            global_memory->setInitializer(undef);
            global_memory->setAlignment(share_memory->getAlignment());
            corresponding_global_memory.insert(
-                std::pair<GlobalVariable *, GlobalVariable *>(share_memory, 
+                std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
-                global_memory));
+                                                              global_memory));
          } else {
            assert(0 && "The required Share Memory Type is not supported\n");
--- a/examples/dwt2d/common.h
+++ b/examples/dwt2d/common.h
@ -1,16 +1,16 @@
-/* 
+/*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
- * 
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
- * 
+ *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
- * 
+ *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -27,9 +27,9 @@
 #ifndef _COMMON_H
 #define _COMMON_H
-//24-bit multiplication is faster on G80,
+// 24-bit multiplication is faster on G80,
-//but we must be sure to multiply integers
+// but we must be sure to multiply integers
-//only within [-8M, 8M - 1] range
+// only within [-8M, 8M - 1] range
 #define IMUL(a, b) __mul24(a, b)
 ////cuda timing macros
@ -42,21 +42,23 @@
 //                          cudaEventSynchronize(cstop); \
 //                          cudaEventElapsedTime(&elapsedTime, cstart, cstop)
-//divide and round up macro
+// divide and round up macro
 #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
-#  define cudaCheckError( msg ) {                                            \
+#define cudaCheckError(msg)                                                    \
-    cudaError_t err = cudaGetLastError();                                    \
+  {                                                                            \
-    if( cudaSuccess != err) {                                                \
+    cudaError_t err = cudaGetLastError();                                      \
-        fprintf(stderr, "%s: %i: %s: %s.\n",                                 \
+    if (cudaSuccess != err) {                                                  \
-                __FILE__, __LINE__, msg, cudaGetErrorString( err) );         \
+      fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg,            \
-        exit(-1);                                                            \
+              cudaGetErrorString(err));                                        \
-    } }
+      exit(-1);                                                                \
-
+    }                                                                          \
-#  define cudaCheckAsyncError( msg ) {                                       \
+  }
    cudaThreadSynchronize();                                                 \
    cudaCheckError( msg );                                                   \
    }
 #define cudaCheckAsyncError(msg)                                               \
  {                                                                            \
    cudaThreadSynchronize();                                                   \
    cudaCheckError(msg);                                                       \
  }
 #endif
--- a/examples/dwt2d/components.cu
+++ b/examples/dwt2d/components.cu
@ -1,16 +1,16 @@
-/* 
+/*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
- * 
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
- * 
+ *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
- * 
+ *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -23,7 +23,7 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
- 
+
 #include <unistd.h>
 #include <error.h>
 #include <stdio.h>
@ -50,7 +50,7 @@ __device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int
    d_r[pos] = r - 128;
    d_g[pos] = g - 128;
    d_b[pos] = b - 128;
-} 
+}
 /* Store float component */
 __device__ void storeComponent(float *d_c, float c, int pos)
@ -66,8 +66,8 @@ __device__ void storeComponent(int *d_c, int c, int pos)
 /* Copy img src data into three separated component buffers */
 template<typename T>
-__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b, 
+__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
-                                  unsigned char * d_src, 
+                                  unsigned char * d_src,
                                  int pixels)
 {
    int x  = threadIdx.x;
@ -75,8 +75,8 @@ __global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
    __shared__ unsigned char sData[THREADS*3];
-    /* Copy data to shared mem by 4bytes 
+    /* Copy data to shared mem by 4bytes
-       other checks are not necessary, since 
+       other checks are not necessary, since
       d_src buffer is aligned to sharedDataSize */
    if ( (x*4) < THREADS*3 ) {
        float *s = (float *)d_src;
@ -107,8 +107,8 @@ __global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
    __shared__ unsigned char sData[THREADS];
-    /* Copy data to shared mem by 4bytes 
+    /* Copy data to shared mem by 4bytes
-       other checks are not necessary, since 
+       other checks are not necessary, since
       d_src buffer is aligned to sharedDataSize */
    if ( (x*4) < THREADS) {
        float *s = (float *)d_src;
--- a/examples/dwt2d/components.h
+++ b/examples/dwt2d/components.h
@ -1,16 +1,16 @@
-/* 
+/*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
- * 
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
- * 
+ *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
- * 
+ *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -28,11 +28,12 @@
 #define _COMPONENTS_H
 /* Separate compoents of source 8bit RGB image */
-template<typename T>
+template <typename T>
-void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height);
+void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
                     int height);
 /* Copy a 8bit source image data into a color compoment of type T */
-template<typename T>
+template <typename T>
-void bwToComponent(T *d_c, unsigned char * src, int width, int height);
+void bwToComponent(T *d_c, unsigned char *src, int width, int height);
 #endif
--- a/examples/dwt2d/dwt.cu
+++ b/examples/dwt2d/dwt.cu
@ -1,16 +1,16 @@
-/* 
+/*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
- * 
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
- * 
+ *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
- * 
+ *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -85,25 +85,25 @@ template<typename T>
 int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
 {
    printf("\n*** %d stages of 2D forward DWT:\n", stages);
-    
+
    /* create backup of input, because each test iteration overwrites it */
    const int size = pixHeight * pixWidth * sizeof(T);
    cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
    cudaCheckError("Memcopy device to device");
-    
+
    /* Measure time of individual levels. */
    if(forward)
        fdwt(in, out, pixWidth, pixHeight, stages);
    else
        rdwt(in, out, pixWidth, pixHeight, stages);
-    
+
-    // Measure overall time of DWT. 
+    // Measure overall time of DWT.
 /*    #ifdef GPU_DWT_TESTING_1
-	
+
    dwt_cuda::CudaDWTTester tester;
    for(int i = tester.getNumIterations(); i--; ) {
-        // Recover input and measure one overall DWT run. 
+        // Recover input and measure one overall DWT run.
-        cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); 
+        cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
        cudaCheckError("Memcopy device to device");
        tester.beginTestIteration();
        if(forward)
@ -113,8 +113,8 @@ int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int st
        tester.endTestIteration();
    }
    tester.showPerformance("   Overall DWT", pixWidth, pixHeight);
-    #endif  // GPU_DWT_TESTING 
+    #endif  // GPU_DWT_TESTING
-    
+
    cudaCheckAsyncError("DWT Kernel calls");
 */    return 0;
 }
@ -128,25 +128,25 @@ template<typename T>
 int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
 {
    printf("*** %d stages of 2D forward DWT:\n", stages);
-    
+
-    // create backup of input, because each test iteration overwrites it 
+    // create backup of input, because each test iteration overwrites it
    const int size = pixHeight * pixWidth * sizeof(T);
    cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
    cudaCheckError("Memcopy device to device");
-    
+
-    // Measure time of individual levels. 
+    // Measure time of individual levels.
    if(forward)
        fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
    else
        rdwt(in, out, pixWidth, pixHeight, stages);
-    
+
-    // Measure overall time of DWT. 
+    // Measure overall time of DWT.
    #ifdef GPU_DWT_TESTING_1
-	
+
    dwt_cuda::CudaDWTTester tester;
    for(int i = tester.getNumIterations(); i--; ) {
-        // Recover input and measure one overall DWT run. 
+        // Recover input and measure one overall DWT run.
-        cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); 
+        cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
        cudaCheckError("Memcopy device to device");
        tester.beginTestIteration();
        if(forward)
@ -156,8 +156,8 @@ int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int st
        tester.endTestIteration();
    }
    tester.showPerformance("   Overall DWT", pixWidth, pixHeight);
-    #endif  // GPU_DWT_TESTING 
+    #endif  // GPU_DWT_TESTING
-    
+
    cudaCheckAsyncError("DWT Kernel calls");
    return 0;
 }
@ -178,8 +178,8 @@ void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char
    for(i = 0; i < samplesNum; i++) {
        float r = (src[i]+0.5f) * 255;
-        if (r > 255) r = 255; 
+        if (r > 255) r = 255;
-        if (r < 0)   r = 0; 
+        if (r < 0)   r = 0;
        dst[i] = (unsigned char)r;
        outputFile << "index: " << i  << " val: "<< r <<" \n";
@ -199,7 +199,7 @@ void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char *
    for(i = 0; i < samplesNum; i++) {
        int r = src[i]+128;
        if (r > 255) r = 255;
-        if (r < 0)   r = 0; 
+        if (r < 0)   r = 0;
        dst[i] = (unsigned char)r;
        // added this line to output check
        outputFile << "index: " << i  << " val: "<< r <<" \n";
@ -250,16 +250,16 @@ int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
    if(x == 0) return 1;
    return 0;
 }
-template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); 
+template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
-template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); 
+template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
 /* Write output visual ordered */
 template<typename T>
-int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, 
+int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
-                     int stages, const char * filename, const char * suffix) 
+                     int stages, const char * filename, const char * suffix)
 {
    struct band {
-        int dimX; 
+        int dimX;
        int dimY;
    };
    struct dimensions {
@ -309,7 +309,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
        printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
    }
 #endif
-    
+
    size = samplesNum*sizeof(T);
    cudaMallocHost((void **)&src, size);
    cudaCheckError("Malloc host");
@ -332,7 +332,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
        offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
        for (i = 0; i < bandDims[s].HL.dimY; i++) {
            memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
-                src+offset+i*bandDims[s].HL.dimX, 
+                src+offset+i*bandDims[s].HL.dimX,
                size);
        }
@ -342,7 +342,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
        yOffset = bandDims[s].LL.dimY;
        for (i = 0; i < bandDims[s].HL.dimY; i++) {
            memcpy(dst+(yOffset+i)*pixWidth,
-                src+offset+i*bandDims[s].LH.dimX, 
+                src+offset+i*bandDims[s].LH.dimX,
                size);
        }
@ -352,7 +352,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
        yOffset = bandDims[s].HL.dimY;
        for (i = 0; i < bandDims[s].HH.dimY; i++) {
            memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
-                src+offset+i*bandDims[s].HH.dimX, 
+                src+offset+i*bandDims[s].HH.dimX,
                size);
        }
    }
@ -381,5 +381,5 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
    if (x == 0) return 1;
    return 0;
 }
-template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); 
+template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
-template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); 
+template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
--- a/examples/dwt2d/dwt.h
+++ b/examples/dwt2d/dwt.h
@ -1,16 +1,16 @@
-/* 
+/*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
- * 
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
- * 
+ *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
- * 
+ *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -27,14 +27,15 @@
 #ifndef _DWT_H
 #define _DWT_H
-template<typename T> 
+template <typename T>
-int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
+int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
                int stages, bool forward);
-template<typename T>
+template <typename T>
-int writeNStage2DDWT(T *component_cuda, int width, int height, 
+int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
-                     int stages, const char * filename, const char * suffix);
+                     const char *filename, const char *suffix);
-template<typename T>
+template <typename T>
-int writeLinear(T *component_cuda, int width, int height, 
+int writeLinear(T *component_cuda, int width, int height, const char *filename,
-                     const char * filename, const char * suffix);
+                const char *suffix);
 #endif
--- a/examples/dwt2d/dwt_cuda/common.cu
+++ b/examples/dwt2d/dwt_cuda/common.cu
@ -1,20 +1,20 @@
-///  
+///
 /// @file    common.cu
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-01-20 14:37
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -27,7 +27,7 @@
 /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
- 
+
 #include "common.h"
 namespace dwt_cuda {
--- a/examples/dwt2d/dwt_cuda/common.h
+++ b/examples/dwt2d/dwt_cuda/common.h
@ -1,4 +1,4 @@
-///  
+///
 /// @file    common.h
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @brief   Common stuff for all CUDA dwt functions.
@ -6,16 +6,16 @@
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -29,233 +29,204 @@
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef DWT_COMMON_H
-#define	DWT_COMMON_H
+#define DWT_COMMON_H
 #include <cstdio>
 #include <algorithm>
 #include <cstdio>
 #include <vector>
 // compile time minimum macro
-#define CTMIN(a,b) (((a) < (b)) ? (a) : (b))
+#define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
 // performance testing macros
 #if defined(GPU_DWT_TESTING)
-  #define PERF_BEGIN  \
+#define PERF_BEGIN                                                             \
-  { \
+  {                                                                            \
-    dwt_cuda::CudaDWTTester PERF_TESTER; \
+    dwt_cuda::CudaDWTTester PERF_TESTER;                                       \
-    for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \
+    for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) {             \
    { \
      PERF_TESTER.beginTestIteration();
-  #define PERF_END(PERF_NAME, PERF_W, PERF_H)  \
+#define PERF_END(PERF_NAME, PERF_W, PERF_H)                                    \
-      PERF_TESTER.endTestIteration(); \
+  PERF_TESTER.endTestIteration();                                              \
-    } \
+  }                                                                            \
-    PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
+  PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H);                      \
  }
 #else // GPU_DWT_TESTING
-  #define PERF_BEGIN
+#define PERF_BEGIN
-  #define PERF_END(PERF_NAME, PERF_W, PERF_H)
+#define PERF_END(PERF_NAME, PERF_W, PERF_H)
 #endif // GPU_DWT_TESTING
 namespace dwt_cuda {
-  
+
-  
+/// Divide and round up.
-  /// Divide and round up.
+template <typename T>
-  template <typename T>
+__device__ __host__ inline T divRndUp(const T &n, const T &d) {
-  __device__ __host__ inline T divRndUp(const T & n, const T & d) {
+  return (n / d) + ((n % d) ? 1 : 0);
-    return (n / d) + ((n % d) ? 1 : 0);
+}
 // 9/7 forward DWT lifting schema coefficients
 const float f97Predict1 = -1.586134342;  ///< forward 9/7 predict 1
 const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
 const float f97Predict2 = 0.8829110762;  ///< forward 9/7 predict 2
 const float f97Update2 = 0.4435068522;   ///< forward 9/7 update 2
 // 9/7 reverse DWT lifting schema coefficients
 const float r97update2 = -f97Update2;   ///< undo 9/7 update 2
 const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
 const float r97update1 = -f97Update1;   ///< undo 9/7 update 1
 const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
 // FDWT 9/7 scaling coefficients
 const float scale97Mul = 1.23017410491400f;
 const float scale97Div = 1.0 / scale97Mul;
 // 5/3 forward DWT lifting schema coefficients
 const float forward53Predict = -0.5f; /// forward 5/3 predict
 const float forward53Update = 0.25f;  /// forward 5/3 update
 // 5/3 forward DWT lifting schema coefficients
 const float reverse53Update = -forward53Update;   /// undo 5/3 update
 const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
 /// Functor which adds scaled sum of neighbors to given central pixel.
 struct AddScaledSum {
  const float scale; // scale of neighbors
  __device__ AddScaledSum(const float scale) : scale(scale) {}
  __device__ void operator()(const float p, float &c, const float n) const {
    // if(threadIdx.x == 0) {
    //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
    //   scale * (p + n) );
    // }
    c += scale * (p + n);
  }
-  
+};
  // 9/7 forward DWT lifting schema coefficients
  const float f97Predict1 = -1.586134342;   ///< forward 9/7 predict 1
  const float f97Update1 = -0.05298011854;  ///< forward 9/7 update 1
  const float f97Predict2 = 0.8829110762;   ///< forward 9/7 predict 2
  const float f97Update2 = 0.4435068522;    ///< forward 9/7 update 2
 /// Returns index ranging from 0 to num threads, such that first half
 /// of threads get even indices and others get odd indices. Each thread
 /// gets different index.
 /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
 ///                              parityIdx:   0  2  4  6  1  3  5  7
 /// @tparam THREADS  total count of participating threads
 /// @return parity-separated index of thread
 template <int THREADS> __device__ inline int parityIdx() {
  return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
 }
-  // 9/7 reverse DWT lifting schema coefficients
+/// size of shared memory
-  const float r97update2 = -f97Update2;    ///< undo 9/7 update 2
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
-  const float r97predict2 = -f97Predict2;  ///< undo 9/7 predict 2
+const int SHM_SIZE = 48 * 1024;
-  const float r97update1 = -f97Update1;    ///< undo 9/7 update 1
+#else
-  const float r97Predict1 = -f97Predict1;  ///< undo 9/7 predict 1
+const int SHM_SIZE = 16 * 1024;
-  
+#endif
  // FDWT 9/7 scaling coefficients
  const float scale97Mul = 1.23017410491400f;
  const float scale97Div = 1.0 / scale97Mul;
  // 5/3 forward DWT lifting schema coefficients
  const float forward53Predict = -0.5f;   /// forward 5/3 predict
  const float forward53Update = 0.25f;    /// forward 5/3 update
  // 5/3 forward DWT lifting schema coefficients
  const float reverse53Update = -forward53Update;    /// undo 5/3 update
  const float reverse53Predict = -forward53Predict;  /// undo 5/3 predict
  /// Functor which adds scaled sum of neighbors to given central pixel.
  struct AddScaledSum {
    const float scale;  // scale of neighbors
    __device__ AddScaledSum(const float scale) : scale(scale) {}
    __device__ void operator()(const float p, float & c, const float n) const {
-      // if(threadIdx.x == 0) {
+/// Perrformance and return code tester.
-
+class CudaDWTTester {
-      //   printf("scale  %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
+private:
-      
+  static bool testRunning;  ///< true if any test is currently running
-      // }
+  cudaEvent_t beginEvent;   ///< begin CUDA event
-      
+  cudaEvent_t endEvent;     ///< end CUDA event
-      c += scale * (p + n);
+  std::vector<float> times; ///< collected times
  const bool disabled;      ///< true if this object is disabled
 public:
  /// Checks CUDA related error.
  /// @param status   return code to be checked
  /// @param message  message to be shown if there was an error
  /// @return true if there was no error, false otherwise
  static bool check(const cudaError_t &status, const char *message) {
 #if defined(GPU_DWT_TESTING)
    if ((!testRunning) && status != cudaSuccess) {
      const char *errorString = cudaGetErrorString(status);
      fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
      fflush(stderr);
      return false;
    }
-  };
+#endif // GPU_DWT_TESTING
-  
+    return true;
  /// Returns index ranging from 0 to num threads, such that first half
  /// of threads get even indices and others get odd indices. Each thread
  /// gets different index.
  /// Example: (for 8 threads)   threadIdx.x:   0  1  2  3  4  5  6  7
  ///                              parityIdx:   0  2  4  6  1  3  5  7
  /// @tparam THREADS  total count of participating threads
  /// @return parity-separated index of thread
  template <int THREADS>
  __device__ inline int parityIdx() {
    return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
  }
  /// size of shared memory
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
  const int SHM_SIZE = 48 * 1024;
  #else
  const int SHM_SIZE = 16 * 1024;
  #endif
  /// Perrformance and return code tester.
  class CudaDWTTester {
  private:
    static bool testRunning;    ///< true if any test is currently running
    cudaEvent_t beginEvent;     ///< begin CUDA event
    cudaEvent_t endEvent;       ///< end CUDA event
    std::vector<float> times;   ///< collected times
    const bool disabled;        ///< true if this object is disabled
  public:
    /// Checks CUDA related error.
    /// @param status   return code to be checked
    /// @param message  message to be shown if there was an error
    /// @return true if there was no error, false otherwise
    static bool check(const cudaError_t & status, const char * message) {
      #if defined(GPU_DWT_TESTING)
      if((!testRunning) && status != cudaSuccess) {
        const char * errorString = cudaGetErrorString(status);
        fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
        fflush(stderr);
        return false;
      }
      #endif // GPU_DWT_TESTING
      return true;
    }
-    /// Checks last kernel call for errors.
+  /// Checks last kernel call for errors.
-    /// @param message  description of the kernel call
+  /// @param message  description of the kernel call
-    /// @return true if there was no error, false otherwise
+  /// @return true if there was no error, false otherwise
-    static bool checkLastKernelCall(const char * message) {
+  static bool checkLastKernelCall(const char *message) {
-      #if defined(GPU_DWT_TESTING)
+#if defined(GPU_DWT_TESTING)
-      return testRunning ? true : check(cudaThreadSynchronize(), message);
+    return testRunning ? true : check(cudaThreadSynchronize(), message);
-      #else // GPU_DWT_TESTING
+#else  // GPU_DWT_TESTING
-      return true;
+    return true;
-      #endif // GPU_DWT_TESTING
+#endif // GPU_DWT_TESTING
    }
    /// Initializes DWT tester for time measurement
    CudaDWTTester() : disabled(testRunning) {}
    /// Gets rpefered number of iterations
    int getNumIterations() {
      return disabled ? 1 : 31;
    }
    /// Starts one test iteration.
    void beginTestIteration() {
      if(!disabled) {
        cudaEventCreate(&beginEvent);
        cudaEventCreate(&endEvent);
        cudaEventRecord(beginEvent, 0);
        testRunning = true;
      }
    }
    /// Ends on etest iteration.
    void endTestIteration() {
      if(!disabled) {
        float time;
        testRunning = false;
        cudaEventRecord(endEvent, 0);
        cudaEventSynchronize(endEvent);
        cudaEventElapsedTime(&time, beginEvent, endEvent);
        cudaEventDestroy(beginEvent);
        cudaEventDestroy(endEvent);
        times.push_back(time);
      }
    }
    /// Shows brief info about all iterations.
    /// @param name   name of processing method
    /// @param sizeX  width of processed image
    /// @param sizeY  height of processed image
    void showPerformance(const char * name, const int sizeX, const int sizeY) {
      if(!disabled) {
        // compute mean and median
        std::sort(times.begin(), times.end());
        double sum = 0;
        for(int i = times.size(); i--; ) {
          sum += times[i];
        }
        const double median = (times[times.size() / 2]
                             + times[(times.size() - 1) / 2]) * 0.5f;
        printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  "
               "(%d x %d)\n", name, (sum / times.size()), median, 
               times[times.size() - 1], sizeX, sizeY);
      }
    }
  };
  /// Simple cudaMemcpy wrapped in performance tester.
  /// @param dest  destination bufer
  /// @param src   source buffer
  /// @param sx    width of copied image
  /// @param sy    height of copied image
  template <typename T>
  inline void memCopy(T * const dest, const T * const src,
                      const size_t sx, const size_t sy) {
    cudaError_t status;
    PERF_BEGIN
    status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice);
    PERF_END("        memcpy", sx, sy)
    CudaDWTTester::check(status, "memcpy device > device");
  }
-  
+
-  
+  /// Initializes DWT tester for time measurement
-  
+  CudaDWTTester() : disabled(testRunning) {}
  /// Gets rpefered number of iterations
  int getNumIterations() { return disabled ? 1 : 31; }
  /// Starts one test iteration.
  void beginTestIteration() {
    if (!disabled) {
      cudaEventCreate(&beginEvent);
      cudaEventCreate(&endEvent);
      cudaEventRecord(beginEvent, 0);
      testRunning = true;
    }
  }
  /// Ends on etest iteration.
  void endTestIteration() {
    if (!disabled) {
      float time;
      testRunning = false;
      cudaEventRecord(endEvent, 0);
      cudaEventSynchronize(endEvent);
      cudaEventElapsedTime(&time, beginEvent, endEvent);
      cudaEventDestroy(beginEvent);
      cudaEventDestroy(endEvent);
      times.push_back(time);
    }
  }
  /// Shows brief info about all iterations.
  /// @param name   name of processing method
  /// @param sizeX  width of processed image
  /// @param sizeY  height of processed image
  void showPerformance(const char *name, const int sizeX, const int sizeY) {
    if (!disabled) {
      // compute mean and median
      std::sort(times.begin(), times.end());
      double sum = 0;
      for (int i = times.size(); i--;) {
        sum += times[i];
      }
      const double median =
          (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
      printf("  %s:   %7.3f ms (mean)   %7.3f ms (median)   %7.3f ms (max)  "
             "(%d x %d)\n",
             name, (sum / times.size()), median, times[times.size() - 1], sizeX,
             sizeY);
    }
  }
 };
 /// Simple cudaMemcpy wrapped in performance tester.
 /// @param dest  destination bufer
 /// @param src   source buffer
 /// @param sx    width of copied image
 /// @param sy    height of copied image
 template <typename T>
 inline void memCopy(T *const dest, const T *const src, const size_t sx,
                    const size_t sy) {
  cudaError_t status;
  PERF_BEGIN
  status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice);
  PERF_END("        memcpy", sx, sy)
  CudaDWTTester::check(status, "memcpy device > device");
 }
 } // end of namespace dwt_cuda
-
+#endif // DWT_COMMON_CUDA_H
 #endif	// DWT_COMMON_CUDA_H
--- a/examples/dwt2d/dwt_cuda/dwt.h
+++ b/examples/dwt2d/dwt_cuda/dwt.h
@ -1,4 +1,4 @@
-/// 
+///
 /// @file    dwt.h
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @brief   Entry points for CUDA implementaion of 9/7 and 5/3 DWT.
@ -8,16 +8,16 @@
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -56,57 +56,48 @@
 ///
 #ifndef DWT_CUDA_H
-#define	DWT_CUDA_H
+#define DWT_CUDA_H
 namespace dwt_cuda {
-  
+
-  
+/// Forward 5/3 2D DWT. See common rules (above) for more details.
-  /// Forward 5/3 2D DWT. See common rules (above) for more details.
+/// @param in      Expected to be normalized into range [-128, 127].
-  /// @param in      Expected to be normalized into range [-128, 127].
+///                Will not be preserved (will be overwritten).
-  ///                Will not be preserved (will be overwritten).
+/// @param out     output buffer on GPU
-  /// @param out     output buffer on GPU
+/// @param sizeX   width of input image (in pixels)
-  /// @param sizeX   width of input image (in pixels)
+/// @param sizeY   height of input image (in pixels)
-  /// @param sizeY   height of input image (in pixels)
+/// @param levels  number of recursive DWT levels
-  /// @param levels  number of recursive DWT levels
+void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
-  void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
+
-  
+/// Reverse 5/3 2D DWT. See common rules (above) for more details.
-  
+/// @param in      Input DWT coefficients. Format described in common rules.
-  /// Reverse 5/3 2D DWT. See common rules (above) for more details.
+///                Will not be preserved (will be overwritten).
-  /// @param in      Input DWT coefficients. Format described in common rules.
+/// @param out     output buffer on GPU - will contain original image
-  ///                Will not be preserved (will be overwritten).
+///                in normalized range [-128, 127].
-  /// @param out     output buffer on GPU - will contain original image
+/// @param sizeX   width of input image (in pixels)
-  ///                in normalized range [-128, 127].
+/// @param sizeY   height of input image (in pixels)
-  /// @param sizeX   width of input image (in pixels)
+/// @param levels  number of recursive DWT levels
-  /// @param sizeY   height of input image (in pixels)
+void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
-  /// @param levels  number of recursive DWT levels
+
-  void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
+/// Forward 9/7 2D DWT. See common rules (above) for more details.
-  
+/// @param in      Input DWT coefficients. Should be normalized (in range
-  
+///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
-  /// Forward 9/7 2D DWT. See common rules (above) for more details.
+/// @param out     output buffer on GPU - format specified in common rules
-  /// @param in      Input DWT coefficients. Should be normalized (in range 
+/// @param sizeX   width of input image (in pixels)
-  ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
+/// @param sizeY   height of input image (in pixels)
-  /// @param out     output buffer on GPU - format specified in common rules
+/// @param levels  number of recursive DWT levels
-  /// @param sizeX   width of input image (in pixels)
+void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
-  /// @param sizeY   height of input image (in pixels)
+
-  /// @param levels  number of recursive DWT levels
+/// Reverse 9/7 2D DWT. See common rules (above) for more details.
-  void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
+/// @param in      Input DWT coefficients. Format described in common rules.
-  
+///                Will not be preserved (will be overwritten).
-  
+/// @param out     output buffer on GPU - will contain original image
-  /// Reverse 9/7 2D DWT. See common rules (above) for more details.
+///                in normalized range [-0.5, 0.5].
-  /// @param in      Input DWT coefficients. Format described in common rules.
+/// @param sizeX   width of input image (in pixels)
-  ///                Will not be preserved (will be overwritten).
+/// @param sizeY   height of input image (in pixels)
-  /// @param out     output buffer on GPU - will contain original image
+/// @param levels  number of recursive DWT levels
-  ///                in normalized range [-0.5, 0.5].
+void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
-  /// @param sizeX   width of input image (in pixels)
+
  /// @param sizeY   height of input image (in pixels)
  /// @param levels  number of recursive DWT levels
  void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
 } // namespace dwt_cuda
-
+#endif // DWT_CUDA_H
 #endif	// DWT_CUDA_H
--- a/examples/dwt2d/dwt_cuda/fdwt53.cu
+++ b/examples/dwt2d/dwt_cuda/fdwt53.cu
@ -6,16 +6,16 @@
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -44,7 +44,7 @@ namespace dwt_cuda {
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class FDWT53 {
  private:
-    
+
    /// Info needed for processing of one input column.
    /// @tparam CHECKED_LOADER  true if column's loader should check boundaries
    ///                         false if there are no near boudnaries to check
@ -52,13 +52,13 @@ namespace dwt_cuda {
    struct FDWT53Column {
      /// loader for the column
      VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;
-      
+
      /// offset of the column in shared buffer
-      int offset;                   
+      int offset;
-      
+
      // backup of first 3 loaded pixels (not transformed)
      int pixel0, pixel1, pixel2;
-      
+
      /// Sets all fields to anything to prevent 'uninitialized' warnings.
      __device__ void clear() {
        offset = pixel0 = pixel1 = pixel2 = 0;
@ -104,7 +104,7 @@ namespace dwt_cuda {
    /// @param colIndex  x-axis coordinate of the column (relative to the left
    ///                  side of this threadblock's block of input pixels)
    /// @param firstY    y-axis coordinate of first image row to be transformed
-	
+
 	template <bool CHECKED>
    __device__ void initColumn(FDWT53Column<CHECKED> & column,
                               const int * const input,
@ -137,7 +137,7 @@ namespace dwt_cuda {
        column.pixel2 = column.loader.loadFrom(input);
        // Now, the next pixel, which will be loaded by loader, is pixel #1.
      }
-		
+
 	}
@ -153,14 +153,14 @@ namespace dwt_cuda {
      buffer[column.offset + 0 * STRIDE] = column.pixel0;
      buffer[column.offset + 1 * STRIDE] = column.pixel1;
      buffer[column.offset + 2 * STRIDE] = column.pixel2;
-	
+
      // load remaining pixels to be able to vertically transform the window
-      for(int i = 3; i < (3 + WIN_SIZE_Y); i++) 
+      for(int i = 3; i < (3 + WIN_SIZE_Y); i++)
      {
        buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
      }
- 
+
      // remember last 3 pixels for use in next iteration
      column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE];
      column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE];
@ -169,7 +169,7 @@ namespace dwt_cuda {
      // vertically transform the column in transform buffer
 	  buffer.forEachVerticalOdd(column.offset, Forward53Predict());
      buffer.forEachVerticalEven(column.offset, Forward53Update());
-	  
+
    }
@ -178,7 +178,7 @@ namespace dwt_cuda {
    /// @tparam CHECK_WRITES  true if output writer must check boundaries
    /// @param in        input image
    /// @param out       output buffer
-    /// @param sizeX     width of the input image 
+    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param winSteps  number of sliding window steps
    template <bool CHECK_LOADS, bool CHECK_WRITES>
@ -186,15 +186,15 @@ namespace dwt_cuda {
                              const int sizeX, const int sizeY,
                              const int winSteps) {
      // info about one main and one boundary columns processed by this thread
-      FDWT53Column<CHECK_LOADS> column;    
+      FDWT53Column<CHECK_LOADS> column;
      FDWT53Column<CHECK_LOADS> boundaryColumn;  // only few threads use this
-      // Initialize all column info: initialize loaders, compute offset of 
+      // Initialize all column info: initialize loaders, compute offset of
      // column in shared buffer and initialize loader of column.
      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
 	    initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); //has been checked Mar 9th
-	  
+
      // first 3 threads initialize boundary columns, others do not use them
      boundaryColumn.clear();
      if(threadIdx.x < 3) {
@ -205,9 +205,9 @@ namespace dwt_cuda {
        initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
      }
-	  
+
-	  
+
-      // index of column which will be written into output by this thread      
+      // index of column which will be written into output by this thread
 	  const int outColumnIndex = parityIdx<WIN_SIZE_X>();
      // offset of column which will be written by this thread into output
@ -219,7 +219,7 @@ namespace dwt_cuda {
 	    writer.init(sizeX, sizeY, outputFirstX, firstY);
 			__syncthreads();
-	  
+
      // Sliding window iterations:
      // Each iteration assumes that first 3 pixels of each column are loaded.
     for(int w = 0; w < winSteps; w++) {
@ -227,23 +227,23 @@ namespace dwt_cuda {
 	 // For each column (including boundary columns): load and vertically
        // transform another WIN_SIZE_Y lines.
        loadAndVerticallyTransform(column, in);
-        if(threadIdx.x < 3) { 
+        if(threadIdx.x < 3) {
-          loadAndVerticallyTransform(boundaryColumn, in); 
+          loadAndVerticallyTransform(boundaryColumn, in);
        }
- 		
+
        // wait for all columns to be vertically transformed and transform all
        // output rows horizontally
        __syncthreads();
-		
+
 		buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
        __syncthreads();
-		
+
        buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());
        // wait for all output rows to be transformed horizontally and write
        // them into output buffer
-        __syncthreads();	
+        __syncthreads();
        for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
@ -256,20 +256,20 @@ namespace dwt_cuda {
        // before proceeding to next iteration, wait for all output columns
        // to be written into the output
        __syncthreads();
-			
+
 	    }
-	
+
    }
-    
+
  public:
    /// Determines, whether this block's pixels touch boundary and selects
    /// right version of algorithm according to it - for many threadblocks, it
-    /// selects version which does not deal with boundary mirroring and thus is 
+    /// selects version which does not deal with boundary mirroring and thus is
    /// slightly faster.
    /// @param in     input image
    /// @param out    output buffer
-    /// @param sx     width of the input image 
+    /// @param sx     width of the input image
    /// @param sy     height of the input image
    /// @param steps  number of sliding window steps
    __device__ static void run(const int * const in, int * const out,
@ -292,32 +292,32 @@ namespace dwt_cuda {
      // if(threadIdx.x == 0) {
      //   printf("fdwt53 run");
      // }
-      if(atBottomBoudary) 
+      if(atBottomBoudary)
      {
        // near bottom boundary => check both writing and reading
        fdwt53.transform<true, true>(in, out, sx, sy, steps);
-      } else if(atRightBoudary) 
+      } else if(atRightBoudary)
      {
        // near right boundary only => check writing only
        fdwt53.transform<false, true>(in, out, sx, sy, steps);
-      } else 
+      } else
      {
        // no nearby boundary => check nothing
        fdwt53.transform<false, false>(in, out, sx, sy, steps);
      }
    }
    // }
-    
+
  }; // end of class FDWT53
-  
+
-  
+
-  
+
  /// Main GPU 5/3 FDWT entry point.
  /// @tparam WIN_SX   width of sliding window to be used
  /// @tparam WIN_SY   height of sliding window to be used
  /// @param input     input image
  /// @param output    output buffer
-  /// @param sizeX     width of the input image 
+  /// @param sizeX     width of the input image
  /// @param sizeY     height of the input image
  /// @param winSteps  number of sliding window steps
  template <int WIN_SX, int WIN_SY>
@ -328,20 +328,20 @@ namespace dwt_cuda {
    FDWT53<WIN_SX, WIN_SY>::run(input, output, sizeX, sizeY, winSteps);
  }
-  /// Only computes optimal number of sliding window steps, 
+
  /// Only computes optimal number of sliding window steps,
  /// number of threadblocks and then lanches the 5/3 FDWT kernel.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
-  /// @param sx       width of the input image 
+  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
    // compute optimal number of steps of each sliding window
-	
+
    const int steps = divRndUp(sy, 15 * WIN_SY);
 	int gx = divRndUp(sx, WIN_SX);
@ -352,18 +352,18 @@ namespace dwt_cuda {
    // prepare grid size
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
    // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
-    
+
    // run kernel, possibly measure time and finally check the call
    // PERF_BEGIN
    fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    // PERF_END("        FDWT53", sx, sy)
    // CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel");
    printf("fdwt53Kernel in launchFDWT53Kernel has finished");
-	
+
  }
-  
+
-  
+
-  
+
  /// Forward 5/3 2D DWT. See common rules (above) for more details.
  /// @param in      Expected to be normalized into range [-128, 127].
  ///                Will not be preserved (will be overwritten).
@ -373,7 +373,7 @@ namespace dwt_cuda {
  /// @param levels  number of recursive DWT levels
  void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
    // select right width of kernel for the size of the image
-	
+
    if(sizeX >= 960) {
      launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
    } else if (sizeX >= 480) {
@ -381,20 +381,20 @@ namespace dwt_cuda {
    } else {
      launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
    }
-    
+
    // if this was not the last level, continue recursively with other levels
    if(levels > 1) {
      // copy output's LL band back into input buffer
-      const int llSizeX = divRndUp(sizeX, 2); 
+      const int llSizeX = divRndUp(sizeX, 2);
      const int llSizeY = divRndUp(sizeY, 2);
 	 // printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY);
      memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238
-      
+
      // run remaining levels of FDWT
      fdwt53(in, out, llSizeX, llSizeY, levels - 1);
    }
  }
-  
+
-  
+
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/fdwt97.cu
+++ b/examples/dwt2d/dwt_cuda/fdwt97.cu
@ -1,4 +1,4 @@
-/// 
+///
 /// @file    fdwt97.cu
 /// @brief   CUDA implementation of forward 9/7 2D DWT.
 /// @author  Martin Jirman (207962@mail.muni.cz)
@ -7,16 +7,16 @@
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -38,8 +38,8 @@
 namespace dwt_cuda {
- 
+
-  
+
  /// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
  /// of specified size. Template arguments specify this size.
  /// @tparam WIN_SIZE_X  width of sliding window
@ -62,8 +62,8 @@ namespace dwt_cuda {
    template <bool CHECKED>
    struct FDWT97ColumnLoadingInfo {
      /// Loader of pixels from some input image.
-      VerticalDWTPixelLoader<float, CHECKED> loader;  
+      VerticalDWTPixelLoader<float, CHECKED> loader;
-      
+
      /// Offset of column loaded by loader. (Offset in shared buffer.)
      int offset;
    };
@ -103,7 +103,7 @@ namespace dwt_cuda {
    /// @param firstY       index of first row to be loaded from image
    template <bool CHECKED>
    __device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
-                              const int columnIndex, const float * const input, 
+                              const int columnIndex, const float * const input,
                              const int sizeX, const int sizeY,
                              const int firstY) {
      // get offset of the column with index 'columnIndex'
@ -113,7 +113,7 @@ namespace dwt_cuda {
      // x-coordinate of the first pixel to be loaded by given loader
      const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;
-      
+
      if(blockIdx.y == 0) {
        // topmost block - apply mirroring rules when loading first 7 rows
        column.loader.init(sizeX, sizeY, firstX, firstY);
@ -162,7 +162,7 @@ namespace dwt_cuda {
    /// @tparam CHECK_WRITES  true if boundaries should be checked when writing
    /// @param in        input image
    /// @param out       output buffer
-    /// @param sizeX     width of the input image 
+    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param winSteps  number of steps of sliding window
    template <bool CHECK_LOADS, bool CHECK_WRITES>
@ -205,7 +205,7 @@ namespace dwt_cuda {
      // transform buffer offset of column transformed and saved by this thread
      const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
-      // (Each iteration of this loop assumes that first 7 rows of transform 
+      // (Each iteration of this loop assumes that first 7 rows of transform
      // buffer are already loaded with horizontally transformed coefficients.)
      for(int w = 0; w < winSteps; w++) {
        // Load another WIN_SIZE_Y lines of thread's column into the buffer.
@ -220,7 +220,7 @@ namespace dwt_cuda {
        horizontalFDWT97(WIN_SIZE_Y, 7);
        // Using 7 registers, remember current values of last 7 rows of
-        // transform buffer. These rows are transformed horizontally only 
+        // transform buffer. These rows are transformed horizontally only
        // and will be used in next iteration.
        float last7Lines[7];
        for(int i = 0; i < 7; i++) {
@ -249,7 +249,7 @@ namespace dwt_cuda {
        // As expected, these lines are already horizontally transformed.
        for(int i = 0; i < 7; i++) {
          buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
-      
+
        }
        // Wait for all writing threads before proceeding to loading new
@ -259,15 +259,15 @@ namespace dwt_cuda {
      }
    }
-    
+
-    
+
  public:
    /// Runs one of specialized variants of 9/7 FDWT according to distance of
-    /// processed pixels to image boudnary. Some variants do not check for 
+    /// processed pixels to image boudnary. Some variants do not check for
    /// boudnary and thus are slightly faster.
    /// @param in     input image
    /// @param out    output buffer
-    /// @param sx     width of the input image 
+    /// @param sx     width of the input image
    /// @param sy     height of the input image
    /// @param steps  number of steps of sliding window
    __device__ static void run(const float * const input, float * const output,
@ -299,15 +299,15 @@ namespace dwt_cuda {
        fdwt97.transform<false, false>(input, output, sx, sy, steps);
      }
    }
-    
+
  }; // end of class FDWT97
-  
+
-  
+
-    
+
  /// Main GPU 9/7 FDWT entry point.
  /// @param input   input image
  /// @parma output  output buffer
-  /// @param sx      width of the input image 
+  /// @param sx      width of the input image
  /// @param sy      height of the input image
  /// @param steps   number of steps of sliding window
  template <int WIN_SX, int WIN_SY>
@ -321,21 +321,21 @@ namespace dwt_cuda {
    FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
  }
-  
+
-  
+
-  /// Only computes optimal number of sliding window steps, 
+  /// Only computes optimal number of sliding window steps,
  /// number of threadblocks and then lanches the 9/7 FDWT kernel.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
-  /// @param sx       width of the input image 
+  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
    // compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);
-    
+
    // prepare grid size
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
    printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
@ -346,11 +346,11 @@ namespace dwt_cuda {
    PERF_END("        FDWT97", sx, sy)
    CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
  }
-  
+
-  
+
-  
+
  /// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
-  /// @param in      Input DWT coefficients. Should be normalized (in range 
+  /// @param in      Input DWT coefficients. Should be normalized (in range
  ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
  /// @param out     output buffer on GPU - format specified in common rules
  /// @param sizeX   width of input image (in pixels)
@ -365,19 +365,19 @@ namespace dwt_cuda {
    } else {
      launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
    }
-    
+
    // if this was not the last level, continue recursively with other levels
    if(levels > 1) {
      // copy output's LL band back into input buffer
      const int llSizeX = divRndUp(sizeX, 2);
      const int llSizeY = divRndUp(sizeY, 2);
      memCopy(in, out, llSizeX, llSizeY);
-      
+
      // run remaining levels of FDWT
      fdwt97(in, out, llSizeX, llSizeY, levels - 1);
    }
  }
-  
+
-  
+
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/io.h
+++ b/examples/dwt2d/dwt_cuda/io.h
@ -3,20 +3,20 @@
 /// @brief   Manages loading and saving lineary stored bands and input images.
 /// @author  Martin Jirman (207962@mail.muni.cz)
 /// @date    2011-01-20 22:38
-/// 
+///
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -30,454 +30,411 @@
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef IO_H
-#define	IO_H
+#define IO_H
 #include "common.h"
 namespace dwt_cuda {
-  
+/// Base for all IO classes - manages mirroring.
-  /// Base for all IO classes - manages mirroring.
+class DWTIO {
-  class DWTIO {
+protected:
-  protected:
+  /// Handles mirroring of image at edges in a DWT correct way.
-    /// Handles mirroring of image at edges in a DWT correct way.
+  /// @param d      a position in the image (will be replaced by mirrored d)
-    /// @param d      a position in the image (will be replaced by mirrored d)
+  /// @param sizeD  size of the image along the dimension of 'd'
-    /// @param sizeD  size of the image along the dimension of 'd'
+  __device__ static void mirror(int &d, const int &sizeD) {
-    __device__ static void mirror(int & d, const int & sizeD) {
+    // TODO: enable multiple mirroring:
-      // TODO: enable multiple mirroring:
+    //      if(sizeD > 1) {
-//      if(sizeD > 1) {
+    //        if(d < 0) {
-//        if(d < 0) {
+    //          const int underflow = -1 - d;
-//          const int underflow = -1 - d;
+    //          const int phase = (underflow / (sizeD - 1)) & 1;
-//          const int phase = (underflow / (sizeD - 1)) & 1;
+    //          const int remainder = underflow % (sizeD - 1);
-//          const int remainder = underflow % (sizeD - 1);
+    //          if(phase == 0) {
-//          if(phase == 0) {
+    //            d = remainder + 1;
-//            d = remainder + 1;
+    //          } else {
-//          } else {
+    //            d = sizeD - 2 - remainder;
-//            d = sizeD - 2 - remainder;
+    //          }
-//          }
+    //        } else if(d >= sizeD) {
-//        } else if(d >= sizeD) {
+    //          const int overflow = d - sizeD;
-//          const int overflow = d - sizeD;
+    //          const int phase = (overflow / (sizeD - 1)) & 1;
-//          const int phase = (overflow / (sizeD - 1)) & 1;
+    //          const int remainder = overflow % (sizeD - 1);
-//          const int remainder = overflow % (sizeD - 1);
+    //          if(phase == 0) {
-//          if(phase == 0) {
+    //            d = sizeD - 2 - remainder;
-//            d = sizeD - 2 - remainder;
+    //          } else {
-//          } else {
+    //            d = remainder + 1;
-//            d = remainder + 1;
+    //          }
-//          }
+    //        }
-//        }
+    //      } else {
-//      } else {
+    //        d = 0;
-//        d = 0;
+    //      }
-//      }
+    // for test the mirror's use Feb 17
-  //for test the mirror's use Feb 17
+    if (d >= sizeD) {
-      if(d >= sizeD) {
+      d = 2 * sizeD - 2 - d;
-        d = 2 * sizeD - 2 - d;
+    } else if (d < 0) {
-      } else if(d < 0) {
+      d = -d;
        d = -d;
      }
    }
-  };
+  }
 };
 /// Base class for pixel loader and writer - manages computing start index,
 /// stride and end of image for loading column of pixels.
 /// @tparam T        type of image pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
 protected:
  int end;    ///< index of bottom neightbor of last pixel of column
  int stride; ///< increment of pointer to get to next pixel
-  /// Base class for pixel loader and writer - manages computing start index,
+  /// Initializes pixel IO - sets end index and a position of first pixel.
-  /// stride and end of image for loading column of pixels.
+  /// @param sizeX   width of the image
-  /// @tparam T        type of image pixels
+  /// @param sizeY   height of the image
-  /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
+  /// @param firstX  x-coordinate of first pixel to use
-  template <typename T, bool CHECKED>
+  /// @param firstY  y-coordinate of first pixel to use
-  class VerticalDWTPixelIO : protected DWTIO {
+  /// @return index of pixel at position [x, y] in the image
-  protected:
+  __device__ int initialize(const int sizeX, const int sizeY, int firstX,
-    int end;         ///< index of bottom neightbor of last pixel of column
+                            int firstY) {
-    int stride;      ///< increment of pointer to get to next pixel
+    // initialize all pointers and stride
    end = CHECKED ? (sizeY * sizeX + firstX) : 0;
    stride = sizeX;
    return firstX + sizeX * firstY;
  }
 };
-    /// Initializes pixel IO - sets end index and a position of first pixel.
+/// Writes reverse transformed pixels directly into output image.
-    /// @param sizeX   width of the image
+/// @tparam T        type of output pixels
-    /// @param sizeY   height of the image
+/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
-    /// @param firstX  x-coordinate of first pixel to use
+template <typename T, bool CHECKED>
-    /// @param firstY  y-coordinate of first pixel to use
+class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
-    /// @return index of pixel at position [x, y] in the image
+private:
-    __device__ int initialize(const int sizeX, const int sizeY,
+  int next; // index of the next pixel to be loaded
                              int firstX, int firstY) {
      // initialize all pointers and stride
      end = CHECKED ? (sizeY * sizeX + firstX) : 0;
      stride = sizeX;
      return firstX + sizeX * firstY;
    }
  };
-
+public:
-
+  /// Initializes writer - sets output buffer and a position of first pixel.
-  /// Writes reverse transformed pixels directly into output image.
+  /// @param sizeX   width of the image
-  /// @tparam T        type of output pixels
+  /// @param sizeY   height of the image
-  /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
+  /// @param firstX  x-coordinate of first pixel to write into
-  template <typename T, bool CHECKED>
+  /// @param firstY  y-coordinate of first pixel to write into
-  class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
+  __device__ void init(const int sizeX, const int sizeY, int firstX,
-  private:
+                       int firstY) {
-    int next;   // index of the next pixel to be loaded
+    if (firstX < sizeX) {
-
+      next = this->initialize(sizeX, sizeY, firstX, firstY);
-  public:
+    } else {
    /// Initializes writer - sets output buffer and a position of first pixel.
    /// @param sizeX   width of the image
    /// @param sizeY   height of the image
    /// @param firstX  x-coordinate of first pixel to write into
    /// @param firstY  y-coordinate of first pixel to write into
    __device__ void init(const int sizeX, const int sizeY, 
                         int firstX, int firstY) {
      if(firstX < sizeX) {
        next = this->initialize(sizeX, sizeY, firstX, firstY);
      } else {
        this->end = 0;
        this->stride = 0;
        next = 0;
      }
    }
    /// Writes given value at next position and advances internal pointer while
    /// correctly handling mirroring.
    /// @param output  output image to write pixel into
    /// @param value   value of the pixel to be written
    __device__ void writeInto(T * const output, const T & value) {
      if((!CHECKED) || (next != this->end)) {
        output[next] = value;
        next += this->stride;
      }
    }
  };
  /// Loads pixels from input image.
  /// @tparam T        type of image input pixels
  /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
  template <typename T, bool CHECKED>
  class VerticalDWTPixelLoader
          : protected VerticalDWTPixelIO<const T, CHECKED> {
  private:
    int last;  ///< index of last loaded pixel
  public:
  //******************* FOR TEST **********************
  __device__ int getlast(){
 		return last;
 	}
  __device__ int getend(){
 		return this->end;
 	}
  __device__ int getstride(){
 		return this->stride;
 	}
  __device__ void setend(int a){
      this->end=a;
 	}
 	//******************* FOR TEST **********************
    /// Initializes loader - sets input size and a position of first pixel.
    /// @param sizeX   width of the image
    /// @param sizeY   height of the image
    /// @param firstX  x-coordinate of first pixel to load
    /// @param firstY  y-coordinate of first pixel to load
    __device__ void init(const int sizeX, const int sizeY,
                         int firstX, int firstY) {
      // correctly mirror x coordinate
      this->mirror(firstX, sizeX);
      // 'last' always points to already loaded pixel (subtract sizeX = stride)
      last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
      //last = (FirstX + sizeX * FirstY) - sizeX
    }
    /// Sets all fields to zeros, for compiler not to complain about
    /// uninitialized stuff.
    __device__ void clear() {
      this->end = 0;
      this->stride = 0;
-      this->last = 0;
+      next = 0;
    }
  }
  /// Writes given value at next position and advances internal pointer while
  /// correctly handling mirroring.
  /// @param output  output image to write pixel into
  /// @param value   value of the pixel to be written
  __device__ void writeInto(T *const output, const T &value) {
    if ((!CHECKED) || (next != this->end)) {
      output[next] = value;
      next += this->stride;
    }
  }
 };
 /// Loads pixels from input image.
 /// @tparam T        type of image input pixels
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
 class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
 private:
  int last; ///< index of last loaded pixel
 public:
  //******************* FOR TEST **********************
  __device__ int getlast() { return last; }
  __device__ int getend() { return this->end; }
  __device__ int getstride() { return this->stride; }
  __device__ void setend(int a) { this->end = a; }
  //******************* FOR TEST **********************
  /// Initializes loader - sets input size and a position of first pixel.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to load
  /// @param firstY  y-coordinate of first pixel to load
  __device__ void init(const int sizeX, const int sizeY, int firstX,
                       int firstY) {
    // correctly mirror x coordinate
    this->mirror(firstX, sizeX);
    // 'last' always points to already loaded pixel (subtract sizeX = stride)
    last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
    // last = (FirstX + sizeX * FirstY) - sizeX
  }
  /// Sets all fields to zeros, for compiler not to complain about
  /// uninitialized stuff.
  __device__ void clear() {
    this->end = 0;
    this->stride = 0;
    this->last = 0;
  }
  /// Gets another pixel and advancees internal pointer to following one.
  /// @param input  input image to load next pixel from
  /// @return next pixel from given image
  __device__ T loadFrom(const T *const input) {
    last += this->stride;
    if (CHECKED && (last == this->end)) {
      last -= 2 * this->stride;
      this->stride = -this->stride; // reverse loader's direction
    }
    // avoid reading from negative indices if loader is checked
    // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
    // checked variant later
    if (last < 0) {
      return 0;
    }
-    /// Gets another pixel and advancees internal pointer to following one.
+    return input[last];
-    /// @param input  input image to load next pixel from
+    // return this->end;
-    /// @return next pixel from given image
+    // return last;
-    __device__ T loadFrom(const T * const input) {
+    // return this->stride;
-      last += this->stride;
+  }
-      if(CHECKED && (last == this->end)) {
+};
        last -= 2 * this->stride;
        this->stride = -this->stride; // reverse loader's direction
      }
      // avoid reading from negative indices if loader is checked
      // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
      if(last < 0 ) {
        return 0;
      }
      return input[last];
      // return this->end;
      // return last;
      // return this->stride;
    }
  };
 /// Base for band write and loader. Manages computing strides and pointers
 /// to first and last pixels in a linearly-stored-bands correct way.
 /// @tparam T        type of band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
 protected:
  /// index of bottom neighbor of last pixel of loaded column
  int end;
  /// increment of index to get from highpass band to the lowpass one
  int strideHighToLow;
-  /// Base for band write and loader. Manages computing strides and pointers
+  /// increment of index to get from the lowpass band to the highpass one
-  /// to first and last pixels in a linearly-stored-bands correct way.
+  int strideLowToHigh;
  /// @tparam T        type of band coefficients
  /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
  template <typename T, bool CHECKED>
  class VerticalDWTBandIO : protected DWTIO {
  protected:
    /// index of bottom neighbor of last pixel of loaded column
    int end;
    /// increment of index to get from highpass band to the lowpass one
    int strideHighToLow;
    /// increment of index to get from the lowpass band to the highpass one
    int strideLowToHigh;
-    /// Initializes IO - sets size of image and a position of first pixel.
+  /// Initializes IO - sets size of image and a position of first pixel.
-    /// @param imageSizeX   width of the image
+  /// @param imageSizeX   width of the image
-    /// @param imageSizeY   height of the image
+  /// @param imageSizeY   height of the image
-    /// @param firstX       x-coordinate of first pixel to use
+  /// @param firstX       x-coordinate of first pixel to use
-    ///                     (Parity determines vertically low or high band.)
+  ///                     (Parity determines vertically low or high band.)
-    /// @param firstY       y-coordinate of first pixel to use
+  /// @param firstY       y-coordinate of first pixel to use
-    ///                     (Parity determines horizontally low or high band.)
+  ///                     (Parity determines horizontally low or high band.)
-    /// @return index of first item specified by firstX and firstY
+  /// @return index of first item specified by firstX and firstY
-    __device__ int initialize(const int imageSizeX, const int imageSizeY,
+  __device__ int initialize(const int imageSizeX, const int imageSizeY,
-                              int firstX, int firstY) {
+                            int firstX, int firstY) {
-      // index of first pixel (topmost one) of the column with index firstX
+    // index of first pixel (topmost one) of the column with index firstX
-      int columnOffset = firstX / 2;
+    int columnOffset = firstX / 2;
      // difference between indices of two vertically neighboring pixels
      // in the same band
      int verticalStride;
      // resolve index of first pixel according to horizontal parity
      if(firstX & 1) {
        // first pixel in one of right bands
        verticalStride = imageSizeX / 2;
        columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
        strideLowToHigh = (imageSizeX * imageSizeY) / 2;
      } else {
        // first pixel in one of left bands
        verticalStride = imageSizeX / 2 + (imageSizeX & 1);
        strideLowToHigh = divRndUp(imageSizeY, 2)  * imageSizeX;
      }
      // set the other stride
      strideHighToLow = verticalStride - strideLowToHigh;
-      // compute index of coefficient which indicates end of image
+    // difference between indices of two vertically neighboring pixels
-      if(CHECKED) {
+    // in the same band
-        end = columnOffset                            // right column
+    int verticalStride;
                + (imageSizeY / 2) * verticalStride   // right row
                + (imageSizeY & 1) * strideLowToHigh; // possibly in high band
      } else {
        end = 0;
      }
-
+    // resolve index of first pixel according to horizontal parity
-	//***********for test**************
+    if (firstX & 1) {
-	//	end = CHECKED;
+      // first pixel in one of right bands
-	//***********for test**************
+      verticalStride = imageSizeX / 2;
-	
+      columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
-	
+      strideLowToHigh = (imageSizeX * imageSizeY) / 2;
-      // finally, return index of the first item
+    } else {
-      return columnOffset                        // right column
+      // first pixel in one of left bands
-              + (firstY / 2) * verticalStride    // right row
+      verticalStride = imageSizeX / 2 + (imageSizeX & 1);
-              + (firstY & 1) * strideLowToHigh;  // possibly in high band
+      strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
    }
  };
  /// Directly loads coefficients from four consecutively stored transformed
  /// bands.
  /// @tparam T        type of input band coefficients
  /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
  template <typename T, bool CHECKED>
  class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
  private:
    int last;  ///< index of last loaded pixel
    /// Checks internal index and possibly reverses direction of loader.
    /// (Handles mirroring at the bottom of the image.)
    /// @param input   input image to load next coefficient from
    /// @param stride  stride to use now (one of two loader's strides)
    /// @return loaded coefficient
    __device__ T updateAndLoad(const T * const input, const int & stride) {
      last += stride;
      if(CHECKED && (last == this->end)) {
        // undo last two updates of index (to get to previous mirrored item)
        last -= (this->strideLowToHigh + this->strideHighToLow);
        // swap and reverse strides (to move up in the loaded column now)
        const int temp = this->strideLowToHigh;
        this->strideLowToHigh = -this->strideHighToLow;
        this->strideHighToLow = -temp;
      }
      if(last < 0 ) {
        return 0;
      }
      // avoid reading from negative indices if loader is checked
      // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this checked variant later
      return input[last];
    }
  public:
    /// Initializes loader - sets input size and a position of first pixel.
    /// @param imageSizeX   width of the image
    /// @param imageSizeY   height of the image
    /// @param firstX       x-coordinate of first pixel to load
    ///                     (Parity determines vertically low or high band.)
    /// @param firstY       y-coordinate of first pixel to load
    ///                     (Parity determines horizontally low or high band.)
    __device__ void init(const int imageSizeX, const int imageSizeY,
                         int firstX, const int firstY) {
      this->mirror(firstX, imageSizeX);
      last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
      // adjust to point to previous item
      last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow; 
    }
    /// Sets all fields to zeros, for compiler not to complain about
    /// uninitialized stuff.
    __device__ void clear() {
      this->end = 0;
      this->strideHighToLow = 0;
      this->strideLowToHigh = 0;
      this->last = 0;
    }
-    /// Gets another coefficient from lowpass band and advances internal index.
+    // set the other stride
-    /// Call this method first if position of first pixel passed to init
+    strideHighToLow = verticalStride - strideLowToHigh;
-    /// was in high band.
+
-    /// @param input   input image to load next coefficient from
+    // compute index of coefficient which indicates end of image
-    /// @return next coefficient from the lowpass band of the given image
+    if (CHECKED) {
-    __device__ T loadLowFrom(const T * const input) {
+      end = columnOffset                          // right column
-      return updateAndLoad(input, this->strideHighToLow);
+            + (imageSizeY / 2) * verticalStride   // right row
            + (imageSizeY & 1) * strideLowToHigh; // possibly in high band
    } else {
      end = 0;
    }
-    /// Gets another coefficient from the highpass band and advances index.
+    //***********for test**************
-    /// Call this method first if position of first pixel passed to init
+    //	end = CHECKED;
-    /// was in high band.
+    //***********for test**************
-    /// @param input   input image to load next coefficient from
+
-    /// @return next coefficient from the highbass band of the given image
+    // finally, return index of the first item
-    __device__ T loadHighFrom(const T * const input) {
+    return columnOffset                      // right column
-      return updateAndLoad(input, this->strideLowToHigh);
+           + (firstY / 2) * verticalStride   // right row
           + (firstY & 1) * strideLowToHigh; // possibly in high band
  }
 };
 /// Directly loads coefficients from four consecutively stored transformed
 /// bands.
 /// @tparam T        type of input band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
 class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
 private:
  int last; ///< index of last loaded pixel
  /// Checks internal index and possibly reverses direction of loader.
  /// (Handles mirroring at the bottom of the image.)
  /// @param input   input image to load next coefficient from
  /// @param stride  stride to use now (one of two loader's strides)
  /// @return loaded coefficient
  __device__ T updateAndLoad(const T *const input, const int &stride) {
    last += stride;
    if (CHECKED && (last == this->end)) {
      // undo last two updates of index (to get to previous mirrored item)
      last -= (this->strideLowToHigh + this->strideHighToLow);
      // swap and reverse strides (to move up in the loaded column now)
      const int temp = this->strideLowToHigh;
      this->strideLowToHigh = -this->strideHighToLow;
      this->strideHighToLow = -temp;
    }
-
+    if (last < 0) {
-  };
+      return 0;
  /// Directly saves coefficients into four transformed bands.
  /// @tparam T        type of output band coefficients
  /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
  template <typename T, bool CHECKED>
  class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
  private:
    int next;  ///< index of last loaded pixel
    /// Checks internal index and possibly stops the writer.
    /// (Handles mirroring at edges of the image.)
    /// @param output  output buffer
    /// @param item    item to put into the output
    /// @param stride  increment of the pointer to get to next output index
    __device__ int saveAndUpdate(T * const output, const T & item,
                                  const int & stride) {
 //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){		//test, Mar 20					  
      if((!CHECKED) || (next != this->end)) {
        // if(next == 4) {
        //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
        // }
        output[next] = item;
        next += stride;
      } 
 //	}
      // if((!CHECKED) || (next != this->end)) { //the real one
        // output[next] = item;
        // next += stride;  //stride has been test
      // } 
 	return next;
    }
-  public:
+    // avoid reading from negative indices if loader is checked
    // return (CHECKED && (last < 0)) ? 0 : input[last];  // TODO: use this
    // checked variant later
    return input[last];
  }
-    /// Initializes writer - sets output size and a position of first pixel.
+public:
-    /// @param output       output image
+  /// Initializes loader - sets input size and a position of first pixel.
-    /// @param imageSizeX   width of the image
+  /// @param imageSizeX   width of the image
-    /// @param imageSizeY   height of the image
+  /// @param imageSizeY   height of the image
-    /// @param firstX       x-coordinate of first pixel to write
+  /// @param firstX       x-coordinate of first pixel to load
-    ///                     (Parity determines vertically low or high band.)
+  ///                     (Parity determines vertically low or high band.)
-    /// @param firstY       y-coordinate of first pixel to write
+  /// @param firstY       y-coordinate of first pixel to load
-    ///                     (Parity determines horizontally low or high band.)
+  ///                     (Parity determines horizontally low or high band.)
-    __device__ void init(const int imageSizeX, const int imageSizeY,
+  __device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
-                         const int firstX, const int firstY) {
+                       const int firstY) {
-      if (firstX < imageSizeX) {
+    this->mirror(firstX, imageSizeX);
-        next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
+    last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
      } else {
        clear();
      }
    }
    /// Sets all fields to zeros, for compiler not to complain about
    /// uninitialized stuff.
    __device__ void clear() {
      this->end = 0;
      this->strideHighToLow = 0;
      this->strideLowToHigh = 0;
      this->next = 0;
    }
-    /// Writes another coefficient into the band which was specified using
+    // adjust to point to previous item
-    /// init's firstX and firstY parameters and advances internal pointer.
+    last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
-    /// Call this method first if position of first pixel passed to init
+  }
    /// was in lowpass band.
    /// @param output  output image
    /// @param low     lowpass coefficient to save into the lowpass band
    __device__ int writeLowInto(T * const output, const T & primary) {
      return saveAndUpdate(output, primary, this->strideLowToHigh);
    }
-    /// Writes another coefficient from the other band and advances pointer.
+  /// Sets all fields to zeros, for compiler not to complain about
-    /// Call this method first if position of first pixel passed to init
+  /// uninitialized stuff.
-    /// was in highpass band.
+  __device__ void clear() {
-    /// @param output  output image
+    this->end = 0;
-    /// @param high    highpass coefficient to save into the highpass band
+    this->strideHighToLow = 0;
-    __device__ int writeHighInto(T * const output, const T & other) {
+    this->strideLowToHigh = 0;
-      return saveAndUpdate(output, other, this->strideHighToLow);
+    this->last = 0;
-    }
+  }
  /// Gets another coefficient from lowpass band and advances internal index.
  /// Call this method first if position of first pixel passed to init
  /// was in high band.
  /// @param input   input image to load next coefficient from
  /// @return next coefficient from the lowpass band of the given image
  __device__ T loadLowFrom(const T *const input) {
    return updateAndLoad(input, this->strideHighToLow);
  }
  /// Gets another coefficient from the highpass band and advances index.
  /// Call this method first if position of first pixel passed to init
  /// was in high band.
  /// @param input   input image to load next coefficient from
  /// @return next coefficient from the highbass band of the given image
  __device__ T loadHighFrom(const T *const input) {
    return updateAndLoad(input, this->strideLowToHigh);
  }
 };
 /// Directly saves coefficients into four transformed bands.
 /// @tparam T        type of output band coefficients
 /// @tparam CHECKED  true = be prepared to image boundary, false = don't care
 template <typename T, bool CHECKED>
 class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
 private:
  int next; ///< index of last loaded pixel
  /// Checks internal index and possibly stops the writer.
  /// (Handles mirroring at edges of the image.)
  /// @param output  output buffer
  /// @param item    item to put into the output
  /// @param stride  increment of the pointer to get to next output index
  __device__ int saveAndUpdate(T *const output, const T &item,
                               const int &stride) {
    //	if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
    ////test, Mar 20
    if ((!CHECKED) || (next != this->end)) {
      // if(next == 4) {
      //   printf(" next: %d  stride: %d val: %f \n", next, stride, item );
      // }
      output[next] = item;
      next += stride;
    }
    //	}
    // if((!CHECKED) || (next != this->end)) { //the real one
    // output[next] = item;
    // next += stride;  //stride has been test
    // }
    return next;
  }
 public:
  /// Initializes writer - sets output size and a position of first pixel.
  /// @param output       output image
  /// @param imageSizeX   width of the image
  /// @param imageSizeY   height of the image
  /// @param firstX       x-coordinate of first pixel to write
  ///                     (Parity determines vertically low or high band.)
  /// @param firstY       y-coordinate of first pixel to write
  ///                     (Parity determines horizontally low or high band.)
  __device__ void init(const int imageSizeX, const int imageSizeY,
                       const int firstX, const int firstY) {
    if (firstX < imageSizeX) {
      next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
    } else {
      clear();
    }
  }
  /// Sets all fields to zeros, for compiler not to complain about
  /// uninitialized stuff.
  __device__ void clear() {
    this->end = 0;
    this->strideHighToLow = 0;
    this->strideLowToHigh = 0;
    this->next = 0;
  }
  /// Writes another coefficient into the band which was specified using
  /// init's firstX and firstY parameters and advances internal pointer.
  /// Call this method first if position of first pixel passed to init
  /// was in lowpass band.
  /// @param output  output image
  /// @param low     lowpass coefficient to save into the lowpass band
  __device__ int writeLowInto(T *const output, const T &primary) {
    return saveAndUpdate(output, primary, this->strideLowToHigh);
  }
  /// Writes another coefficient from the other band and advances pointer.
  /// Call this method first if position of first pixel passed to init
  /// was in highpass band.
  /// @param output  output image
  /// @param high    highpass coefficient to save into the highpass band
  __device__ int writeHighInto(T *const output, const T &other) {
    return saveAndUpdate(output, other, this->strideHighToLow);
  }
  //*******Add three functions to get private values*******
  __device__ int getnext() { return next; }
  __device__ int getend() { return this->end; }
  __device__ int getstrideHighToLow() { return this->strideHighToLow; }
  __device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
  //*******Add three functions to get private values*******
 };
 	//*******Add three functions to get private values*******
 	__device__ int getnext(){
 		return next;
 	}
 	__device__ int getend(){
 		return this->end;
 	}
 	__device__ int getstrideHighToLow(){
 		return this->strideHighToLow;
 	}
 	__device__ int getstrideLowToHigh(){
 		return this->strideLowToHigh;
 	}
 	//*******Add three functions to get private values*******
  };
 } // namespace dwt_cuda
-
+#endif // IO_H
 #endif	// IO_H
--- a/examples/dwt2d/dwt_cuda/rdwt53.cu
+++ b/examples/dwt2d/dwt_cuda/rdwt53.cu
@ -1,4 +1,4 @@
-/// 
+///
 /// @file    rdwt53.cu
 /// @brief   CUDA implementation of reverse 5/3 2D DWT.
 /// @author  Martin Jirman (207962@mail.muni.cz)
@ -7,16 +7,16 @@
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -38,7 +38,7 @@
 namespace dwt_cuda {
-  
+
  /// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT
  /// using sliding window and lifting schema.
@ -46,8 +46,8 @@ namespace dwt_cuda {
  /// @tparam WIN_SIZE_Y  height of sliding window
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class RDWT53 {
-  private: 
+  private:
-    
+
    /// Shared memory buffer used for 5/3 DWT transforms.
    typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
@ -64,10 +64,10 @@ namespace dwt_cuda {
    struct RDWT53Column {
      /// loader of pixels from column in input image
      VerticalDWTBandLoader<int, CHECKED> loader;
-      
+
      /// Offset of corresponding column in shared buffer.
      int offset;
-      
+
      /// Sets all fields to some values to avoid 'uninitialized' warnings.
      __device__ void clear() {
        offset = 0;
@ -128,7 +128,7 @@ namespace dwt_cuda {
    /// @param sizeY     height of the input image
    /// @param loader    (uninitialized) info about loaded column
    template <bool CHECKED>
-    __device__ void initColumn(const int columnX, const int * const input, 
+    __device__ void initColumn(const int columnX, const int * const input,
                               const int sizeX, const int sizeY,
                               RDWT53Column<CHECKED> & column,
                               const int firstY) {
@ -162,7 +162,7 @@ namespace dwt_cuda {
    /// @tparam CHECKED_WRITES  true if boundaries must be checked when writing
    /// @param in        input image (5/3 transformed coefficients)
    /// @param out       output buffer (for reverse transformed image)
-    /// @param sizeX     width of the output image 
+    /// @param sizeX     width of the output image
    /// @param sizeY     height of the output image
    /// @param winSteps  number of sliding window steps
    template<bool CHECKED_LOADS, bool CHECKED_WRITES>
@ -182,7 +182,7 @@ namespace dwt_cuda {
        // column #0, thread #1 get right column #1 and thread #2 left column.
        const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
-        // Thread initializes offset of the boundary column (in shared 
+        // Thread initializes offset of the boundary column (in shared
        // buffer), first 3 pixels of the column and a loader for this column.
        initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
      }
@ -216,8 +216,8 @@ namespace dwt_cuda {
        // horizontally transform all newly loaded lines
        horizontalTransform(WIN_SIZE_Y, 3);
-        // Using 3 registers, remember current values of last 3 rows 
+        // Using 3 registers, remember current values of last 3 rows
-        // of transform buffer. These rows are transformed horizontally 
+        // of transform buffer. These rows are transformed horizontally
        // only and will be used in next iteration.
        int last3Lines[3];
        last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
@ -253,7 +253,7 @@ namespace dwt_cuda {
    /// Main GPU 5/3 RDWT entry point.
    /// @param in     input image (5/3 transformed coefficients)
    /// @param out    output buffer (for reverse transformed image)
-    /// @param sizeX  width of the output image 
+    /// @param sizeX  width of the output image
    /// @param sizeY  height of the output image
    /// @param winSteps  number of sliding window steps
    __device__ static void run(const int * const input, int * const output,
@ -284,13 +284,13 @@ namespace dwt_cuda {
    }
  }; // end of class RDWT53
-  
+
-  
+
-  
+
  /// Main GPU 5/3 RDWT entry point.
  /// @param in     input image (5/3 transformed coefficients)
  /// @param out    output buffer (for reverse transformed image)
-  /// @param sizeX  width of the output image 
+  /// @param sizeX  width of the output image
  /// @param sizeY  height of the output image
  /// @param winSteps  number of sliding window steps
  template <int WIN_SX, int WIN_SY>
@ -299,34 +299,34 @@ namespace dwt_cuda {
                               const int sx, const int sy, const int steps) {
    RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
  }
-  
+
-  
+
-  
+
-  /// Only computes optimal number of sliding window steps, 
+  /// Only computes optimal number of sliding window steps,
  /// number of threadblocks and then lanches the 5/3 RDWT kernel.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
-  /// @param sx       width of the input image 
+  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
    // compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);
-    
+
    // prepare grid size
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
-    
+
    // finally transform this level
    PERF_BEGIN
    rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    PERF_END("        RDWT53", sx, sy)
    CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
  }
-    
+
-  
+
-  
+
  /// Reverse 5/3 2D DWT. See common rules (above) for more details.
  /// @param in      Input DWT coefficients. Format described in common rules.
  ///                Will not be preserved (will be overwritten).
@ -341,11 +341,11 @@ namespace dwt_cuda {
      const int llSizeX = divRndUp(sizeX, 2);
      const int llSizeY = divRndUp(sizeY, 2);
      rdwt53(in, out, llSizeX, llSizeY, levels - 1);
-      
+
      // copy reverse transformed LL band from output back into the input
      memCopy(in, out, llSizeX, llSizeY);
    }
-    
+
    // select right width of kernel for the size of the image
    if(sizeX >= 960) {
      launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
@ -355,6 +355,6 @@ namespace dwt_cuda {
      launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
    }
  }
-  
+
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/rdwt97.cu
+++ b/examples/dwt2d/dwt_cuda/rdwt97.cu
@ -1,4 +1,4 @@
-/// 
+///
 /// @file    rdwt97.cu
 /// @brief   CUDA implementation of reverse 9/7 2D DWT.
 /// @author  Martin Jirman (207962@mail.muni.cz)
@ -7,16 +7,16 @@
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -38,7 +38,7 @@
 namespace dwt_cuda {
-  
+
  /// Wraps shared memory buffer and methods for computing 9/7 RDWT using
  /// lifting schema and sliding window.
  /// @tparam WIN_SIZE_X  width of the sliding window
@ -46,7 +46,7 @@ namespace dwt_cuda {
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class RDWT97 {
  private:
-    
+
    /// Info related to loading of one input column.
    /// @tparam CHECKED true if boundary chould be checked,
    ///                 false if there is no near boudnary
@ -54,10 +54,10 @@ namespace dwt_cuda {
    struct RDWT97Column  {
      /// laoder of input pxels for given column.
      VerticalDWTBandLoader<float, CHECKED> loader;
-      
+
      /// Offset of loaded column in shared memory buffer.
      int offset;
-      
+
      /// Sets all fields to some values to avoid 'uninitialized' warnings.
      __device__ void clear() {
        loader.clear();
@ -104,7 +104,7 @@ namespace dwt_cuda {
    /// @param column    (uninitialized) info about loading one column
    /// @param firstY    index of first image row to be transformed
    template <bool CHECKED>
-    __device__ void initColumn(const int colIndex, const float * const input, 
+    __device__ void initColumn(const int colIndex, const float * const input,
                               const int sizeX, const int sizeY,
                               RDWT97Column<CHECKED> & column,
                               const int firstY) {
@ -124,7 +124,7 @@ namespace dwt_cuda {
        buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
        buffer[column.offset + 5 * STRIDE] =
        buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
-        buffer[column.offset + 6 * STRIDE] = 
+        buffer[column.offset + 6 * STRIDE] =
        buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
      } else {
        // non-topmost row - regular loading:
@ -162,7 +162,7 @@ namespace dwt_cuda {
    ///                         when writing into output buffer
    /// @param in        input image (9/7 transformed coefficients)
    /// @param out       output buffer (for reverse transformed image)
-    /// @param sizeX     width of the output image 
+    /// @param sizeX     width of the output image
    /// @param sizeY     height of the output image
    /// @param winSteps  number of steps of sliding window
    template <bool CHECKED_LOADS, bool CHECKED_WRITES>
@ -182,7 +182,7 @@ namespace dwt_cuda {
        // each thread among first 7 ones gets index of one of boundary columns
        const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
-        // Thread initializes offset of the boundary column (in shared  
+        // Thread initializes offset of the boundary column (in shared
        // buffer), first 7 pixels of the column and a loader for this column.
        initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
      }
@ -201,7 +201,7 @@ namespace dwt_cuda {
      // offset of column (in transform buffer) saved by this thread
      const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
-      // (Each iteration assumes that first 7 rows of transform buffer are 
+      // (Each iteration assumes that first 7 rows of transform buffer are
      // already loaded with horizontally transformed pixels.)
      for(int w = 0; w < winSteps; w++) {
        // Load another WIN_SIZE_Y lines of this thread's column
@ -216,8 +216,8 @@ namespace dwt_cuda {
        // horizontally transform all newly loaded lines
        horizontalRDWT97(WIN_SIZE_Y, 7);
-        // Using 7 registers, remember current values of last 7 rows 
+        // Using 7 registers, remember current values of last 7 rows
-        // of transform buffer. These rows are transformed horizontally 
+        // of transform buffer. These rows are transformed horizontally
        // only and will be used in next iteration.
        float last7Lines[7];
        for(int i = 0; i < 7; i++) {
@ -257,13 +257,13 @@ namespace dwt_cuda {
    /// Main GPU 9/7 RDWT entry point.
    /// @param in     input image (9/7 transformed coefficients)
    /// @param out    output buffer (for reverse transformed image)
-    /// @param sizeX  width of the output image 
+    /// @param sizeX  width of the output image
    /// @param sizeY  height of the output image
    __device__ static void run(const float * const input, float * const output,
                               const int sx, const int sy, const int steps) {
      // prepare instance with buffer in shared memory
      __shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;
-      
+
      // Compute limits of this threadblock's block of pixels and use them to
      // determine, whether this threadblock will have to deal with boundary.
      // (3 in next expressions is for radius of impulse response of 9/7 RDWT.)
@ -285,15 +285,15 @@ namespace dwt_cuda {
        rdwt97.transform<false, false>(input, output, sx, sy, steps);
      }
    }
-    
+
  }; // end of class RDWT97
-  
+
-    
+
-  
+
  /// Main GPU 9/7 RDWT entry point.
  /// @param in     input image (9/7 transformed coefficients)
  /// @param out    output buffer (for reverse transformed image)
-  /// @param sizeX  width of the output image 
+  /// @param sizeX  width of the output image
  /// @param sizeY  height of the output image
  template <int WIN_SX, int WIN_SY>
  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
@ -301,34 +301,34 @@ namespace dwt_cuda {
                               const int sx, const int sy, const int steps) {
    RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
  }
-  
+
-  
+
-  
+
-  /// Only computes optimal number of sliding window steps, 
+  /// Only computes optimal number of sliding window steps,
  /// number of threadblocks and then lanches the 9/7 RDWT kernel.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
-  /// @param sx       width of the input image 
+  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
    // compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);
-    
+
    // prepare grid size
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
-    
+
    // finally launch kernel
    PERF_BEGIN
    rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    PERF_END("        RDWT97", sx, sy)
    CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
  }
-  
+
-  
+
-  
+
  /// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
  /// @param in      Input DWT coefficients. Format described in common rules.
  ///                Will not be preserved (will be overwritten).
@ -343,11 +343,11 @@ namespace dwt_cuda {
      const int llSizeX = divRndUp(sizeX, 2);
      const int llSizeY = divRndUp(sizeY, 2);
      rdwt97(in, out, llSizeX, llSizeY, levels - 1);
-      
+
      // copy reverse transformed LL band from output back into the input
      memCopy(in, out, llSizeX, llSizeY);
    }
-    
+
    // select right width of kernel for the size of the image
    if(sizeX >= 960) {
      launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
@ -357,7 +357,7 @@ namespace dwt_cuda {
      launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
    }
  }
-  
+
 } // end of namespace dwt_cuda
--- a/examples/dwt2d/dwt_cuda/transform_buffer.h
+++ b/examples/dwt2d/dwt_cuda/transform_buffer.h
@ -7,16 +7,16 @@
 ///
 /// Copyright (c) 2011 Martin Jirman
 /// All rights reserved.
-/// 
+///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions are met:
-/// 
+///
 ///     * Redistributions of source code must retain the above copyright
 ///       notice, this list of conditions and the following disclaimer.
 ///     * Redistributions in binary form must reproduce the above copyright
 ///       notice, this list of conditions and the following disclaimer in the
 ///       documentation and/or other materials provided with the distribution.
-/// 
+///
 /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -30,344 +30,309 @@
 /// POSSIBILITY OF SUCH DAMAGE.
 ///
 #ifndef TRANSFORM_BUFFER_H
-#define	TRANSFORM_BUFFER_H
+#define TRANSFORM_BUFFER_H
 namespace dwt_cuda {
  /// Buffer (in shared memory of GPU) where block of input image is stored,
  /// but odd and even lines are separated. (Generates less bank conflicts when 
  /// using lifting schema.) All operations expect SIZE_X threads.
  /// Also implements basic building blocks of lifting schema.
  /// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
  ///                     a number of threads participating on all operations.)
  ///                     Must be divisible by 4.
  /// @tparam SIZE_Y      height of buffer (total number of lines)
  /// @tparam BOUNDARY_X  number of extra pixels at the left and right side
  ///                     boundary is expected to be smaller than half SIZE_X
  ///                     Must be divisible by 2.
  template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
  class TransformBuffer {
  public:
    enum {
      /// difference between pointers to two vertical neigbors
      VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
    };
  private:
    enum {
      /// number of shared memory banks - needed for correct padding
      #ifdef __CUDA_ARCH__
      SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
      #else
      SHM_BANKS = 16,  // for host code only - can be anything, won't be used
      #endif
      /// size of one of two buffers (odd or even)
      BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
      /// unused space between two buffers
      PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
      /// offset of the odd columns buffer from the beginning of data buffer
      ODD_OFFSET = BUFFER_SIZE + PADDING,
    };
-    /// buffer for both even and odd columns
+/// Buffer (in shared memory of GPU) where block of input image is stored,
-    T data[2 * BUFFER_SIZE + PADDING];
+/// but odd and even lines are separated. (Generates less bank conflicts when
-    
+/// using lifting schema.) All operations expect SIZE_X threads.
-    
+/// Also implements basic building blocks of lifting schema.
-    
+/// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
-    /// Applies specified function to all central elements while also passing
+///                     a number of threads participating on all operations.)
-    /// previous and next elements as parameters.
+///                     Must be divisible by 4.
-    /// @param count         count of central elements to apply function to
+/// @tparam SIZE_Y      height of buffer (total number of lines)
-    /// @param prevOffset    offset of first central element
+/// @tparam BOUNDARY_X  number of extra pixels at the left and right side
-    /// @param midOffset     offset of first central element's predecessor
+///                     boundary is expected to be smaller than half SIZE_X
-    /// @param nextOffset    offset of first central element's successor
+///                     Must be divisible by 2.
-    /// @param function      the function itself
+template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
-    template <typename FUNC>
+class TransformBuffer {
-    __device__ void horizontalStep(const int count, const int prevOffset, 
+public:
-                                   const int midOffset, const int nextOffset,
+  enum {
-                                   const FUNC & function) {
+    /// difference between pointers to two vertical neigbors
-      // number of unchecked iterations
+    VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
-      const int STEPS = count / SIZE_X;
+  };
-      
+
-      // items remaining after last unchecked iteration
+private:
-      const int finalCount = count % SIZE_X; 
+  enum {
-      
+/// number of shared memory banks - needed for correct padding
-      // offset of items processed in last (checked) iteration
+#ifdef __CUDA_ARCH__
-      const int finalOffset = count - finalCount;  
+    SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
-      
+#else
-      // all threads perform fixed number of iterations ...
+    SHM_BANKS = 16, // for host code only - can be anything, won't be used
-      for(int i = 0; i < STEPS; i++) {
+#endif
    /// size of one of two buffers (odd or even)
    BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
    /// unused space between two buffers
    PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
    /// offset of the odd columns buffer from the beginning of data buffer
    ODD_OFFSET = BUFFER_SIZE + PADDING,
  };
  /// buffer for both even and odd columns
  T data[2 * BUFFER_SIZE + PADDING];
  /// Applies specified function to all central elements while also passing
  /// previous and next elements as parameters.
  /// @param count         count of central elements to apply function to
  /// @param prevOffset    offset of first central element
  /// @param midOffset     offset of first central element's predecessor
  /// @param nextOffset    offset of first central element's successor
  /// @param function      the function itself
  template <typename FUNC>
  __device__ void horizontalStep(const int count, const int prevOffset,
                                 const int midOffset, const int nextOffset,
                                 const FUNC &function) {
    // number of unchecked iterations
    const int STEPS = count / SIZE_X;
    // items remaining after last unchecked iteration
    const int finalCount = count % SIZE_X;
    // offset of items processed in last (checked) iteration
    const int finalOffset = count - finalCount;
    // all threads perform fixed number of iterations ...
    for (int i = 0; i < STEPS; i++) {
      // for(int i = 0; i < 3; i++) {
-        const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
+      const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
-        const T next     = data[nextOffset + i * SIZE_X + threadIdx.x];
+      const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
-        T & center       = data[midOffset  + i * SIZE_X + threadIdx.x];
+      T &center = data[midOffset + i * SIZE_X + threadIdx.x];
-        // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
+      // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
-        function(previous, center, next);// the real one
+      function(previous, center, next); // the real one
      }
      // ... but not all threads participate on final iteration
      if(threadIdx.x < finalCount) {
        const T previous = data[prevOffset + finalOffset + threadIdx.x];
        const T next     = data[nextOffset + finalOffset + threadIdx.x];
        T & center = data[midOffset + finalOffset + threadIdx.x];
        // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
        // kaixi
        function(previous, center, next);//the real one
      }
    }
-  public:
+    // ... but not all threads participate on final iteration
-
+    if (threadIdx.x < finalCount) {
-    __device__ void getPrintData() {
+      const T previous = data[prevOffset + finalOffset + threadIdx.x];
-      //
+      const T next = data[nextOffset + finalOffset + threadIdx.x];
-      for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {     
+      T &center = data[midOffset + finalOffset + threadIdx.x];
-          printf(" index: %d  data: %f \n ", i ,data[i]);  
+      // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
-      }
+      // kaixi
-
+      function(previous, center, next); // the real one
   }
    /// Gets offset of the column with given index. Central columns have 
    /// indices from 0 to NUM_LINES - 1, left boundary columns have negative 
    /// indices and right boundary columns indices start with NUM_LINES.
    /// @param columnIndex  index of column to get pointer to
    /// @return  offset of the first item of column with specified index
    __device__ int getColumnOffset(int columnIndex) {
      columnIndex += BOUNDARY_X;             // skip boundary
      return columnIndex / 2                 // select right column
          + (columnIndex & 1) * ODD_OFFSET;  // select odd or even buffer
    }
-    
+  }
    /// Provides access to data of the transform buffer.
    /// @param index  index of the item to work with
    /// @return reference to item at given index
    __device__ T & operator[] (const int index) {
      return data[index];
    }
    /// Applies specified function to all horizontally even elements in 
    /// specified lines. (Including even elements in boundaries except 
    /// first even element in first left boundary.) SIZE_X threads participate 
    /// and synchronization is needed before result can be used.
    /// @param firstLine  index of first line
    /// @param numLines   count of lines
    /// @param func       function to be applied on all even elements
    ///                   parameters: previous (odd) element, the even
    ///                   element itself and finally next (odd) element
    template <typename FUNC>
    __device__ void forEachHorizontalEven(const int firstLine,
                                          const int numLines,
                                          const FUNC & func) {
      // number of even elemens to apply function to
      const int count = numLines * VERTICAL_STRIDE - 1;
      // offset of first even element
      const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
      // offset of odd predecessor of first even element
      const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
      // offset of odd successor of first even element
      const int nextOffset = prevOffset + 1;
-      // if(threadIdx.x == 0) {
+public:
  __device__ void getPrintData() {
    //
    for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
      printf(" index: %d  data: %f \n ", i, data[i]);
    }
  }
-      //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
+  /// Gets offset of the column with given index. Central columns have
-      // }
+  /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
-      
+  /// indices and right boundary columns indices start with NUM_LINES.
-      // call generic horizontal step function
+  /// @param columnIndex  index of column to get pointer to
-      horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
+  /// @return  offset of the first item of column with specified index
-    }
+  __device__ int getColumnOffset(int columnIndex) {
-    
+    columnIndex += BOUNDARY_X;               // skip boundary
-    
+    return columnIndex / 2                   // select right column
-    /// Applies given function to all horizontally odd elements in specified
+           + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
-    /// lines. (Including odd elements in boundaries except last odd element
+  }
    /// in last right boundary.) SIZE_X threads participate and synchronization
    /// is needed before result can be used.
    /// @param firstLine  index of first line
    /// @param numLines   count of lines
    /// @param func       function to be applied on all odd elements
    ///                   parameters: previous (even) element, the odd
    ///                   element itself and finally next (even) element
    template <typename FUNC>
    __device__ void forEachHorizontalOdd(const int firstLine,
                                         const int numLines,
                                         const FUNC & func) {
      // numbet of odd elements to apply function to
      const int count = numLines * VERTICAL_STRIDE - 1;
      // offset of even predecessor of first odd element
      const int prevOffset = firstLine * VERTICAL_STRIDE;
      // offset of first odd element
      const int centerOffset = prevOffset + ODD_OFFSET;
      // offset of even successor of first odd element
      const int nextOffset = prevOffset + 1;
-      //  if(threadIdx.x == 0) {
+  /// Provides access to data of the transform buffer.
-      //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
+  /// @param index  index of the item to work with
-      // }
+  /// @return reference to item at given index
-      
+  __device__ T &operator[](const int index) { return data[index]; }
-      
+
-      // call generic horizontal step function
+  /// Applies specified function to all horizontally even elements in
-      horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
+  /// specified lines. (Including even elements in boundaries except
-    }
+  /// first even element in first left boundary.) SIZE_X threads participate
-    
+  /// and synchronization is needed before result can be used.
-    
+  /// @param firstLine  index of first line
-    /// Applies specified function to all even elements (except element #0)
+  /// @param numLines   count of lines
-    /// of given column. Each thread takes care of one column, so there's 
+  /// @param func       function to be applied on all even elements
-    /// no need for synchronization.
+  ///                   parameters: previous (odd) element, the even
-    /// @param columnOffset  offset of thread's column
+  ///                   element itself and finally next (odd) element
-    /// @param f             function to be applied on all even elements
+  template <typename FUNC>
-    ///                      parameters: previous (odd) element, the even
+  __device__ void forEachHorizontalEven(const int firstLine, const int numLines,
-    ///                      element itself and finally next (odd) element
+                                        const FUNC &func) {
-    template <typename F>
+    // number of even elemens to apply function to
-    __device__ void forEachVerticalEven(const int columnOffset, const F & f) {
+    const int count = numLines * VERTICAL_STRIDE - 1;
-      if(SIZE_Y > 3) { // makes no sense otherwise
+    // offset of first even element
-        const int steps = SIZE_Y / 2 - 1;
+    const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
-        for(int i = 0; i < steps; i++) {
+    // offset of odd predecessor of first even element
-          const int row = 2 + i * 2;
+    const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
-          const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
+    // offset of odd successor of first even element
-          const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
+    const int nextOffset = prevOffset + 1;
-          f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next);
+
-		  
+    // if(threadIdx.x == 0) {
-		  //--------------- FOR TEST -----------------
+
-/*		__syncthreads();
+    //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d
-		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
+    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
-			diffOut[2500]++;
+    // }
-			diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
+
-		}	
+    // call generic horizontal step function
-		__syncthreads();
+    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
-*/		  //--------------- FOR TEST -----------------
+  }
-		  
+
-		  
+  /// Applies given function to all horizontally odd elements in specified
-        }
+  /// lines. (Including odd elements in boundaries except last odd element
-      }
+  /// in last right boundary.) SIZE_X threads participate and synchronization
-    }
+  /// is needed before result can be used.
-    
+  /// @param firstLine  index of first line
-    
+  /// @param numLines   count of lines
-    /// Applies specified function to all odd elements of given column.
+  /// @param func       function to be applied on all odd elements
-    /// Each thread takes care of one column, so there's no need for
+  ///                   parameters: previous (even) element, the odd
-    /// synchronization.
+  ///                   element itself and finally next (even) element
-    /// @param columnOffset  offset of thread's column
+  template <typename FUNC>
-    /// @param f             function to be applied on all odd elements
+  __device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
-    ///                      parameters: previous (even) element, the odd
+                                       const FUNC &func) {
-    ///                      element itself and finally next (even) element
+    // numbet of odd elements to apply function to
-    template <typename F>
+    const int count = numLines * VERTICAL_STRIDE - 1;
-    __device__ void forEachVerticalOdd(const int columnOffset, const F & f) {
+    // offset of even predecessor of first odd element
-      const int steps = (SIZE_Y - 1) / 2;
+    const int prevOffset = firstLine * VERTICAL_STRIDE;
-      for(int i = 0; i < steps; i++) {
+    // offset of first odd element
-        const int row = i * 2 + 1;
+    const int centerOffset = prevOffset + ODD_OFFSET;
    // offset of even successor of first odd element
    const int nextOffset = prevOffset + 1;
    //  if(threadIdx.x == 0) {
    //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d
    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
    // }
    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }
  /// Applies specified function to all even elements (except element #0)
  /// of given column. Each thread takes care of one column, so there's
  /// no need for synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all even elements
  ///                      parameters: previous (odd) element, the even
  ///                      element itself and finally next (odd) element
  template <typename F>
  __device__ void forEachVerticalEven(const int columnOffset, const F &f) {
    if (SIZE_Y > 3) { // makes no sense otherwise
      const int steps = SIZE_Y / 2 - 1;
      for (int i = 0; i < steps; i++) {
        const int row = 2 + i * 2;
        const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
        const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
        f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
-		f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
+        //--------------- FOR TEST -----------------
-		
+        /*		__syncthreads();
-		
+                        if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
-		  //--------------- FOR TEST -----------------
+                                diffOut[2500]++;
-/*		__syncthreads();
+                                diffOut[diffOut[2500]] = 2;//data[columnOffset +
-		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
+           row * VERTICAL_STRIDE];
-			diffOut[2500]++;
+                        }
-			diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
+                        __syncthreads();
-		}
+        */		  //--------------- FOR TEST -----------------
 		__syncthreads();
 */		  //--------------- FOR TEST -----------------
      }
    }
-    
+  }
    /// Scales elements at specified lines.
    /// @param evenScale  scaling factor for horizontally even elements
    /// @param oddScale   scaling factor for horizontally odd elements
    /// @param numLines   number of lines, whose elements should be scaled
    /// @param firstLine  index of first line to scale elements in
    __device__ void scaleHorizontal(const T evenScale, const T oddScale,
                                    const int firstLine, const int numLines) {
      const int offset = firstLine * VERTICAL_STRIDE;
      const int count = numLines * VERTICAL_STRIDE;
      const int steps = count / SIZE_X;
      const int finalCount = count % SIZE_X;
      const int finalOffset = count - finalCount;
-      // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
+  /// Applies specified function to all odd elements of given column.
-      
+  /// Each thread takes care of one column, so there's no need for
-      // run iterations, whete all threads participate
+  /// synchronization.
-      for(int i = 0; i < steps; i++) {
+  /// @param columnOffset  offset of thread's column
-        data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
+  /// @param f             function to be applied on all odd elements
-        // if(threadIdx.x + i * SIZE_X + offset == 531) {
+  ///                      parameters: previous (even) element, the odd
-        //   printf("threadidx 531: %d \n", threadIdx.x);
+  ///                      element itself and finally next (even) element
-        // }
+  template <typename F>
-        // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
+  __device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
-        //   printf("threadidx 531: %d \n", threadIdx.x);
+    const int steps = (SIZE_Y - 1) / 2;
-        // }
+    for (int i = 0; i < steps; i++) {
-        data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
+      const int row = i * 2 + 1;
-      }
+      const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
-      
+      const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
      // some threads also finish remaining unscaled items
      if(threadIdx.x < finalCount) {
        data[threadIdx.x + finalOffset + offset] *= evenScale;
        // if(threadIdx.x + finalOffset + offset == 531) {
        //   printf("threadidx 531: %d \n", threadIdx.x);
        // }
        //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
        //   printf("threadidx 531: %d \n", threadIdx.x);
        // }
        data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
      }
      f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
      //--------------- FOR TEST -----------------
      /*		__syncthreads();
                      if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
                              diffOut[2500]++;
                              diffOut[diffOut[2500]] = 1; //data[columnOffset +
         row * VERTICAL_STRIDE];
                      }
                      __syncthreads();
      */		  //--------------- FOR TEST -----------------
    }
-    
+  }
-    
+
-    /// Scales elements in specified column.
+  /// Scales elements at specified lines.
-    /// @param evenScale     scaling factor for vertically even elements
+  /// @param evenScale  scaling factor for horizontally even elements
-    /// @param oddScale      scaling factor for vertically odd elements
+  /// @param oddScale   scaling factor for horizontally odd elements
-    /// @param columnOffset  offset of the column to work with
+  /// @param numLines   number of lines, whose elements should be scaled
-    /// @param numLines      number of lines, whose elements should be scaled
+  /// @param firstLine  index of first line to scale elements in
-    /// @param firstLine     index of first line to scale elements in
+  __device__ void scaleHorizontal(const T evenScale, const T oddScale,
-    __device__ void scaleVertical(const T evenScale, const T oddScale,
+                                  const int firstLine, const int numLines) {
-                                  const int columnOffset, const int numLines,
+    const int offset = firstLine * VERTICAL_STRIDE;
-                                  const int firstLine) {
+    const int count = numLines * VERTICAL_STRIDE;
-      for(int i = firstLine; i < (numLines + firstLine); i++) {
+    const int steps = count / SIZE_X;
-        if(i & 1) {
+    const int finalCount = count % SIZE_X;
-          data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
+    const int finalOffset = count - finalCount;
-        } else {
+
-          data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
+    // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d,
-        }
+    // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
    // finalCount, finalOffset);
    // run iterations, whete all threads participate
    for (int i = 0; i < steps; i++) {
      data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
      // if(threadIdx.x + i * SIZE_X + offset == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
    }
    // some threads also finish remaining unscaled items
    if (threadIdx.x < finalCount) {
      data[threadIdx.x + finalOffset + offset] *= evenScale;
      // if(threadIdx.x + finalOffset + offset == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
    }
  }
  /// Scales elements in specified column.
  /// @param evenScale     scaling factor for vertically even elements
  /// @param oddScale      scaling factor for vertically odd elements
  /// @param columnOffset  offset of the column to work with
  /// @param numLines      number of lines, whose elements should be scaled
  /// @param firstLine     index of first line to scale elements in
  __device__ void scaleVertical(const T evenScale, const T oddScale,
                                const int columnOffset, const int numLines,
                                const int firstLine) {
    for (int i = firstLine; i < (numLines + firstLine); i++) {
      if (i & 1) {
        data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
      } else {
        data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
      }
    }
-	
+  }
 	//****************For Test(Feb23), test inter parameters*************
 	__device__ int getVERTICAL_STRIDE(){
 		return VERTICAL_STRIDE;
 	}
 	__device__ int getSHM_BANKS(){
 		return SHM_BANKS;
 	}
 	__device__ int  getBuffersize(){		
 		return BUFFER_SIZE;
 	}
 	__device__ int getPADDING(){
 		return PADDING;
 	}
 	__device__ int getODD_OFFSET(){
 		return ODD_OFFSET;
 	}
  //****************For Test(Feb23), test inter parameters*************
  __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
  __device__ int getSHM_BANKS() { return SHM_BANKS; }
  __device__ int getBuffersize() { return BUFFER_SIZE; }
  __device__ int getPADDING() { return PADDING; }
  __device__ int getODD_OFFSET() { return ODD_OFFSET; }
-    //****************For Test(Feb23), test inter parameters*************
+  //****************For Test(Feb23), test inter parameters*************
  };  // end of class TransformBuffer
 }; // end of class TransformBuffer
 } // namespace dwt_cuda
-
+#endif // TRANSFORM_BUFFER_H
 #endif	// TRANSFORM_BUFFER_H
--- a/examples/dwt2d/main.cu
+++ b/examples/dwt2d/main.cu
@ -1,16 +1,16 @@
-/* 
+/*
 * Copyright (c) 2009, Jiri Matela
 * All rights reserved.
- * 
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
- * 
+ *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
- * 
+ *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -54,7 +54,7 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
    // printf("Loading ipnput: %s\n", srcFilename);
    char *path = "../../data/dwt2d/";
    char *newSrc = NULL;
-    
+
    if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
    {
        newSrc[0] = '\0';
@ -67,7 +67,7 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
    //srcFilename = strcat("../../data/dwt2d/",srcFilename);
    //read image
    int i = open(srcFilename, O_RDONLY, 0644);
-    if (i == -1) { 
+    if (i == -1) {
        error(0,errno,"cannot access %s", srcFilename);
        return -1;
    }
@ -97,18 +97,18 @@ template <typename T>
 void processDWT(struct dwt *d, int forward, int writeVisual)
 {
    int componentSize = d->pixWidth*d->pixHeight*sizeof(T);
-    
+
    T *c_r_out, *backup ;
    cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size
    cudaCheckError("Alloc device memory");
    cudaMemset(c_r_out, 0, componentSize);
    cudaCheckError("Memset device memory");
-    
+
    cudaMalloc((void**)&backup, componentSize); //< aligned component size
    cudaCheckError("Alloc device memory");
    cudaMemset(backup, 0, componentSize);
    cudaCheckError("Memset device memory");
-	
+
    if (d->components == 3) {
        /* Alloc two more buffers for G and B */
        T *c_g_out, *c_b_out;
@ -116,12 +116,12 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
        cudaCheckError("Alloc device memory");
        cudaMemset(c_g_out, 0, componentSize);
        cudaCheckError("Memset device memory");
-        
+
        cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size
        cudaCheckError("Alloc device memory");
        cudaMemset(c_b_out, 0, componentSize);
        cudaCheckError("Memset device memory");
-        
+
        /* Load components */
        T *c_r, *c_g, *c_b;
        cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size
@ -140,13 +140,13 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
        cudaCheckError("Memset device memory");
        rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);
-		
+
        /* Compute DWT and always store into file */
        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
        nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
        nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-     
+
        // -------test----------
        // T *h_r_out=(T*)malloc(componentSize);
 		// cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost);
@ -156,13 +156,13 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
 			// if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
        // }
        // -------test----------
-        
+
-		
+
        /* Store DWT to file */
        writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
        // writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
        // writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
-#ifdef OUTPUT        
+#ifdef OUTPUT
        if (writeVisual) {
            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
            writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
@ -186,7 +186,7 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
        cudaFree(c_b_out);
        cudaCheckError("Cuda free");
-    } 
+    }
    else if (d->components == 1) {
        //Load component
        T *c_r;
@ -197,11 +197,11 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
        bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);
-        // Compute DWT 
+        // Compute DWT
        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
-        // Store DWT to file 
+        // Store DWT to file
-// #ifdef OUTPUT        
+// #ifdef OUTPUT
        if (writeVisual) {
            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
        } else {
@ -218,7 +218,7 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
    cudaCheckError("Cuda free device");
 }
-int main(int argc, char **argv) 
+int main(int argc, char **argv)
 {
    int optindex = 0;
    char ch;
@ -233,13 +233,13 @@ int main(int argc, char **argv)
        {"97",          no_argument,       0, '9'}, //9/7 transform
        {"53",          no_argument,       0, '5' }, //5/3transform
        {"write-visual",no_argument,       0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
-        {"help",        no_argument,       0, 'h'}  
+        {"help",        no_argument,       0, 'h'}
    };
-    
+
    int pixWidth    = 0; //<real pixWidth
    int pixHeight   = 0; //<real pixHeight
    int compCount   = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
-    int bitDepth    = 8; 
+    int bitDepth    = 8;
    int dwtLvls     = 3; //default numuber of DWT levels
    int device      = 0;
    int forward     = 1; //forward transform
@ -322,19 +322,19 @@ int main(int argc, char **argv)
    if (devCount == 0) {
        printf("No CUDA enabled device\n");
        return -1;
-    } 
+    }
    if (device < 0 || device > devCount -1) {
-        printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n", 
+        printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
               device, 0, devCount -1);
        return -1;
    }
-    cudaDeviceProp devProp;                                          
+    cudaDeviceProp devProp;
-    cudaGetDeviceProperties(&devProp, device);  
+    cudaGetDeviceProperties(&devProp, device);
    cudaCheckError("Get device properties");
-    // if (devProp.major < 1) {                                         
+    // if (devProp.major < 1) {
    //     printf("Device %d does not support CUDA\n", device);
    //     return -1;
-    // }                                                                   
+    // }
    printf("Using device %d: %s\n", device, devProp.name);
    cudaSetDevice(device);
    cudaCheckError("Set selected device");
@ -366,14 +366,14 @@ int main(int argc, char **argv)
    printf(" DWT levels:\t\t%d\n", dwtLvls);
    printf(" Forward transform:\t%d\n", forward);
    printf(" 9/7 transform:\t\t%d\n", dwt97);
-    
+
    //data sizes
    int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
    //load img source image
    cudaMallocHost((void **)&d->srcImg, inputSize);
    cudaCheckError("Alloc host memory");
-    if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) 
+    if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
        return -1;
    /* DWT */
--- a/examples/dwt2d/run_cpu.sh
+++ b/examples/dwt2d/run_cpu.sh
@ -5,4 +5,3 @@
 ./dwt2d 4.bmp  -d 4x4 -r -5 -l 3
 # ./dwt2d 4.bmp  -d 4x4 -r -9 -l 3
 # ./dwt2d 8.bmp  -d 8x8 -f -9 -l 3
--- a/examples/dwt2d/test_compile_nvcc.sh
+++ b/examples/dwt2d/test_compile_nvcc.sh
@ -7,12 +7,3 @@
 /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
 /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include  -O2  --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
 g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
--- a/examples/microbench/cudamemcpy_test.cc
+++ b/examples/microbench/cudamemcpy_test.cc
@ -1,42 +1,40 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(int n, float a, float *x, float *y) {
-void saxpy(int n, float a, float *x, float *y)
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
-{
+  if (i < n)
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
+    y[i] = a * x[i] + y[i];
  if (i < n) y[i] = a*x[i] + y[i];
 }
-int main(void)
+int main(void) {
-{
+  int N = 1 << 20;
  int N = 1<<20;
  float *x, *y, *d_x, *d_y;
-  x = (float*)malloc(N*sizeof(float));
+  x = (float *)malloc(N * sizeof(float));
-  y = (float*)malloc(N*sizeof(float));
+  y = (float *)malloc(N * sizeof(float));
-  cudaMalloc(&d_x, N*sizeof(float)); 
+  cudaMalloc(&d_x, N * sizeof(float));
-  cudaMalloc(&d_y, N*sizeof(float));
+  cudaMalloc(&d_y, N * sizeof(float));
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
-  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
  // Perform SAXPY on 1M elements
  // saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
-  cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
-    maxError = max(maxError, abs(y[i]-4.0f));
+    maxError = max(maxError, abs(y[i] - 4.0f));
  printf("Max error: %f\n", maxError);
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
-}
+}
--- a/examples/microbench/dummy_kernel.cc
+++ b/examples/microbench/dummy_kernel.cc
@ -1,42 +1,39 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(void) {
-void saxpy(void)
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
 {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  printf("block_id:%d thread_id:%d \n", i)
 }
-int main(void)
+int main(void) {
-{
+  int N = 1 << 20;
  int N = 1<<20;
  float *x, *y, *d_x, *d_y;
-  x = (float*)malloc(N*sizeof(float));
+  x = (float *)malloc(N * sizeof(float));
-  y = (float*)malloc(N*sizeof(float));
+  y = (float *)malloc(N * sizeof(float));
-  cudaMalloc(&d_x, N*sizeof(float)); 
+  cudaMalloc(&d_x, N * sizeof(float));
-  cudaMalloc(&d_y, N*sizeof(float));
+  cudaMalloc(&d_y, N * sizeof(float));
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
-  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
  // Perform SAXPY on 1M elements
-    saxpy<<<(1,1)>>>;
+  saxpy<<<(1, 1)>>>;
-  cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
-    maxError = max(maxError, abs(y[i]-4.0f));
+    maxError = max(maxError, abs(y[i] - 4.0f));
  printf("Max error: %f\n", maxError);
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
-}
+}
--- a/examples/microbench/kerne_arg.cc
+++ b/examples/microbench/kerne_arg.cc
@ -1,41 +1,36 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(int N) { printf("hello!: %d\n", N); }
 void saxpy(int N)
 {
 printf("hello!: %d\n", N);
 }
-int main(void)
+int main(void) {
-{
+  int N = 1 << 20;
  int N = 1<<20;
  float *x, *y, *d_x, *d_y;
-  x = (float*)malloc(N*sizeof(float));
+  x = (float *)malloc(N * sizeof(float));
-  y = (float*)malloc(N*sizeof(float));
+  y = (float *)malloc(N * sizeof(float));
-  cudaMalloc(&d_x, N*sizeof(float)); 
+  cudaMalloc(&d_x, N * sizeof(float));
-  cudaMalloc(&d_y, N*sizeof(float));
+  cudaMalloc(&d_y, N * sizeof(float));
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
-  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
  // Perform SAXPY on 1M elements
-    saxpy<<<(1,1)>>>(N);
+  saxpy<<<(1, 1)>>>(N);
-  cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
-    maxError = max(maxError, abs(y[i]-4.0f));
+    maxError = max(maxError, abs(y[i] - 4.0f));
  printf("Max error: %f\n", maxError);
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
-}
+}
--- a/examples/microbench/one_thread_kernel.cc
+++ b/examples/microbench/one_thread_kernel.cc
@ -1,41 +1,36 @@
 #include <stdio.h>
-__global__
+__global__ void saxpy(void) { printf("hello!\n"); }
 void saxpy(void)
 {
 printf("hello!\n");
 }
-int main(void)
+int main(void) {
-{
+  int N = 1 << 20;
  int N = 1<<20;
  float *x, *y, *d_x, *d_y;
-  x = (float*)malloc(N*sizeof(float));
+  x = (float *)malloc(N * sizeof(float));
-  y = (float*)malloc(N*sizeof(float));
+  y = (float *)malloc(N * sizeof(float));
-  cudaMalloc(&d_x, N*sizeof(float)); 
+  cudaMalloc(&d_x, N * sizeof(float));
-  cudaMalloc(&d_y, N*sizeof(float));
+  cudaMalloc(&d_y, N * sizeof(float));
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
-  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
  // Perform SAXPY on 1M elements
-    saxpy<<<(1,1)>>>;
+  saxpy<<<(1, 1)>>>;
-  cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
-    maxError = max(maxError, abs(y[i]-4.0f));
+    maxError = max(maxError, abs(y[i] - 4.0f));
  printf("Max error: %f\n", maxError);
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
-}
+}
--- a/runtime/lib/cudaRuntimeImpl.cpp
+++ b/runtime/lib/cudaRuntimeImpl.cpp
@ -43,7 +43,7 @@ cudaError_t cudaMallocHost(void **devPtr, size_t size) {
  *devPtr = malloc(size);
  if (devPtr == NULL)
    return cudaErrorMemoryAllocation;
- return cudaSuccess;
+  return cudaSuccess;
 }
 cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
  memset(devPtr, value, count);