Merge pull request #6 from jchen706/master
add dwt2d example and fixes and workflow
This commit is contained in: commit d834f31626
@ -155,3 +155,41 @@ jobs:
llc --relocation-model=pic --filetype=obj host.bc
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
./lavaMD -boxes1d 10
- name: Execute the dwt2d example
run: |
cd ${{ github.workspace }}/SC_evaluate/rodinia-cox/dwt2d
clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=${{ github.workspace }}/cuda-10.1 --cuda-gpu-arch=sm_61 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v || true
export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH
export PATH=${{ github.workspace }}/build/compilation:$PATH
kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_50.bc common.bc
kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_50.bc components.bc
kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt53.bc
kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_50.bc dwt.bc
kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt97.bc
kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt97.bc
kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt53.bc
hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc
hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc
hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc
hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc
hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc
hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc
hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc
llc --relocation-model=pic --filetype=obj common.bc
llc --relocation-model=pic --filetype=obj components.bc
llc --relocation-model=pic --filetype=obj fdwt53.bc
llc --relocation-model=pic --filetype=obj dwt.bc
llc --relocation-model=pic --filetype=obj host.bc
llc --relocation-model=pic --filetype=obj common_host.bc
llc --relocation-model=pic --filetype=obj components_host.bc
llc --relocation-model=pic --filetype=obj fdwt53_host.bc
llc --relocation-model=pic --filetype=obj dwt_host.bc
llc --relocation-model=pic --filetype=obj fdwt97_host.bc
llc --relocation-model=pic --filetype=obj rdwt97_host.bc
llc --relocation-model=pic --filetype=obj rdwt53_host.bc
llc --relocation-model=pic --filetype=obj fdwt97.bc
llc --relocation-model=pic --filetype=obj rdwt97.bc
llc --relocation-model=pic --filetype=obj rdwt53.bc
g++ -o dwt2d -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lpthread -lc -lx86Runtime -lthreadPool -pthread
./dwt2d 192.bmp -d 192x192 -f -5 -l 3
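
The per-kernel kernelTranslator/hostTranslator/llc invocations above all follow one pattern; a minimal sketch of the same steps as a shell loop (same file names assumed, not part of the committed workflow):

    for k in common components dwt fdwt53 fdwt97 rdwt53 rdwt97; do
      # translate the SPMD kernel bitcode and its host-side bitcode to CPU form
      kernelTranslator ${k}-cuda-nvptx64-nvidia-cuda-sm_50.bc ${k}.bc
      hostTranslator ${k}-host-x86_64-unknown-linux-gnu.bc ${k}_host.bc
      # lower both to position-independent object files
      llc --relocation-model=pic --filetype=obj ${k}.bc
      llc --relocation-model=pic --filetype=obj ${k}_host.bc
    done
    hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
    llc --relocation-model=pic --filetype=obj host.bc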
@ -395,5 +395,9 @@ void init_block(llvm::Module *M, std::ofstream &fout) {
// replace inline asm
replace_asm_call(M);
// replace dynamic shared memory, if the module declares it
auto dynamic_shared_memory_addr =
    M->getGlobalVariable("dynamic_shared_memory");
if (dynamic_shared_memory_addr) {
  replace_dynamic_shared_memory(M);
}
}
@ -270,11 +270,17 @@ void AddContextSaveRestore(llvm::Instruction *instruction,
AddContextSave(instruction, alloca, intra_warp_loop);

std::vector<Instruction *> uses;
Function *f2 = instruction->getParent()->getParent();

for (Instruction::use_iterator ui = instruction->use_begin(),
                               ue = instruction->use_end();
     ui != ue; ++ui) {
  llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
  if (user == NULL)
    continue;
  Function *f1 = user->getParent()->getParent();
  if (f2->getName() != f1->getName()) {
    continue;
  }
  if (user == theStore)
@ -86,6 +86,24 @@ void mem_share2global(llvm::Module *M) {
corresponding_global_memory.insert(
    std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                  global_memory));
} else if (element_type->isStructTy()) {
auto undef = llvm::UndefValue::get(element_type);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
    *M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef,
    new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
    false);
global_memory->setDSOLocal(true);
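// The Any-kind comdat plus linkonce_odr linkage below appear intended to let
// duplicate copies of this TLS global, emitted from different translation
// units, be merged at link time instead of causing multiple-definition errors.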
Comdat *comdat = M->getOrInsertComdat(StringRef(share_memory->getName()));
comdat->setSelectionKind(Comdat::SelectionKind::Any);
global_memory->setComdat(comdat);
global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
global_memory->setInitializer(undef);
global_memory->setAlignment(share_memory->getAlignment());
corresponding_global_memory.insert(
    std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
                                                  global_memory));

} else {
assert(0 && "The required Share Memory Type is not supported\n");
}
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _COMMON_H
|
||||
#define _COMMON_H
|
||||
|
||||
//24-bit multiplication is faster on G80,
|
||||
//but we must be sure to multiply integers
|
||||
//only within [-8M, 8M - 1] range
|
||||
#define IMUL(a, b) __mul24(a, b)
|
||||
|
||||
////cuda timing macros
|
||||
//#define CTIMERINIT cudaEvent_t cstart, cstop; \
|
||||
// cudaEventCreate(&cstart); \
|
||||
// cudaEventCreate(&cstop); \
|
||||
// float elapsedTime
|
||||
//#define CTIMERSTART(cstart) cudaEventRecord(cstart,0)
|
||||
//#define CTIMERSTOP(cstop) cudaEventRecord(cstop,0); \
|
||||
// cudaEventSynchronize(cstop); \
|
||||
// cudaEventElapsedTime(&elapsedTime, cstart, cstop)
|
||||
|
||||
//divide and round up macro
|
||||
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
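// e.g. DIVANDRND(1920, 256) == 8 (rounds 7.5 up), DIVANDRND(1792, 256) == 7 (exact)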
|
||||
|
||||
# define cudaCheckError( msg ) { \
|
||||
cudaError_t err = cudaGetLastError(); \
|
||||
if( cudaSuccess != err) { \
|
||||
fprintf(stderr, "%s: %i: %s: %s.\n", \
|
||||
__FILE__, __LINE__, msg, cudaGetErrorString( err) ); \
|
||||
exit(-1); \
|
||||
} }
|
||||
|
||||
# define cudaCheckAsyncError( msg ) { \
|
||||
cudaThreadSynchronize(); \
|
||||
cudaCheckError( msg ); \
|
||||
}
|
||||
|
||||
|
||||
#endif
|
|
@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "components.h"
|
||||
#include "common.h"
|
||||
|
||||
#define THREADS 256
|
||||
|
||||
/* Store 3 RGB float components */
|
||||
__device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos)
|
||||
{
|
||||
d_r[pos] = (r/255.0f) - 0.5f;
|
||||
d_g[pos] = (g/255.0f) - 0.5f;
|
||||
d_b[pos] = (b/255.0f) - 0.5f;
|
||||
}
|
||||
|
||||
/* Store 3 RGB integer components */
|
||||
__device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos)
|
||||
{
|
||||
d_r[pos] = r - 128;
|
||||
d_g[pos] = g - 128;
|
||||
d_b[pos] = b - 128;
|
||||
}
|
||||
|
||||
/* Store float component */
|
||||
__device__ void storeComponent(float *d_c, float c, int pos)
|
||||
{
|
||||
d_c[pos] = (c/255.0f) - 0.5f;
|
||||
}
|
||||
|
||||
/* Store integer component */
|
||||
__device__ void storeComponent(int *d_c, int c, int pos)
|
||||
{
|
||||
d_c[pos] = c - 128;
|
||||
}
|
||||
|
||||
/* Copy img src data into three separated component buffers */
|
||||
template<typename T>
|
||||
__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
|
||||
unsigned char * d_src,
|
||||
int pixels)
|
||||
{
|
||||
int x = threadIdx.x;
|
||||
int gX = blockDim.x*blockIdx.x;
|
||||
|
||||
__shared__ unsigned char sData[THREADS*3];
|
||||
|
||||
/* Copy data to shared mem by 4 bytes;
|
||||
other checks are not necessary, since
|
||||
d_src buffer is aligned to sharedDataSize */
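/* Only the first THREADS*3/4 threads satisfy the test below; each of them
   copies one 4-byte float of the packed RGB bytes into shared memory. */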
|
||||
if ( (x*4) < THREADS*3 ) {
|
||||
float *s = (float *)d_src;
|
||||
float *d = (float *)sData;
|
||||
d[x] = s[((gX*3)>>2) + x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
T r, g, b;
|
||||
|
||||
int offset = x*3;
|
||||
r = (T)(sData[offset]);
|
||||
g = (T)(sData[offset+1]);
|
||||
b = (T)(sData[offset+2]);
|
||||
|
||||
int globalOutputPosition = gX + x;
|
||||
if (globalOutputPosition < pixels) {
|
||||
storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy img src data into three separated component buffers */
|
||||
template<typename T>
|
||||
__global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
|
||||
{
|
||||
int x = threadIdx.x;
|
||||
int gX = blockDim.x*blockIdx.x;
|
||||
|
||||
__shared__ unsigned char sData[THREADS];
|
||||
|
||||
/* Copy data to shared mem by 4 bytes;
|
||||
other checks are not necessary, since
|
||||
d_src buffer is aligned to sharedDataSize */
|
||||
if ( (x*4) < THREADS) {
|
||||
float *s = (float *)d_src;
|
||||
float *d = (float *)sData;
|
||||
d[x] = s[(gX>>2) + x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
T c;
|
||||
|
||||
c = (T)(sData[x]);
|
||||
|
||||
int globalOutputPosition = gX + x;
|
||||
if (globalOutputPosition < pixels) {
|
||||
storeComponent(d_c, c, globalOutputPosition);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Separate components of 8bit RGB source image */
|
||||
template<typename T>
|
||||
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height)
|
||||
{
|
||||
unsigned char * d_src;
|
||||
int pixels = width*height;
|
||||
int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
|
||||
|
||||
/* Alloc d_src buffer */
|
||||
cudaMalloc((void **)&d_src, alignedSize);
|
||||
cudaCheckAsyncError("Cuda malloc")
|
||||
cudaMemset(d_src, 0, alignedSize);
|
||||
|
||||
/* Copy data to device */
|
||||
cudaMemcpy(d_src, src, pixels*3, cudaMemcpyHostToDevice);
|
||||
cudaCheckError("Copy data to device")
|
||||
|
||||
/* Kernel */
|
||||
dim3 threads(THREADS);
|
||||
dim3 grid(alignedSize/(THREADS*3));
|
||||
assert(alignedSize%(THREADS*3) == 0);
|
||||
c_CopySrcToComponents<<<grid, threads>>>(d_r, d_g, d_b, d_src, pixels);
|
||||
cudaCheckAsyncError("CopySrcToComponents kernel")
|
||||
|
||||
/* Free Memory */
|
||||
cudaFree(d_src);
|
||||
cudaCheckAsyncError("Free memory")
|
||||
}
|
||||
template void rgbToComponents<float>(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height);
|
||||
template void rgbToComponents<int>(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height);
|
||||
|
||||
|
||||
/* Copy 8bit source image data into a color component of type T */
|
||||
template<typename T>
|
||||
void bwToComponent(T *d_c, unsigned char * src, int width, int height)
|
||||
{
|
||||
unsigned char * d_src;
|
||||
int pixels = width*height;
|
||||
int alignedSize = DIVANDRND(pixels, THREADS) * THREADS; //aligned to thread block size -- THREADS
|
||||
|
||||
/* Alloc d_src buffer */
|
||||
cudaMalloc((void **)&d_src, alignedSize);
|
||||
cudaCheckAsyncError("Cuda malloc")
|
||||
cudaMemset(d_src, 0, alignedSize);
|
||||
|
||||
/* Copy data to device */
|
||||
cudaMemcpy(d_src, src, pixels, cudaMemcpyHostToDevice);
|
||||
cudaCheckError("Copy data to device")
|
||||
|
||||
/* Kernel */
|
||||
dim3 threads(THREADS);
|
||||
dim3 grid(alignedSize/(THREADS));
|
||||
assert(alignedSize%(THREADS) == 0);
|
||||
c_CopySrcToComponent<<<grid, threads>>>(d_c, d_src, pixels);
|
||||
cudaCheckAsyncError("CopySrcToComponent kernel")
|
||||
|
||||
/* Free Memory */
|
||||
cudaFree(d_src);
|
||||
cudaCheckAsyncError("Free memory")
|
||||
}
|
||||
|
||||
template void bwToComponent<float>(float *d_c, unsigned char *src, int width, int height);
|
||||
template void bwToComponent<int>(int *d_c, unsigned char *src, int width, int height);
|
|
@ -0,0 +1,38 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _COMPONENTS_H
|
||||
#define _COMPONENTS_H
|
||||
|
||||
/* Separate components of source 8bit RGB image */
|
||||
template<typename T>
|
||||
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height);
|
||||
|
||||
/* Copy 8bit source image data into a color component of type T */
|
||||
template<typename T>
|
||||
void bwToComponent(T *d_c, unsigned char * src, int width, int height);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,385 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include "dwt_cuda/dwt.h"
|
||||
#include "dwt_cuda/common.h"
|
||||
#include "dwt.h"
|
||||
#include "common.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
inline void fdwt(float *in, float *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running fdwt97 Float \n");
|
||||
dwt_cuda::fdwt97(in, out, width, height, levels);
|
||||
}
|
||||
/*
|
||||
inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut)
|
||||
{
|
||||
dwt_cuda::fdwt97(in, out, width, height, levels, diffOut);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void fdwt(int *in, int *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running fdwt53 Int \n");
|
||||
|
||||
dwt_cuda::fdwt53(in, out, width, height, levels);
|
||||
}
|
||||
/*
|
||||
inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut)
|
||||
{
|
||||
dwt_cuda::fdwt53(in, out, width, height, levels, diffOut);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void rdwt(float *in, float *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running rdwt97 Float \n");
|
||||
|
||||
dwt_cuda::rdwt97(in, out, width, height, levels);
|
||||
}
|
||||
|
||||
inline void rdwt(int *in, int *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running rdwt53 Int \n");
|
||||
|
||||
dwt_cuda::rdwt53(in, out, width, height, levels);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
|
||||
{
|
||||
printf("\n*** %d stages of 2D forward DWT:\n", stages);
|
||||
|
||||
/* create backup of input, because each test iteration overwrites it */
|
||||
const int size = pixHeight * pixWidth * sizeof(T);
|
||||
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
|
||||
/* Measure time of individual levels. */
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
|
||||
// Measure overall time of DWT.
|
||||
/* #ifdef GPU_DWT_TESTING_1
|
||||
|
||||
dwt_cuda::CudaDWTTester tester;
|
||||
for(int i = tester.getNumIterations(); i--; ) {
|
||||
// Recover input and measure one overall DWT run.
|
||||
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
tester.beginTestIteration();
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
tester.endTestIteration();
|
||||
}
|
||||
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
cudaCheckAsyncError("DWT Kernel calls");
|
||||
*/ return 0;
|
||||
}
|
||||
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
|
||||
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
template<typename T>
|
||||
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
|
||||
{
|
||||
printf("*** %d stages of 2D forward DWT:\n", stages);
|
||||
|
||||
// create backup of input, because each test iteration overwrites it
|
||||
const int size = pixHeight * pixWidth * sizeof(T);
|
||||
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
|
||||
// Measure time of individual levels.
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
|
||||
// Measure overall time of DWT.
|
||||
#ifdef GPU_DWT_TESTING_1
|
||||
|
||||
dwt_cuda::CudaDWTTester tester;
|
||||
for(int i = tester.getNumIterations(); i--; ) {
|
||||
// Recover input and measure one overall DWT run.
|
||||
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
tester.beginTestIteration();
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
tester.endTestIteration();
|
||||
}
|
||||
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
cudaCheckAsyncError("DWT Kernel calls");
|
||||
return 0;
|
||||
}
|
||||
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool, float*);
|
||||
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool, int*);
|
||||
|
||||
*/
|
||||
|
||||
void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename)
|
||||
{
|
||||
int i;
|
||||
std::ofstream outputFile;
|
||||
char outfile[strlen(filename)+strlen(".txt")];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), ".txt");
|
||||
outputFile.open(outfile);
|
||||
|
||||
|
||||
for(i = 0; i < samplesNum; i++) {
|
||||
float r = (src[i]+0.5f) * 255;
|
||||
if (r > 255) r = 255;
|
||||
if (r < 0) r = 0;
|
||||
dst[i] = (unsigned char)r;
|
||||
outputFile << "index: " << i << " val: "<< r <<" \n";
|
||||
|
||||
|
||||
}
|
||||
outputFile.close();
|
||||
}
|
||||
|
||||
void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename)
|
||||
{
|
||||
int i;
|
||||
std::ofstream outputFile;
|
||||
char outfile[strlen(filename)+strlen(".txt")];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), ".txt");
|
||||
outputFile.open(outfile);
|
||||
for(i = 0; i < samplesNum; i++) {
|
||||
int r = src[i]+128;
|
||||
if (r > 255) r = 255;
|
||||
if (r < 0) r = 0;
|
||||
dst[i] = (unsigned char)r;
|
||||
// added this line to output check
|
||||
outputFile << "index: " << i << " val: "<< r <<" \n";
|
||||
}
|
||||
outputFile.close();
|
||||
}
|
||||
|
||||
/* Write output, linearly ordered */
|
||||
template<typename T>
|
||||
int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
|
||||
const char * filename, const char * suffix)
|
||||
{
|
||||
unsigned char * result;
|
||||
T *gpu_output;
|
||||
int i;
|
||||
int size;
|
||||
int samplesNum = pixWidth*pixHeight;
|
||||
|
||||
size = samplesNum*sizeof(T);
|
||||
cudaMallocHost((void **)&gpu_output, size);
|
||||
cudaCheckError("Malloc host");
|
||||
memset(gpu_output, 0, size);
|
||||
result = (unsigned char *)malloc(samplesNum);
|
||||
cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost);
|
||||
cudaCheckError("Memcopy device to host");
|
||||
|
||||
/* T to char */
|
||||
samplesToChar(result, gpu_output, samplesNum, filename);
|
||||
|
||||
/* Write component */
|
||||
char outfile[strlen(filename)+strlen(suffix)];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), suffix);
|
||||
i = open(outfile, O_CREAT|O_WRONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", outfile);
|
||||
return -1;
|
||||
}
|
||||
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
|
||||
ssize_t x ;
|
||||
x = write(i, result, samplesNum);
|
||||
close(i);
|
||||
|
||||
/* Clean up */
|
||||
cudaFreeHost(gpu_output);
|
||||
cudaCheckError("Cuda free host memory");
|
||||
free(result);
|
||||
if(x == 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||
|
||||
/* Write output, visually ordered */
|
||||
template<typename T>
|
||||
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
|
||||
int stages, const char * filename, const char * suffix)
|
||||
{
|
||||
struct band {
|
||||
int dimX;
|
||||
int dimY;
|
||||
};
|
||||
struct dimensions {
|
||||
struct band LL;
|
||||
struct band HL;
|
||||
struct band LH;
|
||||
struct band HH;
|
||||
};
|
||||
|
||||
unsigned char * result;
|
||||
T *src, *dst;
|
||||
int i,s;
|
||||
int size;
|
||||
int offset;
|
||||
int yOffset;
|
||||
int samplesNum = pixWidth*pixHeight;
|
||||
struct dimensions * bandDims;
|
||||
|
||||
bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
|
||||
|
||||
bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
|
||||
bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
|
||||
bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
|
||||
bandDims[0].HL.dimY = bandDims[0].LL.dimY;
|
||||
bandDims[0].LH.dimX = bandDims[0].LL.dimX;
|
||||
bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
|
||||
bandDims[0].HH.dimX = bandDims[0].HL.dimX;
|
||||
bandDims[0].HH.dimY = bandDims[0].LH.dimY;
|
||||
|
||||
for (i = 1; i < stages; i++) {
|
||||
bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
|
||||
bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
|
||||
bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
|
||||
bandDims[i].HL.dimY = bandDims[i].LL.dimY;
|
||||
bandDims[i].LH.dimX = bandDims[i].LL.dimX;
|
||||
bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
|
||||
bandDims[i].HH.dimX = bandDims[i].HL.dimX;
|
||||
bandDims[i].HH.dimY = bandDims[i].LH.dimY;
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
|
||||
for (i = 0; i < stages; i++) {
|
||||
printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
|
||||
printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
|
||||
printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
|
||||
printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
|
||||
}
|
||||
#endif
|
||||
|
||||
size = samplesNum*sizeof(T);
|
||||
cudaMallocHost((void **)&src, size);
|
||||
cudaCheckError("Malloc host");
|
||||
dst = (T*)malloc(size);
|
||||
memset(src, 0, size);
|
||||
memset(dst, 0, size);
|
||||
result = (unsigned char *)malloc(samplesNum);
|
||||
cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost);
|
||||
cudaCheckError("Memcopy device to host");
|
||||
|
||||
// LL Band
|
||||
size = bandDims[stages-1].LL.dimX * sizeof(T);
|
||||
for (i = 0; i < bandDims[stages-1].LL.dimY; i++) {
|
||||
memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
|
||||
}
|
||||
|
||||
for (s = stages - 1; s >= 0; s--) {
|
||||
// HL Band
|
||||
size = bandDims[s].HL.dimX * sizeof(T);
|
||||
offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
|
||||
for (i = 0; i < bandDims[s].HL.dimY; i++) {
|
||||
memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
|
||||
src+offset+i*bandDims[s].HL.dimX,
|
||||
size);
|
||||
}
|
||||
|
||||
// LH band
|
||||
size = bandDims[s].LH.dimX * sizeof(T);
|
||||
offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
|
||||
yOffset = bandDims[s].LL.dimY;
|
||||
for (i = 0; i < bandDims[s].HL.dimY; i++) {
|
||||
memcpy(dst+(yOffset+i)*pixWidth,
|
||||
src+offset+i*bandDims[s].LH.dimX,
|
||||
size);
|
||||
}
|
||||
|
||||
//HH band
|
||||
size = bandDims[s].HH.dimX * sizeof(T);
|
||||
offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
|
||||
yOffset = bandDims[s].HL.dimY;
|
||||
for (i = 0; i < bandDims[s].HH.dimY; i++) {
|
||||
memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
|
||||
src+offset+i*bandDims[s].HH.dimX,
|
||||
size);
|
||||
}
|
||||
}
|
||||
|
||||
/* Write component */
|
||||
samplesToChar(result, dst, samplesNum, filename);
|
||||
|
||||
char outfile[strlen(filename)+strlen(suffix)];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), suffix);
|
||||
i = open(outfile, O_CREAT|O_WRONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", outfile);
|
||||
return -1;
|
||||
}
|
||||
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
|
||||
ssize_t x;
|
||||
x = write(i, result, samplesNum);
|
||||
close(i);
|
||||
|
||||
cudaFreeHost(src);
|
||||
cudaCheckError("Cuda free host memory");
|
||||
free(dst);
|
||||
free(result);
|
||||
free(bandDims);
|
||||
if (x == 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
||||
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _DWT_H
|
||||
#define _DWT_H
|
||||
|
||||
template<typename T>
|
||||
int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
|
||||
|
||||
template<typename T>
|
||||
int writeNStage2DDWT(T *component_cuda, int width, int height,
|
||||
int stages, const char * filename, const char * suffix);
|
||||
template<typename T>
|
||||
int writeLinear(T *component_cuda, int width, int height,
|
||||
const char * filename, const char * suffix);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,35 @@
|
|||
///
|
||||
/// @file common.cu
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 14:37
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
#include "common.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
bool CudaDWTTester::testRunning = false;
|
||||
}
|
|
@ -0,0 +1,261 @@
|
|||
///
|
||||
/// @file common.h
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @brief Common stuff for all CUDA dwt functions.
|
||||
/// @date 2011-01-20 14:19
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#ifndef DWT_COMMON_H
|
||||
#define DWT_COMMON_H
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
|
||||
|
||||
// compile time minimum macro
|
||||
#define CTMIN(a,b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
|
||||
|
||||
// performance testing macros
|
||||
#if defined(GPU_DWT_TESTING)
|
||||
#define PERF_BEGIN \
|
||||
{ \
|
||||
dwt_cuda::CudaDWTTester PERF_TESTER; \
|
||||
for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \
|
||||
{ \
|
||||
PERF_TESTER.beginTestIteration();
|
||||
|
||||
#define PERF_END(PERF_NAME, PERF_W, PERF_H) \
|
||||
PERF_TESTER.endTestIteration(); \
|
||||
} \
|
||||
PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
|
||||
}
|
||||
#else // GPU_DWT_TESTING
|
||||
#define PERF_BEGIN
|
||||
#define PERF_END(PERF_NAME, PERF_W, PERF_H)
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Divide and round up.
|
||||
template <typename T>
|
||||
__device__ __host__ inline T divRndUp(const T & n, const T & d) {
|
||||
return (n / d) + ((n % d) ? 1 : 0);
|
||||
}
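// e.g. divRndUp(19, 8) == 3, divRndUp(16, 8) == 2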
|
||||
|
||||
|
||||
// 9/7 forward DWT lifting schema coefficients
|
||||
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
|
||||
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
|
||||
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
|
||||
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
|
||||
|
||||
|
||||
// 9/7 reverse DWT lifting schema coefficients
|
||||
const float r97update2 = -f97Update2; ///< undo 9/7 update 2
|
||||
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
|
||||
const float r97update1 = -f97Update1; ///< undo 9/7 update 1
|
||||
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
|
||||
|
||||
// FDWT 9/7 scaling coefficients
|
||||
const float scale97Mul = 1.23017410491400f;
|
||||
const float scale97Div = 1.0 / scale97Mul;
|
||||
|
||||
|
||||
// 5/3 forward DWT lifting schema coefficients
|
||||
const float forward53Predict = -0.5f; /// forward 5/3 predict
|
||||
const float forward53Update = 0.25f; /// forward 5/3 update
|
||||
|
||||
// 5/3 reverse DWT lifting schema coefficients
|
||||
const float reverse53Update = -forward53Update; /// undo 5/3 update
|
||||
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
|
||||
|
||||
|
||||
|
||||
/// Functor which adds scaled sum of neighbors to given central pixel.
|
||||
struct AddScaledSum {
|
||||
const float scale; // scale of neighbors
|
||||
__device__ AddScaledSum(const float scale) : scale(scale) {}
|
||||
__device__ void operator()(const float p, float & c, const float n) const {
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
|
||||
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
|
||||
|
||||
// }
|
||||
|
||||
c += scale * (p + n);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Returns index ranging from 0 to num threads, such that first half
|
||||
/// of threads get even indices and others get odd indices. Each thread
|
||||
/// gets different index.
|
||||
/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7
|
||||
/// parityIdx: 0 2 4 6 1 3 5 7
|
||||
/// @tparam THREADS total count of participating threads
|
||||
/// @return parity-separated index of thread
|
||||
template <int THREADS>
|
||||
__device__ inline int parityIdx() {
|
||||
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// size of shared memory
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int SHM_SIZE = 48 * 1024;
|
||||
#else
|
||||
const int SHM_SIZE = 16 * 1024;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/// Performance and return code tester.
|
||||
class CudaDWTTester {
|
||||
private:
|
||||
static bool testRunning; ///< true if any test is currently running
|
||||
cudaEvent_t beginEvent; ///< begin CUDA event
|
||||
cudaEvent_t endEvent; ///< end CUDA event
|
||||
std::vector<float> times; ///< collected times
|
||||
const bool disabled; ///< true if this object is disabled
|
||||
public:
|
||||
/// Checks CUDA related error.
|
||||
/// @param status return code to be checked
|
||||
/// @param message message to be shown if there was an error
|
||||
/// @return true if there was no error, false otherwise
|
||||
static bool check(const cudaError_t & status, const char * message) {
|
||||
#if defined(GPU_DWT_TESTING)
|
||||
if((!testRunning) && status != cudaSuccess) {
|
||||
const char * errorString = cudaGetErrorString(status);
|
||||
fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
|
||||
fflush(stderr);
|
||||
return false;
|
||||
}
|
||||
#endif // GPU_DWT_TESTING
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Checks last kernel call for errors.
|
||||
/// @param message description of the kernel call
|
||||
/// @return true if there was no error, false otherwise
|
||||
static bool checkLastKernelCall(const char * message) {
|
||||
#if defined(GPU_DWT_TESTING)
|
||||
return testRunning ? true : check(cudaThreadSynchronize(), message);
|
||||
#else // GPU_DWT_TESTING
|
||||
return true;
|
||||
#endif // GPU_DWT_TESTING
|
||||
}
|
||||
|
||||
/// Initializes DWT tester for time measurement
|
||||
CudaDWTTester() : disabled(testRunning) {}
|
||||
|
||||
/// Gets preferred number of iterations.
|
||||
int getNumIterations() {
|
||||
return disabled ? 1 : 31;
|
||||
}
|
||||
|
||||
/// Starts one test iteration.
|
||||
void beginTestIteration() {
|
||||
if(!disabled) {
|
||||
cudaEventCreate(&beginEvent);
|
||||
cudaEventCreate(&endEvent);
|
||||
cudaEventRecord(beginEvent, 0);
|
||||
testRunning = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Ends one test iteration.
|
||||
void endTestIteration() {
|
||||
if(!disabled) {
|
||||
float time;
|
||||
testRunning = false;
|
||||
cudaEventRecord(endEvent, 0);
|
||||
cudaEventSynchronize(endEvent);
|
||||
cudaEventElapsedTime(&time, beginEvent, endEvent);
|
||||
cudaEventDestroy(beginEvent);
|
||||
cudaEventDestroy(endEvent);
|
||||
times.push_back(time);
|
||||
}
|
||||
}
|
||||
|
||||
/// Shows brief info about all iterations.
|
||||
/// @param name name of processing method
|
||||
/// @param sizeX width of processed image
|
||||
/// @param sizeY height of processed image
|
||||
void showPerformance(const char * name, const int sizeX, const int sizeY) {
|
||||
if(!disabled) {
|
||||
// compute mean and median
|
||||
std::sort(times.begin(), times.end());
|
||||
double sum = 0;
|
||||
for(int i = times.size(); i--; ) {
|
||||
sum += times[i];
|
||||
}
|
||||
const double median = (times[times.size() / 2]
|
||||
+ times[(times.size() - 1) / 2]) * 0.5f;
|
||||
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
|
||||
"(%d x %d)\n", name, (sum / times.size()), median,
|
||||
times[times.size() - 1], sizeX, sizeY);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Simple cudaMemcpy wrapped in performance tester.
|
||||
/// @param dest destination buffer
|
||||
/// @param src source buffer
|
||||
/// @param sx width of copied image
|
||||
/// @param sy height of copied image
|
||||
template <typename T>
|
||||
inline void memCopy(T * const dest, const T * const src,
|
||||
const size_t sx, const size_t sy) {
|
||||
cudaError_t status;
|
||||
PERF_BEGIN
|
||||
status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice);
|
||||
PERF_END(" memcpy", sx, sy)
|
||||
CudaDWTTester::check(status, "memcpy device > device");
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
||||
|
||||
|
||||
|
||||
#endif // DWT_COMMON_H
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
///
|
||||
/// @file dwt.h
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @brief Entry points for CUDA implementation of 9/7 and 5/3 DWT.
|
||||
/// @date 2011-01-20 11:41
|
||||
///
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
///
|
||||
///
|
||||
/// Following conditions are common for all four DWT functions:
|
||||
/// - Both input and output images are stored in GPU memory with no padding
|
||||
/// of lines or interleaving of pixels.
|
||||
/// - DWT coefficients are stored as follows: Each band is saved as one
|
||||
/// consecutive chunk (no padding/stride/interleaving). Deepest level bands
|
||||
/// (smallest ones) are stored first (at the beginning of the input/output
|
||||
/// buffers), less deep bands follow. There is no padding between stored
|
||||
/// bands in the buffer. Order of bands of the same level in the buffer is
|
||||
/// following: Low-low band (or deeper level subbands) is stored first.
|
||||
///     Vertical-low/horizontal-high band follows. Vertical-high/horizontal-low
|
||||
/// band is saved next and finally, the high-high band is saved. Out of all
|
||||
///     low-low bands, only the deepest one is saved (right at the beginning of
|
||||
///     the buffer), others are replaced with deeper level subbands (an example
///     layout is sketched at the end of this header comment).
|
||||
/// - Input images of all functions won't be preserved (will be overwritten).
|
||||
/// - Input and output buffers can't overlap.
|
||||
/// - Size of output buffer must be greater or equal to size of input buffer.
|
||||
///
|
||||
/// There are no common compile time settings (buffer size, etc...) for
|
||||
/// all DWTs, because each DWT type needs a different amount of GPU resources.
|
||||
/// Instead, each DWT type has its own compile time settings, which can be
|
||||
/// found in *.cu file, where it is implemented.
|
||||
///
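///
/// Example layout for 2 DWT levels (band names follow writeNStage2DDWT in
/// dwt.cu; level 2 is the deeper level):
///   [ LL2 | HL2 | LH2 | HH2 | HL1 | LH1 | HH1 ]
///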
|
||||
|
||||
#ifndef DWT_CUDA_H
|
||||
#define DWT_CUDA_H
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Forward 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Expected to be normalized into range [-128, 127].
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-128, 127].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
/// Forward 9/7 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Should be normalized (in range
|
||||
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - format specified in common rules
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
/// Reverse 9/7 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-0.5, 0.5].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
|
||||
|
||||
#endif // DWT_CUDA_H
|
||||
|
|
@ -0,0 +1,400 @@
|
|||
/// @file fdwt53.cu
|
||||
/// @brief CUDA implementation of forward 5/3 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-04 13:23
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Wraps buffer and methods needed for computing one level of 5/3 FDWT
|
||||
/// using sliding window approach.
|
||||
/// @tparam WIN_SIZE_X width of sliding window
|
||||
/// @tparam WIN_SIZE_Y height of sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class FDWT53 {
|
||||
private:
|
||||
|
||||
/// Info needed for processing of one input column.
|
||||
/// @tparam CHECKED_LOADER true if column's loader should check boundaries
|
||||
///                        false if there are no near boundaries to check
|
||||
template <bool CHECKED_LOADER>
|
||||
struct FDWT53Column {
|
||||
/// loader for the column
|
||||
VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;
|
||||
|
||||
/// offset of the column in shared buffer
|
||||
int offset;
|
||||
|
||||
// backup of first 3 loaded pixels (not transformed)
|
||||
int pixel0, pixel1, pixel2;
|
||||
|
||||
/// Sets all fields to anything to prevent 'uninitialized' warnings.
|
||||
__device__ void clear() {
|
||||
offset = pixel0 = pixel1 = pixel2 = 0;
|
||||
loader.clear();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Type of shared memory buffer for 5/3 FDWT transforms.
|
||||
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> FDWT53Buffer;
|
||||
|
||||
/// Actual shared buffer used for forward 5/3 DWT.
|
||||
FDWT53Buffer buffer;
|
||||
|
||||
/// Difference between indices of two vertical neighbors in buffer.
|
||||
enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// Forward 5/3 DWT predict operation.
|
||||
struct Forward53Predict {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
// c = n;
|
||||
c -= (p + n) / 2; // F.8, page 126, ITU-T Rec. T.800 final draft
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Forward 5/3 DWT update operation.
|
||||
struct Forward53Update {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
c += (p + n + 2) / 4; // F.9, page 126, ITU-T Rec. T.800 final draft
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Initializes one column: computes offset of the column in shared memory
|
||||
/// buffer, initializes loader and finally uses it to load first 3 pixels.
|
||||
/// @tparam CHECKED true if loader of the column checks boundaries
|
||||
/// @param column (uninitialized) column info to be initialized
|
||||
/// @param input input image
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param colIndex x-axis coordinate of the column (relative to the left
|
||||
/// side of this threadblock's block of input pixels)
|
||||
/// @param firstY y-axis coordinate of first image row to be transformed
|
||||
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(FDWT53Column<CHECKED> & column,
|
||||
const int * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
const int colIndex, const int firstY) {
|
||||
// get offset of the column with index 'cId'
|
||||
column.offset = buffer.getColumnOffset(colIndex);
|
||||
|
||||
// coordinates of the first pixel to be loaded
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 3 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
column.pixel2 = column.loader.loadFrom(input); // loaded pixel #0
|
||||
column.pixel1 = column.loader.loadFrom(input); // loaded pixel #1
|
||||
column.pixel0 = column.loader.loadFrom(input); // loaded pixel #2
|
||||
|
||||
// reinitialize loader to start with pixel #1 again
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY + 1);
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 2);
|
||||
|
||||
// load 3 rows into the column
|
||||
column.pixel0 = column.loader.loadFrom(input);
|
||||
column.pixel1 = column.loader.loadFrom(input);
|
||||
column.pixel2 = column.loader.loadFrom(input);
|
||||
// Now, the next pixel, which will be loaded by loader, is pixel #1.
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Loads and vertically transforms given column. Assumes that first 3
|
||||
/// pixels are already loaded in column fields pixel0 ... pixel2.
|
||||
/// @tparam CHECKED true if loader of the column checks boundaries
|
||||
/// @param column column to be loaded and vertically transformed
|
||||
/// @param input pointer to input image data
|
||||
template <bool CHECKED>
|
||||
__device__ void loadAndVerticallyTransform(FDWT53Column<CHECKED> & column,
|
||||
const int * const input) {
|
||||
// take 3 loaded pixels and put them into shared memory transform buffer
|
||||
buffer[column.offset + 0 * STRIDE] = column.pixel0;
|
||||
buffer[column.offset + 1 * STRIDE] = column.pixel1;
|
||||
buffer[column.offset + 2 * STRIDE] = column.pixel2;
|
||||
|
||||
// load remaining pixels to be able to vertically transform the window
|
||||
|
||||
for(int i = 3; i < (3 + WIN_SIZE_Y); i++)
|
||||
{
|
||||
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
|
||||
}
|
||||
|
||||
// remember last 3 pixels for use in next iteration
|
||||
column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE];
|
||||
column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE];
|
||||
column.pixel2 = buffer[column.offset + (WIN_SIZE_Y + 2) * STRIDE];
|
||||
|
||||
// vertically transform the column in transform buffer
|
||||
buffer.forEachVerticalOdd(column.offset, Forward53Predict());
|
||||
buffer.forEachVerticalEven(column.offset, Forward53Update());
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Actual implementation of 5/3 FDWT.
|
||||
/// @tparam CHECK_LOADS true if input loader must check boundaries
|
||||
/// @tparam CHECK_WRITES true if output writer must check boundaries
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template <bool CHECK_LOADS, bool CHECK_WRITES>
|
||||
__device__ void transform(const int * const in, int * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about one main and one boundary column processed by this thread
|
||||
FDWT53Column<CHECK_LOADS> column;
|
||||
FDWT53Column<CHECK_LOADS> boundaryColumn; // only a few threads use this
|
||||
|
||||
// Initialize all column info: initialize loaders, compute offset of
|
||||
// column in shared buffer and initialize loader of column.
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY);
|
||||
|
||||
|
||||
// first 3 threads initialize boundary columns, others do not use them
|
||||
boundaryColumn.clear();
|
||||
if(threadIdx.x < 3) {
|
||||
// index of boundary column (relative x-axis coordinate of the column)
|
||||
const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3);
|
||||
|
||||
// initialize the column
|
||||
initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
|
||||
|
||||
}
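// (A worked example of the boundary-column indices above: thread #0 gets
// colId == WIN_SIZE_X, the column just right of this block's window, while
// threads #1 and #2 get colId == -2 and -1, the two columns just left of it.)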
|
||||
|
||||
|
||||
// index of column which will be written into output by this thread
|
||||
const int outColumnIndex = parityIdx<WIN_SIZE_X>();
|
||||
|
||||
// offset of column which will be written by this thread into output
|
||||
const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
|
||||
|
||||
// initialize output writer for this thread
|
||||
const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
|
||||
VerticalDWTBandWriter<int, CHECK_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, outputFirstX, firstY);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
// Sliding window iterations:
|
||||
// Each iteration assumes that first 3 pixels of each column are loaded.
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
|
||||
// For each column (including boundary columns): load and vertically
|
||||
// transform another WIN_SIZE_Y lines.
|
||||
loadAndVerticallyTransform(column, in);
|
||||
if(threadIdx.x < 3) {
|
||||
loadAndVerticallyTransform(boundaryColumn, in);
|
||||
}
|
||||
|
||||
// wait for all columns to be vertically transformed and transform all
|
||||
// output rows horizontally
|
||||
__syncthreads();
|
||||
|
||||
|
||||
buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
|
||||
__syncthreads();
|
||||
|
||||
buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());
|
||||
|
||||
// wait for all output rows to be transformed horizontally and write
|
||||
// them into output buffer
|
||||
__syncthreads();
|
||||
|
||||
|
||||
for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
|
||||
// Write low coefficients from output column into low band ...
|
||||
writer.writeLowInto(out, buffer[outColumnOffset + r * STRIDE]);
|
||||
// ... and high coefficients into the high band.
|
||||
writer.writeHighInto(out, buffer[outColumnOffset + (r+1) * STRIDE]);
|
||||
}
|
||||
|
||||
// before proceeding to next iteration, wait for all output columns
|
||||
// to be written into the output
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Determines whether this block's pixels touch the image boundary and selects
/// the right version of the algorithm accordingly - for many threadblocks it
/// selects a version which does not deal with boundary mirroring and is thus
/// slightly faster.
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
/// @param steps number of sliding window steps
|
||||
__device__ static void run(const int * const in, int * const out,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// if(blockIdx.x==0 && blockIdx.y ==11 && threadIdx.x >=0&&threadIdx.x <64){
|
||||
// object with transform buffer in shared memory
|
||||
__shared__ FDWT53<WIN_SIZE_X, WIN_SIZE_Y> fdwt53;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (1 in next expressions is for radius of impulse response of 5/3 FDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
|
||||
const bool atRightBoudary = maxX >= sx;
|
||||
const bool atBottomBoudary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("fdwt53 run");
|
||||
// }
|
||||
if(atBottomBoudary)
|
||||
{
|
||||
// near bottom boundary => check both writing and reading
|
||||
fdwt53.transform<true, true>(in, out, sx, sy, steps);
|
||||
} else if(atRightBoudary)
|
||||
{
|
||||
// near right boundary only => check writing only
|
||||
fdwt53.transform<false, true>(in, out, sx, sy, steps);
|
||||
} else
|
||||
{
|
||||
// no nearby boundary => check nothing
|
||||
fdwt53.transform<false, false>(in, out, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
// }
|
||||
|
||||
}; // end of class FDWT53
|
||||
|
||||
|
||||
|
||||
/// Main GPU 5/3 FDWT entry point.
|
||||
/// @tparam WIN_SX width of sliding window to be used
|
||||
/// @tparam WIN_SY height of sliding window to be used
|
||||
/// @param input input image
|
||||
/// @param output output buffer
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void fdwt53Kernel(const int * const input, int * const output,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
FDWT53<WIN_SX, WIN_SY>::run(input, output, sizeX, sizeY, winSteps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then launches the 5/3 FDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
|
||||
|
||||
int gx = divRndUp(sx, WIN_SX);
|
||||
int gy = divRndUp(sy, WIN_SY * steps);
|
||||
|
||||
printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
// printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
|
||||
|
||||
// run kernel, possibly measure time and finally check the call
|
||||
// PERF_BEGIN
|
||||
fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
// PERF_END(" FDWT53", sx, sy)
|
||||
// CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel");
|
||||
printf("fdwt53Kernel in launchFDWT53Kernel has finished");
|
||||
|
||||
}
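// A worked example of the launch geometry above (assuming divRndUp rounds up):
// for a 1024 x 768 image, fdwt53() below selects WIN_SX = 192 and WIN_SY = 8,
// so steps = divRndUp(768, 15 * 8) = 7, gSize.x = divRndUp(1024, 192) = 6 and
// gSize.y = divRndUp(768, 8 * 7) = 14; each threadblock of 192 threads thus
// slides its window 7 times over a block of 192 x 56 pixels.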
|
||||
|
||||
|
||||
|
||||
/// Forward 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Expected to be normalized into range [-128, 127].
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
|
||||
// select right width of kernel for the size of the image
|
||||
|
||||
if(sizeX >= 960) {
|
||||
launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
|
||||
}
|
||||
|
||||
// if this was not the last level, continue recursively with other levels
|
||||
if(levels > 1) {
|
||||
// copy output's LL band back into input buffer
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
// printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY);
|
||||
memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238
|
||||
|
||||
// run remaining levels of FDWT
|
||||
fdwt53(in, out, llSizeX, llSizeY, levels - 1);
|
||||
}
|
||||
}
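// A hedged usage sketch (hypothetical caller, not part of the original file):
// running a three-level forward 5/3 DWT on a buffer of ints already normalized
// into [-128, 127], as the fdwt53 comment above requires.
static void exampleRunFDWT53(const int * hostPixels, int * hostBands,
const int sizeX, const int sizeY) {
int * d_in = 0;
int * d_out = 0;
const size_t bytes = (size_t)sizeX * sizeY * sizeof(int);
cudaMalloc((void**)&d_in, bytes);
cudaMalloc((void**)&d_out, bytes);
cudaMemcpy(d_in, hostPixels, bytes, cudaMemcpyHostToDevice);
fdwt53(d_in, d_out, sizeX, sizeY, 3); // 3 recursive levels
cudaMemcpy(hostBands, d_out, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);
}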
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,383 @@
|
|||
///
|
||||
/// @file fdwt97.cu
|
||||
/// @brief CUDA implementation of forward 9/7 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 13:18
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
|
||||
/// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
|
||||
/// of specified size. Template arguments specify this size.
|
||||
/// @tparam WIN_SIZE_X width of sliding window
|
||||
/// @tparam WIN_SIZE_Y height of sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class FDWT97 {
|
||||
private:
|
||||
/// Type of shared memory buffer used for 9/7 DWT.
|
||||
typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> FDWT97Buffer;
|
||||
|
||||
/// Actual shared buffer used for forward 9/7 DWT.
|
||||
FDWT97Buffer buffer;
|
||||
|
||||
/// Difference of indices of two vertically neighboring items in buffer.
|
||||
enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// One thread's info about loading input image
|
||||
/// @tparam CHECKED true if loader should check for image boundaries
|
||||
template <bool CHECKED>
|
||||
struct FDWT97ColumnLoadingInfo {
|
||||
/// Loader of pixels from some input image.
|
||||
VerticalDWTPixelLoader<float, CHECKED> loader;
|
||||
|
||||
/// Offset of column loaded by loader. (Offset in shared buffer.)
|
||||
int offset;
|
||||
};
|
||||
|
||||
|
||||
/// Horizontal 9/7 FDWT on specified lines of transform buffer.
|
||||
/// @param lines number of lines to be transformed
|
||||
/// @param firstLine index of the first line to be transformed
|
||||
__device__ void horizontalFDWT97(const int lines, const int firstLine) {
|
||||
__syncthreads();
|
||||
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2));
|
||||
__syncthreads();
|
||||
|
||||
buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Initializes one column of shared transform buffer with 7 input pixels.
|
||||
/// Those 7 pixels will not be transformed. Also initializes given loader.
|
||||
/// @tparam CHECKED true if loader should check for image boundaries
|
||||
/// @param column (uninitialized) object for loading input pixels
|
||||
/// @param columnIndex index (not offset!) of the column to be loaded
|
||||
/// (relative to threadblock's first column)
|
||||
/// @param input pointer to input image in GPU memory
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param firstY index of first row to be loaded from image
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
|
||||
const int columnIndex, const float * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
const int firstY) {
|
||||
// get offset of the column with index 'columnIndex'
|
||||
column.offset = buffer.getColumnOffset(columnIndex);
|
||||
|
||||
// printf(" offset: %d , threadIdx: %d, blockIdx.y %d\n ", column.offset, threadIdx.x, blockIdx.y);
|
||||
|
||||
// x-coordinate of the first pixel to be loaded by given loader
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 7 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
buffer[column.offset + 4 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 3 * STRIDE] =
|
||||
buffer[column.offset + 5 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 2 * STRIDE] =
|
||||
buffer[column.offset + 6 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadFrom(input);
|
||||
|
||||
// reinitialize loader to start with pixel #3 again
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY + 3);
|
||||
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 4);
|
||||
|
||||
// load 7 rows into the transform buffer
|
||||
for(int i = 0; i < 7; i++) {
|
||||
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
|
||||
|
||||
}
|
||||
}
|
||||
// Now, the next pixel, which will be loaded by loader, is pixel #3.
|
||||
}
|
||||
|
||||
|
||||
/// Loads another WIN_SIZE_Y pixels into given column using given loader.
|
||||
/// @tparam CHECKED true if loader should check for image boundaries
|
||||
/// @param input input image to load from
|
||||
/// @param column loader and offset of loaded column in shared buffer
|
||||
template <bool CHECKED>
|
||||
inline __device__ void loadWindowIntoColumn(const float * const input,
|
||||
FDWT97ColumnLoadingInfo<CHECKED> & column) {
|
||||
for(int i = 7; i < (7 + WIN_SIZE_Y); i++) {
|
||||
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Main GPU 9/7 FDWT entry point.
|
||||
/// @tparam CHECK_LOADS true if boundaries should be checked when loading
|
||||
/// @tparam CHECK_WRITES true if boundaries should be checked when writing
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param winSteps number of steps of sliding window
|
||||
template <bool CHECK_LOADS, bool CHECK_WRITES>
|
||||
__device__ void transform(const float * const in, float * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about columns loaded by this thread: one main column and possibly
|
||||
// one boundary column. (Only some threads load some boundary column.)
|
||||
FDWT97ColumnLoadingInfo<CHECK_LOADS> loadedColumn;
|
||||
FDWT97ColumnLoadingInfo<CHECK_LOADS> boundaryColumn;
|
||||
|
||||
// Initialize first 7 lines of transform buffer.
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY);
|
||||
|
||||
// Some threads initialize boundary columns.
|
||||
boundaryColumn.offset = 0;
|
||||
boundaryColumn.loader.clear();
|
||||
if(threadIdx.x < 7) {
|
||||
// each thread among first 7 ones gets index of one of boundary columns
|
||||
const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7);
|
||||
|
||||
// Thread initializes offset of the boundary column (in shared buffer),
|
||||
// first 7 pixels of the column and a loader for this column.
|
||||
initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY);
|
||||
}
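// (A worked example of the assignment above: threads #0..#2 get boundary
// columns WIN_SIZE_X .. WIN_SIZE_X + 2 just right of the window, while
// threads #3..#6 get columns -4 .. -1 just left of it.)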
|
||||
|
||||
// horizontally transform first 7 rows in all columns
|
||||
horizontalFDWT97(7, 0);
|
||||
|
||||
// Index of column handled by this thread. (First half of threads handle
|
||||
// even columns and others handle odd columns.)
|
||||
const int outColumnIndex = parityIdx<WIN_SIZE_X>();
|
||||
|
||||
// writer of output linear bands - initialize it
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
|
||||
VerticalDWTBandWriter<float, CHECK_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// transform buffer offset of column transformed and saved by this thread
|
||||
const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
|
||||
|
||||
// (Each iteration of this loop assumes that first 7 rows of transform
|
||||
// buffer are already loaded with horizontally transformed coefficients.)
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
// Load another WIN_SIZE_Y lines of thread's column into the buffer.
|
||||
loadWindowIntoColumn(in, loadedColumn);
|
||||
|
||||
// some threads also load boundary columns
|
||||
if(threadIdx.x < 7) {
|
||||
loadWindowIntoColumn(in, boundaryColumn);
|
||||
}
|
||||
|
||||
// horizontally transform all newly loaded lines
|
||||
horizontalFDWT97(WIN_SIZE_Y, 7);
|
||||
|
||||
// Using 7 registers, remember current values of last 7 rows of
|
||||
// transform buffer. These rows are transformed horizontally only
|
||||
// and will be used in next iteration.
|
||||
float last7Lines[7];
|
||||
for(int i = 0; i < 7; i++) {
|
||||
last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
|
||||
}
|
||||
|
||||
// vertically transform all central columns (do not scale yet)
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1));
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2));
|
||||
|
||||
// Save all results of current window. Results are in transform buffer
|
||||
// at rows from #4 to #(4 + WIN_SIZE_Y). Other rows are invalid now.
|
||||
// (They only served as a boundary for vertical FDWT.)
|
||||
|
||||
for(int i = 4; i < (4 + WIN_SIZE_Y); i += 2) {
|
||||
const int index = outColumnOffset + i * STRIDE;
|
||||
// Write low coefficients from column into low band ...
|
||||
writer.writeLowInto(out, buffer[index] * scale97Div);
|
||||
// ... and high coefficients into the high band.
|
||||
writer.writeHighInto(out, buffer[index + STRIDE] * scale97Mul);
|
||||
}
|
||||
|
||||
// Use last 7 remembered lines as first 7 lines for next iteration.
|
||||
// As expected, these lines are already horizontally transformed.
|
||||
for(int i = 0; i < 7; i++) {
|
||||
buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
|
||||
|
||||
}
|
||||
|
||||
// Wait for all writing threads before proceeding to loading new
|
||||
// pixels in next iteration. (Not to overwrite those which
|
||||
// are not written yet.)
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Runs one of specialized variants of 9/7 FDWT according to distance of
|
||||
/// processed pixels to image boundary. Some variants do not check for the
/// boundary and thus are slightly faster.
|
||||
/// @param input input image
/// @param output output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
/// @param steps number of steps of sliding window
|
||||
__device__ static void run(const float * const input, float * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// object with transform buffer in shared memory
|
||||
__shared__ FDWT97<WIN_SIZE_X, WIN_SIZE_Y> fdwt97;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (3 in next expressions is for radius of impulse response of 9/7 FDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
|
||||
const bool atRightBoudary = maxX >= sx;
|
||||
const bool atBottomBoudary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
if(atBottomBoudary) {
|
||||
// near bottom boundary => check both writing and reading
|
||||
// printf("\n atBottomBoudary \n ");
|
||||
fdwt97.transform<true, true>(input, output, sx, sy, steps);
|
||||
} else if(atRightBoudary) {
|
||||
|
||||
// near right boundary only => check writing only
|
||||
fdwt97.transform<false, true>(input, output, sx, sy, steps);
|
||||
} else {
|
||||
|
||||
// no nearby boundary => check nothing
|
||||
fdwt97.transform<false, false>(input, output, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
|
||||
}; // end of class FDWT97
|
||||
|
||||
|
||||
|
||||
/// Main GPU 9/7 FDWT entry point.
|
||||
/// @param input input image
|
||||
/// @param output output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
/// @param steps number of steps of sliding window
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void fdwt97Kernel(const float * const input, float * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// Excuse me, dear reader of this code - this call has to be here. If you
// try to simply put the contents of the following method right here, the CUDA
// compiler (version 3.2) will spit out tons of nonsensical error messages ...
// Hope they will not break it even more in future releases.
|
||||
FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then launches the 9/7 FDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
|
||||
|
||||
// run kernel, possibly measure time and finally check the call
|
||||
PERF_BEGIN
|
||||
fdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
PERF_END(" FDWT97", sx, sy)
|
||||
CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
|
||||
/// @param in Input DWT coefficients. Should be normalized (in range
|
||||
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - format specified in common rules
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
|
||||
// select right width of kernel for the size of the image
|
||||
if(sizeX >= 960) {
|
||||
launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
|
||||
}
|
||||
|
||||
// if this was not the last level, continue recursively with other levels
|
||||
if(levels > 1) {
|
||||
// copy output's LL band back into input buffer
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
memCopy(in, out, llSizeX, llSizeY);
|
||||
|
||||
// run remaining levels of FDWT
|
||||
fdwt97(in, out, llSizeX, llSizeY, levels - 1);
|
||||
}
|
||||
}
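// A hedged host-side sketch (hypothetical helper, illustration only) of the
// normalization the comment above asks for: mapping 8-bit samples into the
// [-0.5, 0.5] range expected by the 9/7 path (the 5/3 path works on ints in
// [-128, 127] instead).
static void normalizeForFDWT97Example(const unsigned char * src, float * dst,
const int count) {
for(int i = 0; i < count; i++) {
dst[i] = src[i] / 255.0f - 0.5f;
}
}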
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,483 @@
|
|||
///
|
||||
/// @file: io.h
|
||||
/// @brief Manages loading and saving linearly stored bands and input images.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 22:38
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#ifndef IO_H
|
||||
#define IO_H
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Base for all IO classes - manages mirroring.
|
||||
class DWTIO {
|
||||
protected:
|
||||
/// Handles mirroring of image at edges in a DWT correct way.
|
||||
/// @param d a position in the image (will be replaced by mirrored d)
|
||||
/// @param sizeD size of the image along the dimension of 'd'
|
||||
__device__ static void mirror(int & d, const int & sizeD) {
|
||||
// TODO: enable multiple mirroring:
|
||||
// if(sizeD > 1) {
|
||||
// if(d < 0) {
|
||||
// const int underflow = -1 - d;
|
||||
// const int phase = (underflow / (sizeD - 1)) & 1;
|
||||
// const int remainder = underflow % (sizeD - 1);
|
||||
// if(phase == 0) {
|
||||
// d = remainder + 1;
|
||||
// } else {
|
||||
// d = sizeD - 2 - remainder;
|
||||
// }
|
||||
// } else if(d >= sizeD) {
|
||||
// const int overflow = d - sizeD;
|
||||
// const int phase = (overflow / (sizeD - 1)) & 1;
|
||||
// const int remainder = overflow % (sizeD - 1);
|
||||
// if(phase == 0) {
|
||||
// d = sizeD - 2 - remainder;
|
||||
// } else {
|
||||
// d = remainder + 1;
|
||||
// }
|
||||
// }
|
||||
// } else {
|
||||
// d = 0;
|
||||
// }
|
||||
//for test the mirror's use Feb 17
|
||||
if(d >= sizeD) {
|
||||
d = 2 * sizeD - 2 - d;
|
||||
} else if(d < 0) {
|
||||
d = -d;
|
||||
}
|
||||
}
|
||||
};
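// A hedged sketch of the simplified single-reflection rule used by mirror()
// above, with worked values; the helper name mirrorExample is hypothetical and
// only illustrates the same arithmetic outside of device code.
static inline int mirrorExample(int d, const int sizeD) {
if(d >= sizeD) {
d = 2 * sizeD - 2 - d; // e.g. d = 6, sizeD = 5 -> 2
} else if(d < 0) {
d = -d; // e.g. d = -2 -> 2
}
return d;
}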
|
||||
|
||||
|
||||
/// Base class for pixel loader and writer - manages computing start index,
|
||||
/// stride and end of image for loading column of pixels.
|
||||
/// @tparam T type of image pixels
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTPixelIO : protected DWTIO {
|
||||
protected:
|
||||
int end; ///< index of bottom neighbor of last pixel of column
|
||||
int stride; ///< increment of pointer to get to next pixel
|
||||
|
||||
/// Initializes pixel IO - sets end index and a position of first pixel.
|
||||
/// @param sizeX width of the image
|
||||
/// @param sizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to use
|
||||
/// @param firstY y-coordinate of first pixel to use
|
||||
/// @return index of pixel at position [x, y] in the image
|
||||
__device__ int initialize(const int sizeX, const int sizeY,
|
||||
int firstX, int firstY) {
|
||||
// initialize all pointers and stride
|
||||
end = CHECKED ? (sizeY * sizeX + firstX) : 0;
|
||||
stride = sizeX;
|
||||
return firstX + sizeX * firstY;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Writes reverse transformed pixels directly into output image.
|
||||
/// @tparam T type of output pixels
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
|
||||
private:
|
||||
int next; // index of the next pixel to be written
|
||||
|
||||
public:
|
||||
/// Initializes writer - sets output buffer and a position of first pixel.
|
||||
/// @param sizeX width of the image
|
||||
/// @param sizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to write into
|
||||
/// @param firstY y-coordinate of first pixel to write into
|
||||
__device__ void init(const int sizeX, const int sizeY,
|
||||
int firstX, int firstY) {
|
||||
if(firstX < sizeX) {
|
||||
next = this->initialize(sizeX, sizeY, firstX, firstY);
|
||||
} else {
|
||||
this->end = 0;
|
||||
this->stride = 0;
|
||||
next = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes given value at next position and advances internal pointer while
|
||||
/// correctly handling mirroring.
|
||||
/// @param output output image to write pixel into
|
||||
/// @param value value of the pixel to be written
|
||||
__device__ void writeInto(T * const output, const T & value) {
|
||||
if((!CHECKED) || (next != this->end)) {
|
||||
output[next] = value;
|
||||
next += this->stride;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Loads pixels from input image.
|
||||
/// @tparam T type of image input pixels
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTPixelLoader
|
||||
: protected VerticalDWTPixelIO<const T, CHECKED> {
|
||||
private:
|
||||
int last; ///< index of last loaded pixel
|
||||
public:
|
||||
|
||||
|
||||
//******************* FOR TEST **********************
|
||||
__device__ int getlast(){
|
||||
return last;
|
||||
}
|
||||
__device__ int getend(){
|
||||
return this->end;
|
||||
}
|
||||
__device__ int getstride(){
|
||||
return this->stride;
|
||||
}
|
||||
__device__ void setend(int a){
|
||||
this->end=a;
|
||||
}
|
||||
//******************* FOR TEST **********************
|
||||
|
||||
|
||||
|
||||
/// Initializes loader - sets input size and a position of first pixel.
|
||||
/// @param sizeX width of the image
|
||||
/// @param sizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to load
|
||||
/// @param firstY y-coordinate of first pixel to load
|
||||
__device__ void init(const int sizeX, const int sizeY,
|
||||
int firstX, int firstY) {
|
||||
// correctly mirror x coordinate
|
||||
this->mirror(firstX, sizeX);
|
||||
|
||||
// 'last' always points to already loaded pixel (subtract sizeX = stride)
|
||||
last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
|
||||
//last = (FirstX + sizeX * FirstY) - sizeX
|
||||
}
|
||||
|
||||
/// Sets all fields to zeros, for compiler not to complain about
|
||||
/// uninitialized stuff.
|
||||
__device__ void clear() {
|
||||
this->end = 0;
|
||||
this->stride = 0;
|
||||
this->last = 0;
|
||||
}
|
||||
|
||||
/// Gets another pixel and advances internal pointer to following one.
|
||||
/// @param input input image to load next pixel from
|
||||
/// @return next pixel from given image
|
||||
__device__ T loadFrom(const T * const input) {
|
||||
last += this->stride;
|
||||
if(CHECKED && (last == this->end)) {
|
||||
last -= 2 * this->stride;
|
||||
this->stride = -this->stride; // reverse loader's direction
|
||||
}
|
||||
// avoid reading from negative indices if loader is checked
|
||||
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
|
||||
if(last < 0 ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return input[last];
|
||||
// return this->end;
|
||||
// return last;
|
||||
// return this->stride;
|
||||
}
|
||||
};
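// A worked trace of the boundary handling in loadFrom() above, assuming a
// checked loader on a 4-row image (sizeY == 4, firstY == 0): successive calls
// return rows 0, 1, 2 and 3; the fifth call detects that the index reached
// 'end', steps back two rows, reverses the stride and returns row 2, and
// further calls continue upwards with rows 1 and 0 - i.e. the column is
// mirrored at the bottom edge.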
|
||||
|
||||
|
||||
|
||||
/// Base for band write and loader. Manages computing strides and pointers
|
||||
/// to first and last pixels in a linearly-stored-bands correct way.
|
||||
/// @tparam T type of band coefficients
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTBandIO : protected DWTIO {
|
||||
protected:
|
||||
/// index of bottom neighbor of last pixel of loaded column
|
||||
int end;
|
||||
|
||||
/// increment of index to get from highpass band to the lowpass one
|
||||
int strideHighToLow;
|
||||
|
||||
/// increment of index to get from the lowpass band to the highpass one
|
||||
int strideLowToHigh;
|
||||
|
||||
/// Initializes IO - sets size of image and a position of first pixel.
|
||||
/// @param imageSizeX width of the image
|
||||
/// @param imageSizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to use
|
||||
/// (Parity determines vertically low or high band.)
|
||||
/// @param firstY y-coordinate of first pixel to use
|
||||
/// (Parity determines horizontally low or high band.)
|
||||
/// @return index of first item specified by firstX and firstY
|
||||
__device__ int initialize(const int imageSizeX, const int imageSizeY,
|
||||
int firstX, int firstY) {
|
||||
// index of first pixel (topmost one) of the column with index firstX
|
||||
int columnOffset = firstX / 2;
|
||||
|
||||
// difference between indices of two vertically neighboring pixels
|
||||
// in the same band
|
||||
int verticalStride;
|
||||
|
||||
// resolve index of first pixel according to horizontal parity
|
||||
if(firstX & 1) {
|
||||
// first pixel in one of right bands
|
||||
verticalStride = imageSizeX / 2;
|
||||
columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
|
||||
strideLowToHigh = (imageSizeX * imageSizeY) / 2;
|
||||
} else {
|
||||
// first pixel in one of left bands
|
||||
verticalStride = imageSizeX / 2 + (imageSizeX & 1);
|
||||
strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
|
||||
}
|
||||
|
||||
// set the other stride
|
||||
strideHighToLow = verticalStride - strideLowToHigh;
|
||||
|
||||
// compute index of coefficient which indicates end of image
|
||||
if(CHECKED) {
|
||||
end = columnOffset // right column
|
||||
+ (imageSizeY / 2) * verticalStride // right row
|
||||
+ (imageSizeY & 1) * strideLowToHigh; // possibly in high band
|
||||
} else {
|
||||
end = 0;
|
||||
}
|
||||
|
||||
|
||||
//***********for test**************
|
||||
// end = CHECKED;
|
||||
//***********for test**************
|
||||
|
||||
|
||||
// finally, return index of the first item
|
||||
return columnOffset // right column
|
||||
+ (firstY / 2) * verticalStride // right row
|
||||
+ (firstY & 1) * strideLowToHigh; // possibly in high band
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/// Directly loads coefficients from four consecutively stored transformed
|
||||
/// bands.
|
||||
/// @tparam T type of input band coefficients
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
|
||||
private:
|
||||
int last; ///< index of last loaded pixel
|
||||
|
||||
/// Checks internal index and possibly reverses direction of loader.
|
||||
/// (Handles mirroring at the bottom of the image.)
|
||||
/// @param input input image to load next coefficient from
|
||||
/// @param stride stride to use now (one of two loader's strides)
|
||||
/// @return loaded coefficient
|
||||
__device__ T updateAndLoad(const T * const input, const int & stride) {
|
||||
last += stride;
|
||||
if(CHECKED && (last == this->end)) {
|
||||
// undo last two updates of index (to get to previous mirrored item)
|
||||
last -= (this->strideLowToHigh + this->strideHighToLow);
|
||||
|
||||
// swap and reverse strides (to move up in the loaded column now)
|
||||
const int temp = this->strideLowToHigh;
|
||||
this->strideLowToHigh = -this->strideHighToLow;
|
||||
this->strideHighToLow = -temp;
|
||||
}
|
||||
if(last < 0 ) {
|
||||
return 0;
|
||||
}
|
||||
// avoid reading from negative indices if loader is checked
|
||||
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
|
||||
return input[last];
|
||||
}
|
||||
public:
|
||||
|
||||
/// Initializes loader - sets input size and a position of first pixel.
|
||||
/// @param imageSizeX width of the image
|
||||
/// @param imageSizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to load
|
||||
/// (Parity determines vertically low or high band.)
|
||||
/// @param firstY y-coordinate of first pixel to load
|
||||
/// (Parity determines horizontally low or high band.)
|
||||
__device__ void init(const int imageSizeX, const int imageSizeY,
|
||||
int firstX, const int firstY) {
|
||||
this->mirror(firstX, imageSizeX);
|
||||
last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
|
||||
|
||||
// adjust to point to previous item
|
||||
last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
|
||||
}
|
||||
|
||||
/// Sets all fields to zeros, for compiler not to complain about
|
||||
/// uninitialized stuff.
|
||||
__device__ void clear() {
|
||||
this->end = 0;
|
||||
this->strideHighToLow = 0;
|
||||
this->strideLowToHigh = 0;
|
||||
this->last = 0;
|
||||
}
|
||||
|
||||
/// Gets another coefficient from lowpass band and advances internal index.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in the lowpass band.
|
||||
/// @param input input image to load next coefficient from
|
||||
/// @return next coefficient from the lowpass band of the given image
|
||||
__device__ T loadLowFrom(const T * const input) {
|
||||
return updateAndLoad(input, this->strideHighToLow);
|
||||
}
|
||||
|
||||
/// Gets another coefficient from the highpass band and advances index.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in high band.
|
||||
/// @param input input image to load next coefficient from
|
||||
/// @return next coefficient from the highpass band of the given image
|
||||
__device__ T loadHighFrom(const T * const input) {
|
||||
return updateAndLoad(input, this->strideLowToHigh);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/// Directly saves coefficients into four transformed bands.
|
||||
/// @tparam T type of output band coefficients
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
|
||||
private:
|
||||
int next; ///< index of the next coefficient to be written
|
||||
|
||||
/// Checks internal index and possibly stops the writer.
|
||||
/// (Handles mirroring at edges of the image.)
|
||||
/// @param output output buffer
|
||||
/// @param item item to put into the output
|
||||
/// @param stride increment of the pointer to get to next output index
|
||||
__device__ int saveAndUpdate(T * const output, const T & item,
|
||||
const int & stride) {
|
||||
// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ //test, Mar 20
|
||||
if((!CHECKED) || (next != this->end)) {
|
||||
// if(next == 4) {
|
||||
// printf(" next: %d stride: %d val: %f \n", next, stride, item );
|
||||
// }
|
||||
output[next] = item;
|
||||
next += stride;
|
||||
}
|
||||
// }
|
||||
// if((!CHECKED) || (next != this->end)) { //the real one
|
||||
// output[next] = item;
|
||||
// next += stride; //stride has been test
|
||||
// }
|
||||
return next;
|
||||
}
|
||||
public:
|
||||
|
||||
/// Initializes writer - sets output size and a position of first pixel.
|
||||
/// @param output output image
|
||||
/// @param imageSizeX width of the image
|
||||
/// @param imageSizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to write
|
||||
/// (Parity determines vertically low or high band.)
|
||||
/// @param firstY y-coordinate of first pixel to write
|
||||
/// (Parity determines horizontally low or high band.)
|
||||
__device__ void init(const int imageSizeX, const int imageSizeY,
|
||||
const int firstX, const int firstY) {
|
||||
if (firstX < imageSizeX) {
|
||||
next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
|
||||
} else {
|
||||
clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets all fields to zeros, for compiler not to complain about
|
||||
/// uninitialized stuff.
|
||||
__device__ void clear() {
|
||||
this->end = 0;
|
||||
this->strideHighToLow = 0;
|
||||
this->strideLowToHigh = 0;
|
||||
this->next = 0;
|
||||
}
|
||||
|
||||
/// Writes another coefficient into the band which was specified using
|
||||
/// init's firstX and firstY parameters and advances internal pointer.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in lowpass band.
|
||||
/// @param output output image
|
||||
/// @param low lowpass coefficient to save into the lowpass band
|
||||
__device__ int writeLowInto(T * const output, const T & primary) {
|
||||
return saveAndUpdate(output, primary, this->strideLowToHigh);
|
||||
}
|
||||
|
||||
/// Writes another coefficient from the other band and advances pointer.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in highpass band.
|
||||
/// @param output output image
|
||||
/// @param high highpass coefficient to save into the highpass band
|
||||
__device__ int writeHighInto(T * const output, const T & other) {
|
||||
return saveAndUpdate(output, other, this->strideHighToLow);
|
||||
}
|
||||
|
||||
//*******Add four functions to get private values*******
|
||||
__device__ int getnext(){
|
||||
return next;
|
||||
}
|
||||
|
||||
__device__ int getend(){
|
||||
return this->end;
|
||||
}
|
||||
|
||||
__device__ int getstrideHighToLow(){
|
||||
return this->strideHighToLow;
|
||||
}
|
||||
|
||||
__device__ int getstrideLowToHigh(){
|
||||
return this->strideLowToHigh;
|
||||
}
|
||||
|
||||
//*******Add four functions to get private values*******
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
|
||||
#endif // IO_H
|
||||
|
|
@ -0,0 +1,360 @@
|
|||
///
|
||||
/// @file rdwt53.cu
|
||||
/// @brief CUDA implementation of reverse 5/3 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-04 14:19
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
|
||||
/// Wraps shared memory buffer and algorithms needed for computing 5/3 RDWT
/// using sliding window and lifting scheme.
|
||||
/// @tparam WIN_SIZE_X width of sliding window
|
||||
/// @tparam WIN_SIZE_Y height of sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class RDWT53 {
|
||||
private:
|
||||
|
||||
/// Shared memory buffer used for 5/3 DWT transforms.
|
||||
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
|
||||
|
||||
/// Shared buffer used for reverse 5/3 DWT.
|
||||
RDWT53Buffer buffer;
|
||||
|
||||
/// Difference between indices of two vertically neighboring items in buffer.
|
||||
enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// Info needed for loading of one input column from input image.
|
||||
/// @tparam CHECKED true if loader should check boundaries
|
||||
template <bool CHECKED>
|
||||
struct RDWT53Column {
|
||||
/// loader of pixels from column in input image
|
||||
VerticalDWTBandLoader<int, CHECKED> loader;
|
||||
|
||||
/// Offset of corresponding column in shared buffer.
|
||||
int offset;
|
||||
|
||||
/// Sets all fields to some values to avoid 'uninitialized' warnings.
|
||||
__device__ void clear() {
|
||||
offset = 0;
|
||||
loader.clear();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// 5/3 DWT reverse update operation.
|
||||
struct Reverse53Update {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
c -= (p + n + 2) / 4; // F.3, page 118, ITU-T Rec. T.800 final draft
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// 5/3 DWT reverse predict operation.
|
||||
struct Reverse53Predict {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
c += (p + n) / 2; // F.4, page 118, ITU-T Rec. T.800 final draft
|
||||
}
|
||||
};
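// A minimal host-side sketch (hypothetical, illustration only) showing that the
// two functors above undo the forward 5/3 lifting step exactly: the update is
// reverted on the even (lowpass) sample first, then the predict step is
// reverted on the odd (highpass) samples, using the same integer arithmetic.
static void unlift53Example(int x[5]) {
x[2] -= (x[1] + x[3] + 2) / 4; // reverse update (F.3)
x[1] += (x[0] + x[2]) / 2; // reverse predict (F.4)
x[3] += (x[2] + x[4]) / 2; // reverse predict (F.4)
}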
|
||||
|
||||
|
||||
/// Horizontal 5/3 RDWT on specified lines of transform buffer.
|
||||
/// @param lines number of lines to be transformed
|
||||
/// @param firstLine index of the first line to be transformed
|
||||
__device__ void horizontalTransform(const int lines, const int firstLine) {
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update());
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict());
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
/// Using given loader, it loads another WIN_SIZE_Y coefficients
|
||||
/// into specified column.
|
||||
/// @tparam CHECKED true if loader should check image boundaries
|
||||
/// @param input input coefficients to load from
|
||||
/// @param col info about loaded column
|
||||
template <bool CHECKED>
|
||||
inline __device__ void loadWindowIntoColumn(const int * const input,
|
||||
RDWT53Column<CHECKED> & col) {
|
||||
for(int i = 3; i < (3 + WIN_SIZE_Y); i += 2) {
|
||||
buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
|
||||
buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Initializes one column of shared transform buffer with 7 input pixels.
|
||||
/// Those 7 pixels will not be transformed. Also initializes given loader.
|
||||
/// @tparam CHECKED true if loader should check image boundaries
|
||||
/// @param columnX x coordinate of column in shared transform buffer
|
||||
/// @param input input image
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param loader (uninitialized) info about loaded column
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(const int columnX, const int * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
RDWT53Column<CHECKED> & column,
|
||||
const int firstY) {
|
||||
// coordinates of the first coefficient to be loaded
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + columnX;
|
||||
|
||||
// offset of the column with index 'colIndex' in the transform buffer
|
||||
column.offset = buffer.getColumnOffset(columnX);
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 3 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 0 * STRIDE] =
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 1);
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
}
|
||||
// Now, the next coefficient, which will be loaded by loader, is #2.
|
||||
}
|
||||
|
||||
|
||||
/// Actual GPU 5/3 RDWT implementation.
|
||||
/// @tparam CHECKED_LOADS true if boundaries must be checked when reading
|
||||
/// @tparam CHECKED_WRITES true if boundaries must be checked when writing
|
||||
/// @param in input image (5/3 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template<bool CHECKED_LOADS, bool CHECKED_WRITES>
|
||||
__device__ void transform(const int * const in, int * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about one main and one boundary column
|
||||
RDWT53Column<CHECKED_LOADS> column, boundaryColumn;
|
||||
|
||||
// index of first row to be transformed
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
|
||||
// some threads initialize boundary columns
|
||||
boundaryColumn.clear();
|
||||
if(threadIdx.x < 3) {
|
||||
// First 3 threads also handle boundary columns. Thread #0 gets right
// column #0, thread #1 gets right column #1 and thread #2 the left column.
|
||||
const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
|
||||
|
||||
// Thread initializes offset of the boundary column (in shared
|
||||
// buffer), first 3 pixels of the column and a loader for this column.
|
||||
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
|
||||
}
|
||||
|
||||
// All threads initialize central columns.
|
||||
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
|
||||
|
||||
// horizontally transform first 3 rows
|
||||
horizontalTransform(3, 0);
|
||||
|
||||
// writer of output pixels - initialize it
|
||||
const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
|
||||
VerticalDWTPixelWriter<int, CHECKED_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, outX, firstY);
|
||||
|
||||
// offset of column (in transform buffer) saved by this thread
|
||||
const int outputColumnOffset = buffer.getColumnOffset(threadIdx.x);
|
||||
|
||||
// (Each iteration assumes that first 3 rows of transform buffer are
|
||||
// already loaded with horizontally transformed pixels.)
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
// Load another WIN_SIZE_Y lines of this thread's column
|
||||
// into the transform buffer.
|
||||
loadWindowIntoColumn(in, column);
|
||||
|
||||
// possibly load boundary columns
|
||||
if(threadIdx.x < 3) {
|
||||
loadWindowIntoColumn(in, boundaryColumn);
|
||||
}
|
||||
|
||||
// horizontally transform all newly loaded lines
|
||||
horizontalTransform(WIN_SIZE_Y, 3);
|
||||
|
||||
// Using 3 registers, remember current values of last 3 rows
|
||||
// of transform buffer. These rows are transformed horizontally
|
||||
// only and will be used in next iteration.
|
||||
int last3Lines[3];
|
||||
last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
|
||||
last3Lines[1] = buffer[outputColumnOffset + (WIN_SIZE_Y + 1) * STRIDE];
|
||||
last3Lines[2] = buffer[outputColumnOffset + (WIN_SIZE_Y + 2) * STRIDE];
|
||||
|
||||
// vertically transform all central columns
|
||||
buffer.forEachVerticalOdd(outputColumnOffset, Reverse53Update());
|
||||
buffer.forEachVerticalEven(outputColumnOffset, Reverse53Predict());
|
||||
|
||||
// Save all results of current window. Results are in transform buffer
|
||||
// at rows from #1 to #(1 + WIN_SIZE_Y). Other rows are invalid now.
|
||||
// (They only served as a boundary for vertical RDWT.)
|
||||
for(int i = 1; i < (1 + WIN_SIZE_Y); i++) {
|
||||
writer.writeInto(out, buffer[outputColumnOffset + i * STRIDE]);
|
||||
}
|
||||
|
||||
// Use last 3 remembered lines as first 3 lines for next iteration.
|
||||
// As expected, these lines are already horizontally transformed.
|
||||
buffer[outputColumnOffset + 0 * STRIDE] = last3Lines[0];
|
||||
buffer[outputColumnOffset + 1 * STRIDE] = last3Lines[1];
|
||||
buffer[outputColumnOffset + 2 * STRIDE] = last3Lines[2];
|
||||
|
||||
// Wait for all writing threads before proceeding to loading new
|
||||
// coefficients in next iteration. (Not to overwrite those which
|
||||
// are not written yet.)
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Main GPU 5/3 RDWT entry point.
|
||||
/// @param in input image (5/3 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of sliding window steps
|
||||
__device__ static void run(const int * const input, int * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// prepare instance with buffer in shared memory
|
||||
__shared__ RDWT53<WIN_SIZE_X, WIN_SIZE_Y> rdwt53;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (1 in next expressions is for radius of impulse response of 5/3 RDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
|
||||
const bool atRightBoundary = maxX >= sx;
const bool atBottomBoundary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
if(atBottomBoundary) {
|
||||
// near bottom boundary => check both writing and reading
|
||||
rdwt53.transform<true, true>(input, output, sx, sy, steps);
|
||||
} else if(atRightBoundary) {
|
||||
// near right boundary only => check writing only
|
||||
rdwt53.transform<false, true>(input, output, sx, sy, steps);
|
||||
} else {
|
||||
// no nearby boundary => check nothing
|
||||
rdwt53.transform<false, false>(input, output, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
|
||||
}; // end of class RDWT53
|
||||
|
||||
|
||||
|
||||
/// Main GPU 5/3 RDWT entry point.
|
||||
/// @param in input image (5/3 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void rdwt53Kernel(const int * const in, int * const out,
|
||||
const int sx, const int sy, const int steps) {
|
||||
RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then launches the 5/3 RDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
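// (The divisor 15 * WIN_SY is a heuristic: it aims at roughly 15
// threadblocks along the Y axis - fewer for small images - with each
// threadblock sliding its window 'steps' times.)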
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
|
||||
// finally transform this level
|
||||
PERF_BEGIN
|
||||
rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
PERF_END(" RDWT53", sx, sy)
|
||||
CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-128, 127].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
|
||||
if(levels > 1) {
|
||||
// let this function recursively reverse transform deeper levels first
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
rdwt53(in, out, llSizeX, llSizeY, levels - 1);
|
||||
|
||||
// copy reverse transformed LL band from output back into the input
|
||||
memCopy(in, out, llSizeX, llSizeY);
|
||||
}
|
||||
|
||||
// select right width of kernel for the size of the image
|
||||
if(sizeX >= 960) {
|
||||
launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,363 @@
|
|||
///
|
||||
/// @file rdwt97.cu
|
||||
/// @brief CUDA implementation of reverse 9/7 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-03 21:59
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Wraps shared memory buffer and methods for computing 9/7 RDWT using
|
||||
/// lifting schema and sliding window.
|
||||
/// @tparam WIN_SIZE_X width of the sliding window
|
||||
/// @tparam WIN_SIZE_Y height of the sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class RDWT97 {
|
||||
private:
|
||||
|
||||
/// Info related to loading of one input column.
|
||||
/// @tparam CHECKED true if boundary should be checked,
///                 false if there is no nearby boundary
|
||||
template <bool CHECKED>
|
||||
struct RDWT97Column {
|
||||
/// Loader of input pixels for the given column.
|
||||
VerticalDWTBandLoader<float, CHECKED> loader;
|
||||
|
||||
/// Offset of loaded column in shared memory buffer.
|
||||
int offset;
|
||||
|
||||
/// Sets all fields to some values to avoid 'uninitialized' warnings.
|
||||
__device__ void clear() {
|
||||
loader.clear();
|
||||
offset = 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Shared memory buffer used for 9/7 DWT transforms.
|
||||
typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> RDWT97Buffer;
|
||||
|
||||
/// Shared buffer used for reverse 9/7 DWT.
|
||||
RDWT97Buffer buffer;
|
||||
|
||||
/// Difference between indices of two vertical neighbors in buffer.
|
||||
enum { STRIDE = RDWT97Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// Horizontal 9/7 RDWT on specified lines of transform buffer.
|
||||
/// @param lines number of lines to be transformed
|
||||
/// @param firstLine index of the first line to be transformed
|
||||
__device__ void horizontalRDWT97(int lines, int firstLine) {
|
||||
__syncthreads();
|
||||
buffer.scaleHorizontal(scale97Mul, scale97Div, firstLine, lines);
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update2));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97predict2));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update1));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97Predict1));
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
/// Initializes one column of shared transform buffer with 7 input pixels.
|
||||
/// Those 7 pixels will not be transformed. Also initializes given loader.
|
||||
/// @tparam CHECKED true if there are near image boundaries
|
||||
/// @param colIndex index of column in shared transform buffer
|
||||
/// @param input input image
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param column (uninitialized) info about loading one column
|
||||
/// @param firstY index of first image row to be transformed
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(const int colIndex, const float * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
RDWT97Column<CHECKED> & column,
|
||||
const int firstY) {
|
||||
// coordinates of the first coefficient to be loaded
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
|
||||
|
||||
// offset of the column with index 'colIndex' in the transform buffer
|
||||
column.offset = buffer.getColumnOffset(colIndex);
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 7 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 4 * STRIDE] =
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 5 * STRIDE] =
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 6 * STRIDE] =
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 3);
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 4 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 5 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 6 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
}
|
||||
// Now, the next coefficient, which will be loaded by loader, is #4.
|
||||
}
|
||||
|
||||
|
||||
/// Using given loader, it loads another WIN_SIZE_Y coefficients
|
||||
/// into specified column.
|
||||
/// @tparam CHECKED true if there are near image boundaries
|
||||
/// @param col info about loaded column
|
||||
/// @param input buffer with input coefficients
|
||||
template <bool CHECKED>
|
||||
inline __device__ void loadWindowIntoColumn(RDWT97Column<CHECKED> & col,
|
||||
const float * const input) {
|
||||
for(int i = 7; i < (7 + WIN_SIZE_Y); i += 2) {
|
||||
buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
|
||||
buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Actual GPU 9/7 RDWT sliding window lifting schema implementation.
|
||||
/// @tparam CHECKED_LOADS true if loader should check boundaries
|
||||
/// @tparam CHECKED_WRITES true if boundaries should be taken into account
|
||||
/// when writing into output buffer
|
||||
/// @param in input image (9/7 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of steps of sliding window
|
||||
template <bool CHECKED_LOADS, bool CHECKED_WRITES>
|
||||
__device__ void transform(const float * const in, float * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about one main column and one boundary column
|
||||
RDWT97Column<CHECKED_LOADS> column;
|
||||
RDWT97Column<CHECKED_LOADS> boundaryColumn;
|
||||
|
||||
// index of first image row to be transformed
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
|
||||
// initialize boundary columns
|
||||
boundaryColumn.clear();
|
||||
if(threadIdx.x < 7) {
|
||||
// Each of the first 7 threads gets the index of one boundary column.
|
||||
const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
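// (E.g. with WIN_SIZE_X == 192: threads #0..#3 get right boundary columns
// 192..195, while threads #4..#6 get left boundary columns -3..-1.)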
|
||||
|
||||
// Thread initializes offset of the boundary column (in shared
|
||||
// buffer), first 7 pixels of the column and a loader for this column.
|
||||
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
|
||||
}
|
||||
|
||||
// All threads initialize central columns.
|
||||
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
|
||||
|
||||
// horizontally transform first 7 rows
|
||||
horizontalRDWT97(7, 0);
|
||||
|
||||
// writer of output pixels - initialize it
|
||||
const int outputX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
|
||||
VerticalDWTPixelWriter<float, CHECKED_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, outputX, firstY);
|
||||
|
||||
// offset of column (in transform buffer) saved by this thread
|
||||
const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
|
||||
|
||||
// (Each iteration assumes that first 7 rows of transform buffer are
|
||||
// already loaded with horizontally transformed pixels.)
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
// Load another WIN_SIZE_Y lines of this thread's column
|
||||
// into the transform buffer.
|
||||
loadWindowIntoColumn(column, in);
|
||||
|
||||
// possibly load boundary columns
|
||||
if(threadIdx.x < 7) {
|
||||
loadWindowIntoColumn(boundaryColumn, in);
|
||||
}
|
||||
|
||||
// horizontally transform all newly loaded lines
|
||||
horizontalRDWT97(WIN_SIZE_Y, 7);
|
||||
|
||||
// Using 7 registers, remember current values of last 7 rows
|
||||
// of transform buffer. These rows are transformed horizontally
|
||||
// only and will be used in next iteration.
|
||||
float last7Lines[7];
|
||||
for(int i = 0; i < 7; i++) {
|
||||
last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
|
||||
}
|
||||
|
||||
// vertically transform all central columns
|
||||
buffer.scaleVertical(scale97Div, scale97Mul, outColumnOffset,
|
||||
WIN_SIZE_Y + 7, 0);
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update2));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97predict2));
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update1));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97Predict1));
|
||||
|
||||
// Save all results of current window. Results are in transform buffer
|
||||
// at rows from #3 to #(3 + WIN_SIZE_Y). Other rows are invalid now.
|
||||
// (They only served as a boundary for vertical RDWT.)
|
||||
for(int i = 3; i < (3 + WIN_SIZE_Y); i++) {
|
||||
writer.writeInto(out, buffer[outColumnOffset + i * STRIDE]);
|
||||
}
|
||||
|
||||
// Use last 7 remembered lines as first 7 lines for next iteration.
|
||||
// As expected, these lines are already horizontally transformed.
|
||||
for(int i = 0; i < 7; i++) {
|
||||
buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
|
||||
}
|
||||
|
||||
// Wait for all writing threads before proceeding to loading new
|
||||
// coefficients in next iteration. (Not to overwrite those which
|
||||
// are not written yet.)
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Main GPU 9/7 RDWT entry point.
|
||||
/// @param in input image (9/7 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
__device__ static void run(const float * const input, float * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// prepare instance with buffer in shared memory
|
||||
__shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (3 in next expressions is for radius of impulse response of 9/7 RDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
|
||||
const bool atRightBoundary = maxX >= sx;
const bool atBottomBoundary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
if(atBottomBoundary) {
|
||||
// near bottom boundary => check both writing and reading
|
||||
rdwt97.transform<true, true>(input, output, sx, sy, steps);
|
||||
} else if(atRightBoundary) {
|
||||
// near right boundary only => check writing only
|
||||
rdwt97.transform<false, true>(input, output, sx, sy, steps);
|
||||
} else {
|
||||
// no nearby boundary => check nothing
|
||||
rdwt97.transform<false, false>(input, output, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
|
||||
}; // end of class RDWT97
|
||||
|
||||
|
||||
|
||||
/// Main GPU 9/7 RDWT entry point.
|
||||
/// @param in input image (9/7 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void rdwt97Kernel(const float * const in, float * const out,
|
||||
const int sx, const int sy, const int steps) {
|
||||
RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then launches the 9/7 RDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
|
||||
// finally launch kernel
|
||||
PERF_BEGIN
|
||||
rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
PERF_END(" RDWT97", sx, sy)
|
||||
CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-0.5, 0.5].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
|
||||
if(levels > 1) {
|
||||
// let this function recursively reverse transform deeper levels first
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
rdwt97(in, out, llSizeX, llSizeY, levels - 1);
|
||||
|
||||
// copy reverse transformed LL band from output back into the input
|
||||
memCopy(in, out, llSizeX, llSizeY);
|
||||
}
|
||||
|
||||
// select right width of kernel for the size of the image
|
||||
if(sizeX >= 960) {
|
||||
launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchRDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,373 @@
|
|||
|
||||
/// @file transform_buffer.h
|
||||
/// @brief Buffer with separated even and odd columns and related algorithms.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 18:33
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#ifndef TRANSFORM_BUFFER_H
|
||||
#define TRANSFORM_BUFFER_H
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Buffer (in shared memory of GPU) where block of input image is stored,
|
||||
/// but odd and even lines are separated. (Generates less bank conflicts when
|
||||
/// using lifting schema.) All operations expect SIZE_X threads.
|
||||
/// Also implements basic building blocks of lifting schema.
|
||||
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
|
||||
/// a number of threads participating on all operations.)
|
||||
/// Must be divisible by 4.
|
||||
/// @tparam SIZE_Y height of buffer (total number of lines)
|
||||
/// @tparam BOUNDARY_X number of extra pixels at the left and right side
|
||||
/// boundary is expected to be smaller than half SIZE_X
|
||||
/// Must be divisible by 2.
|
||||
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
|
||||
class TransformBuffer {
|
||||
public:
|
||||
enum {
|
||||
/// difference between pointers to two vertical neighbors
|
||||
VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
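// (Each of the two sub-buffers - one for even, one for odd columns - holds
// SIZE_X/2 central columns plus BOUNDARY_X boundary columns per line,
// which is exactly this stride.)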
|
||||
};
|
||||
|
||||
private:
|
||||
enum {
|
||||
/// number of shared memory banks - needed for correct padding
|
||||
#ifdef __CUDA_ARCH__
|
||||
SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
|
||||
#else
|
||||
SHM_BANKS = 16, // for host code only - can be anything, won't be used
|
||||
#endif
|
||||
|
||||
/// size of one of two buffers (odd or even)
|
||||
BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
|
||||
|
||||
/// unused space between two buffers
|
||||
PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
|
||||
|
||||
/// offset of the odd columns buffer from the beginning of data buffer
|
||||
ODD_OFFSET = BUFFER_SIZE + PADDING,
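// (A worked example with hypothetical values: VERTICAL_STRIDE = 100 and
// SIZE_Y = 11 give BUFFER_SIZE = 1100; with 32 banks, PADDING = 32 -
// ((1100 + 16) % 32) = 4 and ODD_OFFSET = 1104, which is 16 modulo 32,
// so the odd buffer always starts half a bank cycle after the even one.)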
|
||||
};
|
||||
|
||||
/// buffer for both even and odd columns
|
||||
T data[2 * BUFFER_SIZE + PADDING];
|
||||
|
||||
|
||||
|
||||
/// Applies specified function to all central elements while also passing
|
||||
/// previous and next elements as parameters.
|
||||
/// @param count count of central elements to apply function to
|
||||
/// @param prevOffset offset of first central element
|
||||
/// @param midOffset offset of first central element's predecessor
|
||||
/// @param nextOffset offset of first central element's successor
|
||||
/// @param function the function itself
|
||||
template <typename FUNC>
|
||||
__device__ void horizontalStep(const int count, const int prevOffset,
|
||||
const int midOffset, const int nextOffset,
|
||||
const FUNC & function) {
|
||||
// number of unchecked iterations
|
||||
const int STEPS = count / SIZE_X;
|
||||
|
||||
// items remaining after last unchecked iteration
|
||||
const int finalCount = count % SIZE_X;
|
||||
|
||||
// offset of items processed in last (checked) iteration
|
||||
const int finalOffset = count - finalCount;
|
||||
|
||||
// all threads perform fixed number of iterations ...
|
||||
for(int i = 0; i < STEPS; i++) {
|
||||
// for(int i = 0; i < 3; i++) {
|
||||
const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
|
||||
const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
|
||||
T & center = data[midOffset + i * SIZE_X + threadIdx.x];
|
||||
// function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
|
||||
function(previous, center, next);// the real one
|
||||
}
|
||||
|
||||
// ... but not all threads participate on final iteration
|
||||
if(threadIdx.x < finalCount) {
|
||||
const T previous = data[prevOffset + finalOffset + threadIdx.x];
|
||||
const T next = data[nextOffset + finalOffset + threadIdx.x];
|
||||
T & center = data[midOffset + finalOffset + threadIdx.x];
|
||||
// function(previous, center, (nextOffset+finalOffset+threadIdx.x));
|
||||
// kaixi
|
||||
function(previous, center, next);//the real one
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
__device__ void getPrintData() {
|
||||
//
|
||||
for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {
|
||||
printf(" index: %d data: %f \n ", i ,data[i]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Gets offset of the column with given index. Central columns have
|
||||
/// indices from 0 to NUM_LINES - 1, left boundary columns have negative
|
||||
/// indices and right boundary columns indices start with NUM_LINES.
|
||||
/// @param columnIndex index of column to get pointer to
|
||||
/// @return offset of the first item of column with specified index
|
||||
__device__ int getColumnOffset(int columnIndex) {
|
||||
columnIndex += BOUNDARY_X; // skip boundary
|
||||
return columnIndex / 2 // select right column
|
||||
+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
|
||||
}
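// (Example, assuming BOUNDARY_X == 2: columns -2, 0, 2, ... map to offsets
// 0, 1, 2, ... of the even buffer, while columns -1, 1, 3, ... map to
// ODD_OFFSET + 0, 1, 2, ... of the odd buffer.)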
|
||||
|
||||
|
||||
/// Provides access to data of the transform buffer.
|
||||
/// @param index index of the item to work with
|
||||
/// @return reference to item at given index
|
||||
__device__ T & operator[] (const int index) {
|
||||
return data[index];
|
||||
}
|
||||
|
||||
|
||||
/// Applies specified function to all horizontally even elements in
|
||||
/// specified lines. (Including even elements in boundaries except
|
||||
/// first even element in first left boundary.) SIZE_X threads participate
|
||||
/// and synchronization is needed before result can be used.
|
||||
/// @param firstLine index of first line
|
||||
/// @param numLines count of lines
|
||||
/// @param func function to be applied on all even elements
|
||||
/// parameters: previous (odd) element, the even
|
||||
/// element itself and finally next (odd) element
|
||||
template <typename FUNC>
|
||||
__device__ void forEachHorizontalEven(const int firstLine,
|
||||
const int numLines,
|
||||
const FUNC & func) {
|
||||
// number of even elements to apply function to
|
||||
const int count = numLines * VERTICAL_STRIDE - 1;
|
||||
// offset of first even element
|
||||
const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
|
||||
// offset of odd predecessor of first even element
|
||||
const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
|
||||
// offset of odd successor of first even element
|
||||
const int nextOffset = prevOffset + 1;
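// (Within one line, even element #k of the even buffer (k >= 1) has its odd
// neighbours at positions #(k-1) and #k of the odd buffer, hence the offsets
// above; element #0 is skipped because it has no left neighbour.)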
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
|
||||
// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
||||
// }
|
||||
|
||||
// call generic horizontal step function
|
||||
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
||||
}
|
||||
|
||||
|
||||
/// Applies given function to all horizontally odd elements in specified
|
||||
/// lines. (Including odd elements in boundaries except last odd element
|
||||
/// in last right boundary.) SIZE_X threads participate and synchronization
|
||||
/// is needed before result can be used.
|
||||
/// @param firstLine index of first line
|
||||
/// @param numLines count of lines
|
||||
/// @param func function to be applied on all odd elements
|
||||
/// parameters: previous (even) element, the odd
|
||||
/// element itself and finally next (even) element
|
||||
template <typename FUNC>
|
||||
__device__ void forEachHorizontalOdd(const int firstLine,
|
||||
const int numLines,
|
||||
const FUNC & func) {
|
||||
// number of odd elements to apply function to
|
||||
const int count = numLines * VERTICAL_STRIDE - 1;
|
||||
// offset of even predecessor of first odd element
|
||||
const int prevOffset = firstLine * VERTICAL_STRIDE;
|
||||
// offset of first odd element
|
||||
const int centerOffset = prevOffset + ODD_OFFSET;
|
||||
// offset of even successor of first odd element
|
||||
const int nextOffset = prevOffset + 1;
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
||||
// }
|
||||
|
||||
|
||||
// call generic horizontal step function
|
||||
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
||||
}
|
||||
|
||||
|
||||
/// Applies specified function to all even elements (except element #0)
|
||||
/// of given column. Each thread takes care of one column, so there's
|
||||
/// no need for synchronization.
|
||||
/// @param columnOffset offset of thread's column
|
||||
/// @param f function to be applied on all even elements
|
||||
/// parameters: previous (odd) element, the even
|
||||
/// element itself and finally next (odd) element
|
||||
template <typename F>
|
||||
__device__ void forEachVerticalEven(const int columnOffset, const F & f) {
|
||||
if(SIZE_Y > 3) { // makes no sense otherwise
|
||||
const int steps = SIZE_Y / 2 - 1;
|
||||
for(int i = 0; i < steps; i++) {
|
||||
const int row = 2 + i * 2;
|
||||
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
||||
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
||||
f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next);
|
||||
|
||||
//--------------- FOR TEST -----------------
|
||||
/* __syncthreads();
|
||||
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
||||
diffOut[2500]++;
|
||||
diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
|
||||
}
|
||||
__syncthreads();
|
||||
*/ //--------------- FOR TEST -----------------
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Applies specified function to all odd elements of given column.
|
||||
/// Each thread takes care of one column, so there's no need for
|
||||
/// synchronization.
|
||||
/// @param columnOffset offset of thread's column
|
||||
/// @param f function to be applied on all odd elements
|
||||
/// parameters: previous (even) element, the odd
|
||||
/// element itself and finally next (even) element
|
||||
template <typename F>
|
||||
__device__ void forEachVerticalOdd(const int columnOffset, const F & f) {
|
||||
const int steps = (SIZE_Y - 1) / 2;
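// (Processes odd rows 1, 3, ...; when SIZE_Y is even, the last odd row is
// left out because it has no lower neighbour in the buffer.)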
|
||||
for(int i = 0; i < steps; i++) {
|
||||
const int row = i * 2 + 1;
|
||||
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
||||
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
||||
|
||||
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
|
||||
|
||||
|
||||
//--------------- FOR TEST -----------------
|
||||
/* __syncthreads();
|
||||
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
||||
diffOut[2500]++;
|
||||
diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
*/ //--------------- FOR TEST -----------------
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Scales elements at specified lines.
|
||||
/// @param evenScale scaling factor for horizontally even elements
|
||||
/// @param oddScale scaling factor for horizontally odd elements
|
||||
/// @param numLines number of lines, whose elements should be scaled
|
||||
/// @param firstLine index of first line to scale elements in
|
||||
__device__ void scaleHorizontal(const T evenScale, const T oddScale,
|
||||
const int firstLine, const int numLines) {
|
||||
const int offset = firstLine * VERTICAL_STRIDE;
|
||||
const int count = numLines * VERTICAL_STRIDE;
|
||||
const int steps = count / SIZE_X;
|
||||
const int finalCount = count % SIZE_X;
|
||||
const int finalOffset = count - finalCount;
|
||||
|
||||
// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
|
||||
|
||||
// run iterations where all threads participate
|
||||
for(int i = 0; i < steps; i++) {
|
||||
data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
|
||||
// if(threadIdx.x + i * SIZE_X + offset == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
// if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
|
||||
}
|
||||
|
||||
// some threads also finish remaining unscaled items
|
||||
if(threadIdx.x < finalCount) {
|
||||
data[threadIdx.x + finalOffset + offset] *= evenScale;
|
||||
// if(threadIdx.x + finalOffset + offset == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
// if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Scales elements in specified column.
|
||||
/// @param evenScale scaling factor for vertically even elements
|
||||
/// @param oddScale scaling factor for vertically odd elements
|
||||
/// @param columnOffset offset of the column to work with
|
||||
/// @param numLines number of lines, whose elements should be scaled
|
||||
/// @param firstLine index of first line to scale elements in
|
||||
__device__ void scaleVertical(const T evenScale, const T oddScale,
|
||||
const int columnOffset, const int numLines,
|
||||
const int firstLine) {
|
||||
for(int i = firstLine; i < (numLines + firstLine); i++) {
|
||||
if(i & 1) {
|
||||
data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
|
||||
} else {
|
||||
data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//****************For Test(Feb23), test inter parameters*************
|
||||
__device__ int getVERTICAL_STRIDE(){
|
||||
return VERTICAL_STRIDE;
|
||||
}
|
||||
__device__ int getSHM_BANKS(){
|
||||
return SHM_BANKS;
|
||||
}
|
||||
__device__ int getBuffersize(){
|
||||
return BUFFER_SIZE;
|
||||
}
|
||||
__device__ int getPADDING(){
|
||||
return PADDING;
|
||||
}
|
||||
__device__ int getODD_OFFSET(){
|
||||
return ODD_OFFSET;
|
||||
}
|
||||
|
||||
|
||||
//****************For Test(Feb23), test inter parameters*************
|
||||
|
||||
|
||||
}; // end of class TransformBuffer
|
||||
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
|
||||
#endif // TRANSFORM_BUFFER_H
|
||||
|
|
@ -0,0 +1,401 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <sys/time.h>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "components.h"
|
||||
#include "dwt.h"
|
||||
|
||||
struct dwt {
|
||||
char * srcFilename;
|
||||
char * outFilename;
|
||||
unsigned char *srcImg;
|
||||
int pixWidth;
|
||||
int pixHeight;
|
||||
int components;
|
||||
int dwtLvls;
|
||||
};
|
||||
|
||||
int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
|
||||
{
|
||||
// printf("Loading ipnput: %s\n", srcFilename);
|
||||
char *path = "../../data/dwt2d/";
|
||||
char *newSrc = NULL;
|
||||
|
||||
if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
|
||||
{
|
||||
newSrc[0] = '\0';
|
||||
strcat(newSrc, path);
|
||||
strcat(newSrc, srcFilename);
|
||||
srcFilename= newSrc;
|
||||
}
|
||||
printf("Loading ipnput: %s\n", srcFilename);
|
||||
|
||||
//srcFilename = strcat("../../data/dwt2d/",srcFilename);
|
||||
//read image
|
||||
int i = open(srcFilename, O_RDONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", srcFilename);
|
||||
return -1;
|
||||
}
|
||||
int ret = read(i, srcImg, inputSize);
|
||||
printf("precteno %d, inputsize %d\n", ret, inputSize);
|
||||
close(i);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void usage() {
|
||||
printf("dwt [otpions] src_img.rgb <out_img.dwt>\n\
|
||||
-d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n\
|
||||
-c, --components\t\tnumber of color components, default 3\n\
|
||||
-b, --depth\t\t\tbit depth, default 8\n\
|
||||
-l, --level\t\t\tDWT level, default 3\n\
|
||||
-D, --device\t\t\tcuda device\n\
|
||||
-f, --forward\t\t\tforward transform\n\
|
||||
-r, --reverse\t\t\treverse transform\n\
|
||||
-9, --97\t\t\t9/7 transform\n\
|
||||
-5, --53\t\t\t5/3 transform\n\
|
||||
-w --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void processDWT(struct dwt *d, int forward, int writeVisual)
|
||||
{
|
||||
int componentSize = d->pixWidth*d->pixHeight*sizeof(T);
|
||||
|
||||
T *c_r_out, *backup ;
|
||||
cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_r_out, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&backup, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(backup, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
if (d->components == 3) {
|
||||
/* Alloc two more buffers for G and B */
|
||||
T *c_g_out, *c_b_out;
|
||||
cudaMalloc((void**)&c_g_out, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_g_out, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_b_out, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
/* Load components */
|
||||
T *c_r, *c_g, *c_b;
|
||||
cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_r, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&c_g, componentSize); //< G, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_g, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&c_b, componentSize); //< B, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_b, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);
|
||||
|
||||
|
||||
/* Compute DWT and always store into file */
|
||||
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
|
||||
// -------test----------
|
||||
// T *h_r_out=(T*)malloc(componentSize);
|
||||
// cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost);
|
||||
// int ii;
|
||||
// for(ii=0;ii<componentSize/sizeof(T);ii++) {
|
||||
// fprintf(stderr, "%d ", h_r_out[ii]);
|
||||
// if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
|
||||
// }
|
||||
// -------test----------
|
||||
|
||||
|
||||
/* Store DWT to file */
|
||||
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
|
||||
// writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
|
||||
// writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
|
||||
#ifdef OUTPUT
|
||||
if (writeVisual) {
|
||||
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
|
||||
writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
|
||||
writeNStage2DDWT(c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
|
||||
} else {
|
||||
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
|
||||
writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
|
||||
writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
cudaFree(c_r);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_g);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_b);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_g_out);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_b_out);
|
||||
cudaCheckError("Cuda free");
|
||||
|
||||
}
|
||||
else if (d->components == 1) {
|
||||
//Load component
|
||||
T *c_r;
|
||||
cudaMalloc((void**)&(c_r), componentSize); //< R, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_r, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);
|
||||
|
||||
// Compute DWT
|
||||
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
|
||||
// Store DWT to file
|
||||
// #ifdef OUTPUT
|
||||
if (writeVisual) {
|
||||
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
|
||||
} else {
|
||||
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".lin.out");
|
||||
}
|
||||
// #endif
|
||||
cudaFree(c_r);
|
||||
cudaCheckError("Cuda free");
|
||||
}
|
||||
|
||||
cudaFree(c_r_out);
|
||||
cudaCheckError("Cuda free device");
|
||||
cudaFree(backup);
|
||||
cudaCheckError("Cuda free device");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int optindex = 0;
|
||||
char ch;
|
||||
struct option longopts[] = {
|
||||
{"dimension", required_argument, 0, 'd'}, //dimensions of src img
|
||||
{"components", required_argument, 0, 'c'}, //numger of components of src img
|
||||
{"depth", required_argument, 0, 'b'}, //bit depth of src img
|
||||
{"level", required_argument, 0, 'l'}, //level of dwt
|
||||
{"device", required_argument, 0, 'D'}, //cuda device
|
||||
{"forward", no_argument, 0, 'f'}, //forward transform
|
||||
{"reverse", no_argument, 0, 'r'}, //reverse transform
|
||||
{"97", no_argument, 0, '9'}, //9/7 transform
|
||||
{"53", no_argument, 0, '5' }, //5/3transform
|
||||
{"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
|
||||
{"help", no_argument, 0, 'h'}
|
||||
};
|
||||
|
||||
int pixWidth = 0; //<real pixWidth
|
||||
int pixHeight = 0; //<real pixHeight
|
||||
int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
|
||||
int bitDepth = 8;
|
||||
int dwtLvls = 3; //default number of DWT levels
|
||||
int device = 0;
|
||||
int forward = 1; //forward transform
|
||||
int dwt97 = 1; //1=dwt9/7, 0=dwt5/3 transform
|
||||
int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
|
||||
char * pos;
|
||||
|
||||
while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) {
|
||||
switch (ch) {
|
||||
case 'd':
|
||||
pixWidth = atoi(optarg);
|
||||
pos = strstr(optarg, "x");
|
||||
if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) {
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
pixHeight = atoi(pos+1);
|
||||
break;
|
||||
case 'c':
|
||||
compCount = atoi(optarg);
|
||||
break;
|
||||
case 'b':
|
||||
bitDepth = atoi(optarg);
|
||||
break;
|
||||
case 'l':
|
||||
dwtLvls = atoi(optarg);
|
||||
break;
|
||||
case 'D':
|
||||
device = atoi(optarg);
|
||||
break;
|
||||
case 'f':
|
||||
forward = 1;
|
||||
break;
|
||||
case 'r':
|
||||
forward = 0;
|
||||
break;
|
||||
case '9':
|
||||
dwt97 = 1;
|
||||
break;
|
||||
case '5':
|
||||
dwt97 = 0;
|
||||
break;
|
||||
case 'w':
|
||||
writeVisual = 1;
|
||||
break;
|
||||
case 'h':
|
||||
usage();
|
||||
return 0;
|
||||
case '?':
|
||||
return -1;
|
||||
default :
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
argc -= optind;
|
||||
argv += optind;
|
||||
|
||||
if (argc == 0) { // at least one filename is expected
|
||||
printf("Please supply src file name\n");
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pixWidth <= 0 || pixHeight <=0) {
|
||||
printf("Wrong or missing dimensions\n");
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (forward == 0) {
|
||||
writeVisual = 0; //do not write visual when RDWT
|
||||
}
|
||||
|
||||
// device init
|
||||
int devCount;
|
||||
cudaSetDevice(0);
|
||||
cudaGetDeviceCount(&devCount);
|
||||
cudaCheckError("Get device count");
|
||||
if (devCount == 0) {
|
||||
printf("No CUDA enabled device\n");
|
||||
return -1;
|
||||
}
|
||||
if (device < 0 || device > devCount -1) {
|
||||
printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
|
||||
device, 0, devCount -1);
|
||||
return -1;
|
||||
}
|
||||
cudaDeviceProp devProp;
|
||||
cudaGetDeviceProperties(&devProp, device);
|
||||
cudaCheckError("Get device properties");
|
||||
// if (devProp.major < 1) {
|
||||
// printf("Device %d does not support CUDA\n", device);
|
||||
// return -1;
|
||||
// }
|
||||
printf("Using device %d: %s\n", device, devProp.name);
|
||||
cudaSetDevice(device);
|
||||
cudaCheckError("Set selected device");
|
||||
|
||||
struct dwt *d;
|
||||
d = (struct dwt *)malloc(sizeof(struct dwt));
|
||||
d->srcImg = NULL;
|
||||
d->pixWidth = pixWidth;
|
||||
d->pixHeight = pixHeight;
|
||||
d->components = compCount;
|
||||
d->dwtLvls = dwtLvls;
|
||||
|
||||
// file names
|
||||
d->srcFilename = (char *)malloc(strlen(argv[0]));
|
||||
strcpy(d->srcFilename, argv[0]);
|
||||
if (argc == 1) { // only one filename supplied
|
||||
d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
|
||||
strcpy(d->outFilename, d->srcFilename);
|
||||
strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
|
||||
} else {
|
||||
d->outFilename = strdup(argv[1]);
|
||||
}
|
||||
|
||||
//Input review
|
||||
printf("Source file:\t\t%s\n", d->srcFilename);
|
||||
printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
|
||||
printf(" Components count:\t%d\n", compCount);
|
||||
printf(" Bit depth:\t\t%d\n", bitDepth);
|
||||
printf(" DWT levels:\t\t%d\n", dwtLvls);
|
||||
printf(" Forward transform:\t%d\n", forward);
|
||||
printf(" 9/7 transform:\t\t%d\n", dwt97);
|
||||
|
||||
//data sizes
|
||||
int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to process
|
||||
|
||||
//load img source image
|
||||
cudaMallocHost((void **)&d->srcImg, inputSize);
|
||||
cudaCheckError("Alloc host memory");
|
||||
if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
|
||||
return -1;
|
||||
|
||||
/* DWT */
|
||||
if (forward == 1) {
|
||||
if(dwt97 == 1 )
|
||||
processDWT<float>(d, forward, writeVisual);
|
||||
else // 5/3
|
||||
processDWT<int>(d, forward, writeVisual);
|
||||
}
|
||||
else { // reverse
|
||||
if(dwt97 == 1 )
|
||||
processDWT<float>(d, forward, writeVisual);
|
||||
else // 5/3
|
||||
processDWT<int>(d, forward, writeVisual);
|
||||
}
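// (Note: both branches above perform the same calls; the transform direction
// itself is carried by the 'forward' flag, and the inner dwt97 test selects
// float coefficients for 9/7 and int coefficients for 5/3.)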
|
||||
|
||||
//writeComponent(r_cuda, pixWidth, pixHeight, srcFilename, ".g");
|
||||
//writeComponent(g_wave_cuda, 512000, ".g");
|
||||
//writeComponent(g_cuda, componentSize, ".g");
|
||||
//writeComponent(b_wave_cuda, componentSize, ".b");
|
||||
cudaFreeHost(d->srcImg);
|
||||
cudaCheckError("Cuda free host");
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
./dwt2d 4.bmp z.dwt -d 4x4 -f -5 -l 3
|
||||
# ./dwt2d 8.bmp -d 8x8 -f -5 -l 3
|
||||
# ./dwt2d 16.bmp -d 16x16 -f -5 -l 3
|
||||
# ./dwt2d 64.bmp -d 64x64 -f -5 -l 3
|
||||
|
||||
# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
|
@ -0,0 +1,8 @@
|
|||
# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
||||
# ./dwt2d 16.bmp -d 16x16 -f -9 -l 3\
|
||||
./dwt2d 4.bmp -d 4x4 -r -5 -l 3
|
||||
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
|
||||
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 4.bmp -d 4x4 -f -9 -l 3
|
||||
./nvcc_dwt2d 4.bmp -d 4x4 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 8.bmp -d 8x8 -f -9 -l 3
|
||||
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 16.bmp -d 16x16 -r -5 -l 3
|
||||
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -9 -l 3
|
||||
# ./nvcc_dwt2d 4.bmp -d 4x4 -r -9 -l 3
|
||||
# ./nvcc_dwt2d 64.bmp -d 64x64 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
|
@ -0,0 +1,51 @@
|
|||
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -I. -I/include -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
|
||||
|
||||
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
|
||||
|
||||
../../build/compilation/kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_50.bc common.bc
|
||||
../../build/compilation/kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_50.bc components.bc
|
||||
../../build/compilation/kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt53.bc
|
||||
|
||||
../../build/compilation/kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_50.bc dwt.bc
|
||||
|
||||
../../build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||
../../build/compilation/hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc
|
||||
../../build/compilation/hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc
|
||||
../../build/compilation/hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc
|
||||
../../build/compilation/hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc
|
||||
|
||||
../../build/compilation/hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc
|
||||
../../build/compilation/hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc
|
||||
../../build/compilation/hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc
|
||||
../../build/compilation/kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt97.bc
|
||||
../../build/compilation/kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt97.bc
|
||||
../../build/compilation/kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt53.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj common.bc
|
||||
llc --relocation-model=pic --filetype=obj components.bc
|
||||
llc --relocation-model=pic --filetype=obj fdwt53.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj dwt.bc
|
||||
|
||||
|
||||
llc --relocation-model=pic --filetype=obj host.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj common_host.bc
|
||||
llc --relocation-model=pic --filetype=obj components_host.bc
|
||||
llc --relocation-model=pic --filetype=obj fdwt53_host.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj dwt_host.bc
|
||||
|
||||
|
||||
llc --relocation-model=pic --filetype=obj fdwt97_host.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt97_host.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt53_host.bc
|
||||
llc --relocation-model=pic --filetype=obj fdwt97.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt97.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt53.bc
|
||||
|
||||
g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o dwt2d -fPIC -no-pie common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lc -lx86Runtime -lthreadPool -lpthread
|
|
@ -0,0 +1,18 @@
|
|||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c main.cu -o main.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt.cu -o dwt.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c components.cu -o components.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt53.cu -o dwt_cuda/fdwt53.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt97.cu -o dwt_cuda/fdwt97.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/common.cu -o dwt_cuda/common.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
|
||||
g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -39,6 +39,12 @@ cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
|||
return cudaErrorMemoryAllocation;
|
||||
return cudaSuccess;
|
||||
}
|
||||
cudaError_t cudaMallocHost(void **devPtr, size_t size) {
|
||||
*devPtr = malloc(size);
|
||||
if (*devPtr == NULL) // check the pointer returned by malloc, not the argument
|
||||
return cudaErrorMemoryAllocation;
|
||||
return cudaSuccess;
|
||||
}
|
||||
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
|
||||
memset(devPtr, value, count);
|
||||
return cudaSuccess;
|
||||
|
@ -58,7 +64,7 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
|||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyDeviceToDevice) {
|
||||
|
||||
memcpy(dst, dst, count);
|
||||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyDefault) {
|
||||
memcpy(dst, src, count);
|
||||
}
|
||||
|
|