From 1add53fa861d7623e5c10d5a9d38ef2191a3f479 Mon Sep 17 00:00:00 2001 From: jchen706 Date: Sat, 21 May 2022 15:55:49 -0400 Subject: [PATCH 1/3] add dwt2d example and fixes --- compilation/KernelTranslation/lib/init.cpp | 6 +- .../lib/insert_warp_loop.cpp | 6 + .../lib/memory_hierarchy.cpp | 18 + examples/dwt2d/common.h | 62 +++ examples/dwt2d/components.cu | 193 +++++++ examples/dwt2d/components.h | 38 ++ examples/dwt2d/dwt.cu | 385 ++++++++++++++ examples/dwt2d/dwt.h | 40 ++ examples/dwt2d/dwt_cuda/common.cu | 35 ++ examples/dwt2d/dwt_cuda/common.h | 261 ++++++++++ examples/dwt2d/dwt_cuda/dwt.h | 112 ++++ examples/dwt2d/dwt_cuda/fdwt53.cu | 400 +++++++++++++++ examples/dwt2d/dwt_cuda/fdwt97.cu | 383 ++++++++++++++ examples/dwt2d/dwt_cuda/io.h | 483 ++++++++++++++++++ examples/dwt2d/dwt_cuda/rdwt53.cu | 360 +++++++++++++ examples/dwt2d/dwt_cuda/rdwt97.cu | 363 +++++++++++++ examples/dwt2d/dwt_cuda/transform_buffer.h | 373 ++++++++++++++ examples/dwt2d/main.cu | 401 +++++++++++++++ examples/dwt2d/run.sh | 8 + examples/dwt2d/run_cpu.sh | 8 + examples/dwt2d/run_nvcc.sh | 14 + examples/dwt2d/test_compile_cpu.sh | 51 ++ examples/dwt2d/test_compile_nvcc.sh | 18 + runtime/lib/cudaRuntimeImpl.cpp | 8 +- 24 files changed, 4024 insertions(+), 2 deletions(-) create mode 100755 examples/dwt2d/common.h create mode 100755 examples/dwt2d/components.cu create mode 100755 examples/dwt2d/components.h create mode 100755 examples/dwt2d/dwt.cu create mode 100755 examples/dwt2d/dwt.h create mode 100755 examples/dwt2d/dwt_cuda/common.cu create mode 100755 examples/dwt2d/dwt_cuda/common.h create mode 100755 examples/dwt2d/dwt_cuda/dwt.h create mode 100755 examples/dwt2d/dwt_cuda/fdwt53.cu create mode 100755 examples/dwt2d/dwt_cuda/fdwt97.cu create mode 100755 examples/dwt2d/dwt_cuda/io.h create mode 100755 examples/dwt2d/dwt_cuda/rdwt53.cu create mode 100755 examples/dwt2d/dwt_cuda/rdwt97.cu create mode 100755 examples/dwt2d/dwt_cuda/transform_buffer.h create mode 100755 examples/dwt2d/main.cu create mode 100755 examples/dwt2d/run.sh create mode 100755 examples/dwt2d/run_cpu.sh create mode 100644 examples/dwt2d/run_nvcc.sh create mode 100644 examples/dwt2d/test_compile_cpu.sh create mode 100755 examples/dwt2d/test_compile_nvcc.sh diff --git a/compilation/KernelTranslation/lib/init.cpp b/compilation/KernelTranslation/lib/init.cpp index 2c31392..4cd5984 100644 --- a/compilation/KernelTranslation/lib/init.cpp +++ b/compilation/KernelTranslation/lib/init.cpp @@ -395,5 +395,9 @@ void init_block(llvm::Module *M, std::ofstream &fout) { // replace asm Inline replace_asm_call(M); // replace dynamic shared memory - replace_dynamic_shared_memory(M); + auto dynamic_shared_memory_addr = + M->getGlobalVariable("dynamic_shared_memory"); + if (dynamic_shared_memory_addr) { + replace_dynamic_shared_memory(M); + } } diff --git a/compilation/KernelTranslation/lib/insert_warp_loop.cpp b/compilation/KernelTranslation/lib/insert_warp_loop.cpp index 7855778..28a5a61 100644 --- a/compilation/KernelTranslation/lib/insert_warp_loop.cpp +++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp @@ -270,11 +270,17 @@ void AddContextSaveRestore(llvm::Instruction *instruction, AddContextSave(instruction, alloca, intra_warp_loop); std::vector uses; + Function *f2 = instruction->getParent()->getParent(); + for (Instruction::use_iterator ui = instruction->use_begin(), ue = instruction->use_end(); ui != ue; ++ui) { llvm::Instruction *user = cast(ui->getUser()); + Function *f1 = user->getParent()->getParent(); + if(f2->getName() != f1->getName()) { + continue; + } if (user == NULL) continue; if (user == theStore) diff --git a/compilation/KernelTranslation/lib/memory_hierarchy.cpp b/compilation/KernelTranslation/lib/memory_hierarchy.cpp index 96eacd4..b05bffd 100644 --- a/compilation/KernelTranslation/lib/memory_hierarchy.cpp +++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp @@ -86,6 +86,24 @@ void mem_share2global(llvm::Module *M) { corresponding_global_memory.insert( std::pair(share_memory, global_memory)); + } else if (element_type->isStructTy()) { + auto undef = llvm::UndefValue::get(element_type); + llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( + *M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef, + new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, + false); + global_memory->setDSOLocal(true); + Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName())); + comdat->setSelectionKind(Comdat::SelectionKind::Any); + global_memory->setComdat(comdat); + global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage); + global_memory->setInitializer(undef); + global_memory->setAlignment(share_memory->getAlignment()); + corresponding_global_memory.insert( + std::pair(share_memory, + global_memory)); + + } else { assert(0 && "The required Share Memory Type is not supported\n"); } diff --git a/examples/dwt2d/common.h b/examples/dwt2d/common.h new file mode 100755 index 0000000..f649804 --- /dev/null +++ b/examples/dwt2d/common.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2009, Jiri Matela + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _COMMON_H +#define _COMMON_H + +//24-bit multiplication is faster on G80, +//but we must be sure to multiply integers +//only within [-8M, 8M - 1] range +#define IMUL(a, b) __mul24(a, b) + +////cuda timing macros +//#define CTIMERINIT cudaEvent_t cstart, cstop; \ +// cudaEventCreate(&cstart); \ +// cudaEventCreate(&cstop); \ +// float elapsedTime +//#define CTIMERSTART(cstart) cudaEventRecord(cstart,0) +//#define CTIMERSTOP(cstop) cudaEventRecord(cstop,0); \ +// cudaEventSynchronize(cstop); \ +// cudaEventElapsedTime(&elapsedTime, cstart, cstop) + +//divide and round up macro +#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b))) + +# define cudaCheckError( msg ) { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "%s: %i: %s: %s.\n", \ + __FILE__, __LINE__, msg, cudaGetErrorString( err) ); \ + exit(-1); \ + } } + +# define cudaCheckAsyncError( msg ) { \ + cudaThreadSynchronize(); \ + cudaCheckError( msg ); \ + } + + +#endif diff --git a/examples/dwt2d/components.cu b/examples/dwt2d/components.cu new file mode 100755 index 0000000..b9721ce --- /dev/null +++ b/examples/dwt2d/components.cu @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2009, Jiri Matela + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "components.h" +#include "common.h" + +#define THREADS 256 + +/* Store 3 RGB float components */ +__device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos) +{ + d_r[pos] = (r/255.0f) - 0.5f; + d_g[pos] = (g/255.0f) - 0.5f; + d_b[pos] = (b/255.0f) - 0.5f; +} + +/* Store 3 RGB intege components */ +__device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos) +{ + d_r[pos] = r - 128; + d_g[pos] = g - 128; + d_b[pos] = b - 128; +} + +/* Store float component */ +__device__ void storeComponent(float *d_c, float c, int pos) +{ + d_c[pos] = (c/255.0f) - 0.5f; +} + +/* Store integer component */ +__device__ void storeComponent(int *d_c, int c, int pos) +{ + d_c[pos] = c - 128; +} + +/* Copy img src data into three separated component buffers */ +template +__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b, + unsigned char * d_src, + int pixels) +{ + int x = threadIdx.x; + int gX = blockDim.x*blockIdx.x; + + __shared__ unsigned char sData[THREADS*3]; + + /* Copy data to shared mem by 4bytes + other checks are not necessary, since + d_src buffer is aligned to sharedDataSize */ + if ( (x*4) < THREADS*3 ) { + float *s = (float *)d_src; + float *d = (float *)sData; + d[x] = s[((gX*3)>>2) + x]; + } + __syncthreads(); + + T r, g, b; + + int offset = x*3; + r = (T)(sData[offset]); + g = (T)(sData[offset+1]); + b = (T)(sData[offset+2]); + + int globalOutputPosition = gX + x; + if (globalOutputPosition < pixels) { + storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition); + } +} + +/* Copy img src data into three separated component buffers */ +template +__global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels) +{ + int x = threadIdx.x; + int gX = blockDim.x*blockIdx.x; + + __shared__ unsigned char sData[THREADS]; + + /* Copy data to shared mem by 4bytes + other checks are not necessary, since + d_src buffer is aligned to sharedDataSize */ + if ( (x*4) < THREADS) { + float *s = (float *)d_src; + float *d = (float *)sData; + d[x] = s[(gX>>2) + x]; + } + __syncthreads(); + + T c; + + c = (T)(sData[x]); + + int globalOutputPosition = gX + x; + if (globalOutputPosition < pixels) { + storeComponent(d_c, c, globalOutputPosition); + } +} + + +/* Separate compoents of 8bit RGB source image */ +template +void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height) +{ + unsigned char * d_src; + int pixels = width*height; + int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS + + /* Alloc d_src buffer */ + cudaMalloc((void **)&d_src, alignedSize); + cudaCheckAsyncError("Cuda malloc") + cudaMemset(d_src, 0, alignedSize); + + /* Copy data to device */ + cudaMemcpy(d_src, src, pixels*3, cudaMemcpyHostToDevice); + cudaCheckError("Copy data to device") + + /* Kernel */ + dim3 threads(THREADS); + dim3 grid(alignedSize/(THREADS*3)); + assert(alignedSize%(THREADS*3) == 0); + c_CopySrcToComponents<<>>(d_r, d_g, d_b, d_src, pixels); + cudaCheckAsyncError("CopySrcToComponents kernel") + + /* Free Memory */ + cudaFree(d_src); + cudaCheckAsyncError("Free memory") +} +template void rgbToComponents(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height); +template void rgbToComponents(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height); + + +/* Copy a 8bit source image data into a color compoment of type T */ +template +void bwToComponent(T *d_c, unsigned char * src, int width, int height) +{ + unsigned char * d_src; + int pixels = width*height; + int alignedSize = DIVANDRND(pixels, THREADS) * THREADS; //aligned to thread block size -- THREADS + + /* Alloc d_src buffer */ + cudaMalloc((void **)&d_src, alignedSize); + cudaCheckAsyncError("Cuda malloc") + cudaMemset(d_src, 0, alignedSize); + + /* Copy data to device */ + cudaMemcpy(d_src, src, pixels, cudaMemcpyHostToDevice); + cudaCheckError("Copy data to device") + + /* Kernel */ + dim3 threads(THREADS); + dim3 grid(alignedSize/(THREADS)); + assert(alignedSize%(THREADS) == 0); + c_CopySrcToComponent<<>>(d_c, d_src, pixels); + cudaCheckAsyncError("CopySrcToComponent kernel") + + /* Free Memory */ + cudaFree(d_src); + cudaCheckAsyncError("Free memory") +} + +template void bwToComponent(float *d_c, unsigned char *src, int width, int height); +template void bwToComponent(int *d_c, unsigned char *src, int width, int height); diff --git a/examples/dwt2d/components.h b/examples/dwt2d/components.h new file mode 100755 index 0000000..98a2b12 --- /dev/null +++ b/examples/dwt2d/components.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2009, Jiri Matela + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _COMPONENTS_H +#define _COMPONENTS_H + +/* Separate compoents of source 8bit RGB image */ +template +void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height); + +/* Copy a 8bit source image data into a color compoment of type T */ +template +void bwToComponent(T *d_c, unsigned char * src, int width, int height); + +#endif diff --git a/examples/dwt2d/dwt.cu b/examples/dwt2d/dwt.cu new file mode 100755 index 0000000..f06f2d9 --- /dev/null +++ b/examples/dwt2d/dwt.cu @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2009, Jiri Matela + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "dwt_cuda/dwt.h" +#include "dwt_cuda/common.h" +#include "dwt.h" +#include "common.h" +#include +#include + +inline void fdwt(float *in, float *out, int width, int height, int levels) +{ + printf(" Running fdwt97 Float \n"); + dwt_cuda::fdwt97(in, out, width, height, levels); +} +/* +inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut) +{ + dwt_cuda::fdwt97(in, out, width, height, levels, diffOut); +} +*/ + + + +inline void fdwt(int *in, int *out, int width, int height, int levels) +{ + printf(" Running fdwt53 Int \n"); + + dwt_cuda::fdwt53(in, out, width, height, levels); +} +/* +inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut) +{ + dwt_cuda::fdwt53(in, out, width, height, levels, diffOut); +} +*/ + + + +inline void rdwt(float *in, float *out, int width, int height, int levels) +{ + printf(" Running rdwt97 Float \n"); + + dwt_cuda::rdwt97(in, out, width, height, levels); +} + +inline void rdwt(int *in, int *out, int width, int height, int levels) +{ + printf(" Running rdwt53 Int \n"); + + dwt_cuda::rdwt53(in, out, width, height, levels); +} + +template +int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward) +{ + printf("\n*** %d stages of 2D forward DWT:\n", stages); + + /* create backup of input, because each test iteration overwrites it */ + const int size = pixHeight * pixWidth * sizeof(T); + cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); + cudaCheckError("Memcopy device to device"); + + /* Measure time of individual levels. */ + if(forward) + fdwt(in, out, pixWidth, pixHeight, stages); + else + rdwt(in, out, pixWidth, pixHeight, stages); + + // Measure overall time of DWT. +/* #ifdef GPU_DWT_TESTING_1 + + dwt_cuda::CudaDWTTester tester; + for(int i = tester.getNumIterations(); i--; ) { + // Recover input and measure one overall DWT run. + cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); + cudaCheckError("Memcopy device to device"); + tester.beginTestIteration(); + if(forward) + fdwt(in, out, pixWidth, pixHeight, stages); + else + rdwt(in, out, pixWidth, pixHeight, stages); + tester.endTestIteration(); + } + tester.showPerformance(" Overall DWT", pixWidth, pixHeight); + #endif // GPU_DWT_TESTING + + cudaCheckAsyncError("DWT Kernel calls"); +*/ return 0; +} +template int nStage2dDWT(float*, float*, float*, int, int, int, bool); +template int nStage2dDWT(int*, int*, int*, int, int, int, bool); + + + +/* +template +int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut) +{ + printf("*** %d stages of 2D forward DWT:\n", stages); + + // create backup of input, because each test iteration overwrites it + const int size = pixHeight * pixWidth * sizeof(T); + cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); + cudaCheckError("Memcopy device to device"); + + // Measure time of individual levels. + if(forward) + fdwt(in, out, pixWidth, pixHeight, stages, diffOut); + else + rdwt(in, out, pixWidth, pixHeight, stages); + + // Measure overall time of DWT. + #ifdef GPU_DWT_TESTING_1 + + dwt_cuda::CudaDWTTester tester; + for(int i = tester.getNumIterations(); i--; ) { + // Recover input and measure one overall DWT run. + cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); + cudaCheckError("Memcopy device to device"); + tester.beginTestIteration(); + if(forward) + fdwt(in, out, pixWidth, pixHeight, stages, diffOut); + else + rdwt(in, out, pixWidth, pixHeight, stages); + tester.endTestIteration(); + } + tester.showPerformance(" Overall DWT", pixWidth, pixHeight); + #endif // GPU_DWT_TESTING + + cudaCheckAsyncError("DWT Kernel calls"); + return 0; +} +template int nStage2dDWT(float*, float*, float*, int, int, int, bool, float*); +template int nStage2dDWT(int*, int*, int*, int, int, int, bool, int*); + +*/ + +void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename) +{ + int i; + std::ofstream outputFile; + char outfile[strlen(filename)+strlen(".txt")]; + strcpy(outfile, filename); + strcpy(outfile+strlen(filename), ".txt"); + outputFile.open(outfile); + + + for(i = 0; i < samplesNum; i++) { + float r = (src[i]+0.5f) * 255; + if (r > 255) r = 255; + if (r < 0) r = 0; + dst[i] = (unsigned char)r; + outputFile << "index: " << i << " val: "<< r <<" \n"; + + + } + outputFile.close(); +} + +void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename) +{ + int i; + std::ofstream outputFile; + char outfile[strlen(filename)+strlen(".txt")]; + strcpy(outfile, filename); + strcpy(outfile+strlen(filename), ".txt"); + outputFile.open(outfile); + for(i = 0; i < samplesNum; i++) { + int r = src[i]+128; + if (r > 255) r = 255; + if (r < 0) r = 0; + dst[i] = (unsigned char)r; + // added this line to output check + outputFile << "index: " << i << " val: "<< r <<" \n"; + } + outputFile.close(); +} + +///* Write output linear orderd*/ +template +int writeLinear(T *component_cuda, int pixWidth, int pixHeight, + const char * filename, const char * suffix) +{ + unsigned char * result; + T *gpu_output; + int i; + int size; + int samplesNum = pixWidth*pixHeight; + + size = samplesNum*sizeof(T); + cudaMallocHost((void **)&gpu_output, size); + cudaCheckError("Malloc host"); + memset(gpu_output, 0, size); + result = (unsigned char *)malloc(samplesNum); + cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost); + cudaCheckError("Memcopy device to host"); + + /* T to char */ + samplesToChar(result, gpu_output, samplesNum, filename); + + /* Write component */ + char outfile[strlen(filename)+strlen(suffix)]; + strcpy(outfile, filename); + strcpy(outfile+strlen(filename), suffix); + i = open(outfile, O_CREAT|O_WRONLY, 0644); + if (i == -1) { + error(0,errno,"cannot access %s", outfile); + return -1; + } + printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight); + ssize_t x ; + x = write(i, result, samplesNum); + close(i); + + /* Clean up */ + cudaFreeHost(gpu_output); + cudaCheckError("Cuda free host memory"); + free(result); + if(x == 0) return 1; + return 0; +} +template int writeLinear(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); +template int writeLinear(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); + +/* Write output visual ordered */ +template +int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, + int stages, const char * filename, const char * suffix) +{ + struct band { + int dimX; + int dimY; + }; + struct dimensions { + struct band LL; + struct band HL; + struct band LH; + struct band HH; + }; + + unsigned char * result; + T *src, *dst; + int i,s; + int size; + int offset; + int yOffset; + int samplesNum = pixWidth*pixHeight; + struct dimensions * bandDims; + + bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions)); + + bandDims[0].LL.dimX = DIVANDRND(pixWidth,2); + bandDims[0].LL.dimY = DIVANDRND(pixHeight,2); + bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX; + bandDims[0].HL.dimY = bandDims[0].LL.dimY; + bandDims[0].LH.dimX = bandDims[0].LL.dimX; + bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY; + bandDims[0].HH.dimX = bandDims[0].HL.dimX; + bandDims[0].HH.dimY = bandDims[0].LH.dimY; + + for (i = 1; i < stages; i++) { + bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2); + bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2); + bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX; + bandDims[i].HL.dimY = bandDims[i].LL.dimY; + bandDims[i].LH.dimX = bandDims[i].LL.dimX; + bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY; + bandDims[i].HH.dimX = bandDims[i].HL.dimX; + bandDims[i].HH.dimY = bandDims[i].LH.dimY; + } + +#if 0 + printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight); + for (i = 0; i < stages; i++) { + printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY); + printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY); + printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY); + printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY); + } +#endif + + size = samplesNum*sizeof(T); + cudaMallocHost((void **)&src, size); + cudaCheckError("Malloc host"); + dst = (T*)malloc(size); + memset(src, 0, size); + memset(dst, 0, size); + result = (unsigned char *)malloc(samplesNum); + cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost); + cudaCheckError("Memcopy device to host"); + + // LL Band + size = bandDims[stages-1].LL.dimX * sizeof(T); + for (i = 0; i < bandDims[stages-1].LL.dimY; i++) { + memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size); + } + + for (s = stages - 1; s >= 0; s--) { + // HL Band + size = bandDims[s].HL.dimX * sizeof(T); + offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY; + for (i = 0; i < bandDims[s].HL.dimY; i++) { + memcpy(dst+i*pixWidth+bandDims[s].LL.dimX, + src+offset+i*bandDims[s].HL.dimX, + size); + } + + // LH band + size = bandDims[s].LH.dimX * sizeof(T); + offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY; + yOffset = bandDims[s].LL.dimY; + for (i = 0; i < bandDims[s].HL.dimY; i++) { + memcpy(dst+(yOffset+i)*pixWidth, + src+offset+i*bandDims[s].LH.dimX, + size); + } + + //HH band + size = bandDims[s].HH.dimX * sizeof(T); + offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY; + yOffset = bandDims[s].HL.dimY; + for (i = 0; i < bandDims[s].HH.dimY; i++) { + memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX, + src+offset+i*bandDims[s].HH.dimX, + size); + } + } + + /* Write component */ + samplesToChar(result, dst, samplesNum, filename); + + char outfile[strlen(filename)+strlen(suffix)]; + strcpy(outfile, filename); + strcpy(outfile+strlen(filename), suffix); + i = open(outfile, O_CREAT|O_WRONLY, 0644); + if (i == -1) { + error(0,errno,"cannot access %s", outfile); + return -1; + } + printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight); + ssize_t x; + x = write(i, result, samplesNum); + close(i); + + cudaFreeHost(src); + cudaCheckError("Cuda free host memory"); + free(dst); + free(result); + free(bandDims); + if (x == 0) return 1; + return 0; +} +template int writeNStage2DDWT(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); +template int writeNStage2DDWT(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); diff --git a/examples/dwt2d/dwt.h b/examples/dwt2d/dwt.h new file mode 100755 index 0000000..fcb4b9a --- /dev/null +++ b/examples/dwt2d/dwt.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2009, Jiri Matela + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _DWT_H +#define _DWT_H + +template +int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward); + +template +int writeNStage2DDWT(T *component_cuda, int width, int height, + int stages, const char * filename, const char * suffix); +template +int writeLinear(T *component_cuda, int width, int height, + const char * filename, const char * suffix); + +#endif diff --git a/examples/dwt2d/dwt_cuda/common.cu b/examples/dwt2d/dwt_cuda/common.cu new file mode 100755 index 0000000..8ce4984 --- /dev/null +++ b/examples/dwt2d/dwt_cuda/common.cu @@ -0,0 +1,35 @@ +/// +/// @file common.cu +/// @author Martin Jirman (207962@mail.muni.cz) +/// @date 2011-01-20 14:37 +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + +#include "common.h" + +namespace dwt_cuda { + bool CudaDWTTester::testRunning = false; +} diff --git a/examples/dwt2d/dwt_cuda/common.h b/examples/dwt2d/dwt_cuda/common.h new file mode 100755 index 0000000..37c1979 --- /dev/null +++ b/examples/dwt2d/dwt_cuda/common.h @@ -0,0 +1,261 @@ +/// +/// @file common.h +/// @author Martin Jirman (207962@mail.muni.cz) +/// @brief Common stuff for all CUDA dwt functions. +/// @date 2011-01-20 14:19 +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + + +#ifndef DWT_COMMON_H +#define DWT_COMMON_H + + +#include +#include +#include + + + +// compile time minimum macro +#define CTMIN(a,b) (((a) < (b)) ? (a) : (b)) + + + +// performance testing macros +#if defined(GPU_DWT_TESTING) + #define PERF_BEGIN \ + { \ + dwt_cuda::CudaDWTTester PERF_TESTER; \ + for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \ + { \ + PERF_TESTER.beginTestIteration(); + + #define PERF_END(PERF_NAME, PERF_W, PERF_H) \ + PERF_TESTER.endTestIteration(); \ + } \ + PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \ + } +#else // GPU_DWT_TESTING + #define PERF_BEGIN + #define PERF_END(PERF_NAME, PERF_W, PERF_H) +#endif // GPU_DWT_TESTING + + + +namespace dwt_cuda { + + + /// Divide and round up. + template + __device__ __host__ inline T divRndUp(const T & n, const T & d) { + return (n / d) + ((n % d) ? 1 : 0); + } + + + // 9/7 forward DWT lifting schema coefficients + const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1 + const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1 + const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2 + const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2 + + + // 9/7 reverse DWT lifting schema coefficients + const float r97update2 = -f97Update2; ///< undo 9/7 update 2 + const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2 + const float r97update1 = -f97Update1; ///< undo 9/7 update 1 + const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1 + + // FDWT 9/7 scaling coefficients + const float scale97Mul = 1.23017410491400f; + const float scale97Div = 1.0 / scale97Mul; + + + // 5/3 forward DWT lifting schema coefficients + const float forward53Predict = -0.5f; /// forward 5/3 predict + const float forward53Update = 0.25f; /// forward 5/3 update + + // 5/3 forward DWT lifting schema coefficients + const float reverse53Update = -forward53Update; /// undo 5/3 update + const float reverse53Predict = -forward53Predict; /// undo 5/3 predict + + + + /// Functor which adds scaled sum of neighbors to given central pixel. + struct AddScaledSum { + const float scale; // scale of neighbors + __device__ AddScaledSum(const float scale) : scale(scale) {} + __device__ void operator()(const float p, float & c, const float n) const { + + // if(threadIdx.x == 0) { + + // printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) ); + + // } + + c += scale * (p + n); + } + }; + + + + /// Returns index ranging from 0 to num threads, such that first half + /// of threads get even indices and others get odd indices. Each thread + /// gets different index. + /// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7 + /// parityIdx: 0 2 4 6 1 3 5 7 + /// @tparam THREADS total count of participating threads + /// @return parity-separated index of thread + template + __device__ inline int parityIdx() { + return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); + } + + + + /// size of shared memory + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int SHM_SIZE = 48 * 1024; + #else + const int SHM_SIZE = 16 * 1024; + #endif + + + + /// Perrformance and return code tester. + class CudaDWTTester { + private: + static bool testRunning; ///< true if any test is currently running + cudaEvent_t beginEvent; ///< begin CUDA event + cudaEvent_t endEvent; ///< end CUDA event + std::vector times; ///< collected times + const bool disabled; ///< true if this object is disabled + public: + /// Checks CUDA related error. + /// @param status return code to be checked + /// @param message message to be shown if there was an error + /// @return true if there was no error, false otherwise + static bool check(const cudaError_t & status, const char * message) { + #if defined(GPU_DWT_TESTING) + if((!testRunning) && status != cudaSuccess) { + const char * errorString = cudaGetErrorString(status); + fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString); + fflush(stderr); + return false; + } + #endif // GPU_DWT_TESTING + return true; + } + + /// Checks last kernel call for errors. + /// @param message description of the kernel call + /// @return true if there was no error, false otherwise + static bool checkLastKernelCall(const char * message) { + #if defined(GPU_DWT_TESTING) + return testRunning ? true : check(cudaThreadSynchronize(), message); + #else // GPU_DWT_TESTING + return true; + #endif // GPU_DWT_TESTING + } + + /// Initializes DWT tester for time measurement + CudaDWTTester() : disabled(testRunning) {} + + /// Gets rpefered number of iterations + int getNumIterations() { + return disabled ? 1 : 31; + } + + /// Starts one test iteration. + void beginTestIteration() { + if(!disabled) { + cudaEventCreate(&beginEvent); + cudaEventCreate(&endEvent); + cudaEventRecord(beginEvent, 0); + testRunning = true; + } + } + + /// Ends on etest iteration. + void endTestIteration() { + if(!disabled) { + float time; + testRunning = false; + cudaEventRecord(endEvent, 0); + cudaEventSynchronize(endEvent); + cudaEventElapsedTime(&time, beginEvent, endEvent); + cudaEventDestroy(beginEvent); + cudaEventDestroy(endEvent); + times.push_back(time); + } + } + + /// Shows brief info about all iterations. + /// @param name name of processing method + /// @param sizeX width of processed image + /// @param sizeY height of processed image + void showPerformance(const char * name, const int sizeX, const int sizeY) { + if(!disabled) { + // compute mean and median + std::sort(times.begin(), times.end()); + double sum = 0; + for(int i = times.size(); i--; ) { + sum += times[i]; + } + const double median = (times[times.size() / 2] + + times[(times.size() - 1) / 2]) * 0.5f; + printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) " + "(%d x %d)\n", name, (sum / times.size()), median, + times[times.size() - 1], sizeX, sizeY); + } + } + }; + + + + /// Simple cudaMemcpy wrapped in performance tester. + /// @param dest destination bufer + /// @param src source buffer + /// @param sx width of copied image + /// @param sy height of copied image + template + inline void memCopy(T * const dest, const T * const src, + const size_t sx, const size_t sy) { + cudaError_t status; + PERF_BEGIN + status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice); + PERF_END(" memcpy", sx, sy) + CudaDWTTester::check(status, "memcpy device > device"); + } + + + +} // end of namespace dwt_cuda + + + +#endif // DWT_COMMON_CUDA_H + diff --git a/examples/dwt2d/dwt_cuda/dwt.h b/examples/dwt2d/dwt_cuda/dwt.h new file mode 100755 index 0000000..2c76708 --- /dev/null +++ b/examples/dwt2d/dwt_cuda/dwt.h @@ -0,0 +1,112 @@ +/// +/// @file dwt.h +/// @author Martin Jirman (207962@mail.muni.cz) +/// @brief Entry points for CUDA implementaion of 9/7 and 5/3 DWT. +/// @date 2011-01-20 11:41 +/// +/// +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// +/// +/// +/// Following conditions are common for all four DWT functions: +/// - Both input and output images are stored in GPU memory with no padding +/// of lines or interleaving of pixels. +/// - DWT coefficients are stored as follows: Each band is saved as one +/// consecutive chunk (no padding/stride/interleaving). Deepest level bands +/// (smallest ones) are stored first (at the beginning of the input/output +/// buffers), less deep bands follow. There is no padding between stored +/// bands in the buffer. Order of bands of the same level in the buffer is +/// following: Low-low band (or deeper level subbands) is stored first. +/// Vertical-low/horizontal-high band follows. Vertical-high/horizonal-low +/// band is saved next and finally, the high-high band is saved. Out of all +/// low-low bands, only th edeepest one is saved (right at the beginning of +/// the buffer), others are replaced with deeper level subbands. +/// - Input images of all functions won't be preserved (will be overwritten). +/// - Input and output buffers can't overlap. +/// - Size of output buffer must be greater or equal to size of input buffer. +/// +/// There are no common compile time settings (buffer size, etc...) for +/// all DWTs, because each DTW type needs different amount of GPU resources. +/// Instead, each DWT type has its own compile time settings, which can be +/// found in *.cu file, where it is implemented. +/// + +#ifndef DWT_CUDA_H +#define DWT_CUDA_H + + +namespace dwt_cuda { + + + /// Forward 5/3 2D DWT. See common rules (above) for more details. + /// @param in Expected to be normalized into range [-128, 127]. + /// Will not be preserved (will be overwritten). + /// @param out output buffer on GPU + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels); + + + /// Reverse 5/3 2D DWT. See common rules (above) for more details. + /// @param in Input DWT coefficients. Format described in common rules. + /// Will not be preserved (will be overwritten). + /// @param out output buffer on GPU - will contain original image + /// in normalized range [-128, 127]. + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels); + + + /// Forward 9/7 2D DWT. See common rules (above) for more details. + /// @param in Input DWT coefficients. Should be normalized (in range + /// [-0.5, 0.5]). Will not be preserved (will be overwritten). + /// @param out output buffer on GPU - format specified in common rules + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels); + + + /// Reverse 9/7 2D DWT. See common rules (above) for more details. + /// @param in Input DWT coefficients. Format described in common rules. + /// Will not be preserved (will be overwritten). + /// @param out output buffer on GPU - will contain original image + /// in normalized range [-0.5, 0.5]. + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels); + + +} // namespace dwt_cuda + + + +#endif // DWT_CUDA_H + diff --git a/examples/dwt2d/dwt_cuda/fdwt53.cu b/examples/dwt2d/dwt_cuda/fdwt53.cu new file mode 100755 index 0000000..588acf4 --- /dev/null +++ b/examples/dwt2d/dwt_cuda/fdwt53.cu @@ -0,0 +1,400 @@ +/// @file fdwt53.cu +/// @brief CUDA implementation of forward 5/3 2D DWT. +/// @author Martin Jirman (207962@mail.muni.cz) +/// @date 2011-02-04 13:23 +/// +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + + +#include "common.h" +#include "transform_buffer.h" +#include "io.h" + +namespace dwt_cuda { + + + /// Wraps buffer and methods needed for computing one level of 5/3 FDWT + /// using sliding window approach. + /// @tparam WIN_SIZE_X width of sliding window + /// @tparam WIN_SIZE_Y height of sliding window + template + class FDWT53 { + private: + + /// Info needed for processing of one input column. + /// @tparam CHECKED_LOADER true if column's loader should check boundaries + /// false if there are no near boudnaries to check + template + struct FDWT53Column { + /// loader for the column + VerticalDWTPixelLoader loader; + + /// offset of the column in shared buffer + int offset; + + // backup of first 3 loaded pixels (not transformed) + int pixel0, pixel1, pixel2; + + /// Sets all fields to anything to prevent 'uninitialized' warnings. + __device__ void clear() { + offset = pixel0 = pixel1 = pixel2 = 0; + loader.clear(); + } + }; + + + /// Type of shared memory buffer for 5/3 FDWT transforms. + typedef TransformBuffer FDWT53Buffer; + + /// Actual shared buffer used for forward 5/3 DWT. + FDWT53Buffer buffer; + + /// Difference between indices of two vertical neighbors in buffer. + enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE }; + + + /// Forward 5/3 DWT predict operation. + struct Forward53Predict { + __device__ void operator() (const int p, int & c, const int n) const { + // c = n; + c -= (p + n) / 2; // F.8, page 126, ITU-T Rec. T.800 final draft the real one + } + }; + + + /// Forward 5/3 DWT update operation. + struct Forward53Update { + __device__ void operator() (const int p, int & c, const int n) const { + c += (p + n + 2) / 4; // F.9, page 126, ITU-T Rec. T.800 final draft + } + }; + + + /// Initializes one column: computes offset of the column in shared memory + /// buffer, initializes loader and finally uses it to load first 3 pixels. + /// @tparam CHECKED true if loader of the column checks boundaries + /// @param column (uninitialized) column info to be initialized + /// @param input input image + /// @param sizeX width of the input image + /// @param sizeY height of the input image + /// @param colIndex x-axis coordinate of the column (relative to the left + /// side of this threadblock's block of input pixels) + /// @param firstY y-axis coordinate of first image row to be transformed + + template + __device__ void initColumn(FDWT53Column & column, + const int * const input, + const int sizeX, const int sizeY, + const int colIndex, const int firstY) { + // get offset of the column with index 'cId' + column.offset = buffer.getColumnOffset(colIndex); + + // coordinates of the first pixel to be loaded + const int firstX = blockIdx.x * WIN_SIZE_X + colIndex; + + if(blockIdx.y == 0) { + // topmost block - apply mirroring rules when loading first 3 rows + column.loader.init(sizeX, sizeY, firstX, firstY); + + // load pixels in mirrored way + column.pixel2 = column.loader.loadFrom(input); // loaded pixel #0 + column.pixel1 = column.loader.loadFrom(input); // loaded pixel #1 + column.pixel0 = column.loader.loadFrom(input); // loaded pixel #2 + + // reinitialize loader to start with pixel #1 again + column.loader.init(sizeX, sizeY, firstX, firstY + 1); + } else { + // non-topmost row - regular loading: + column.loader.init(sizeX, sizeY, firstX, firstY - 2); + + // load 3 rows into the column + column.pixel0 = column.loader.loadFrom(input); + column.pixel1 = column.loader.loadFrom(input); + column.pixel2 = column.loader.loadFrom(input); + // Now, the next pixel, which will be loaded by loader, is pixel #1. + } + + } + + + /// Loads and vertically transforms given column. Assumes that first 3 + /// pixels are already loaded in column fields pixel0 ... pixel2. + /// @tparam CHECKED true if loader of the column checks boundaries + /// @param column column to be loaded and vertically transformed + /// @param input pointer to input image data + template + __device__ void loadAndVerticallyTransform(FDWT53Column & column, + const int * const input) { + // take 3 loaded pixels and put them into shared memory transform buffer + buffer[column.offset + 0 * STRIDE] = column.pixel0; + buffer[column.offset + 1 * STRIDE] = column.pixel1; + buffer[column.offset + 2 * STRIDE] = column.pixel2; + + // load remaining pixels to be able to vertically transform the window + + for(int i = 3; i < (3 + WIN_SIZE_Y); i++) + { + buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input); + } + + // remember last 3 pixels for use in next iteration + column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE]; + column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE]; + column.pixel2 = buffer[column.offset + (WIN_SIZE_Y + 2) * STRIDE]; + + // vertically transform the column in transform buffer + buffer.forEachVerticalOdd(column.offset, Forward53Predict()); + buffer.forEachVerticalEven(column.offset, Forward53Update()); + + } + + + /// Actual implementation of 5/3 FDWT. + /// @tparam CHECK_LOADS true if input loader must check boundaries + /// @tparam CHECK_WRITES true if output writer must check boundaries + /// @param in input image + /// @param out output buffer + /// @param sizeX width of the input image + /// @param sizeY height of the input image + /// @param winSteps number of sliding window steps + template + __device__ void transform(const int * const in, int * const out, + const int sizeX, const int sizeY, + const int winSteps) { + // info about one main and one boundary columns processed by this thread + FDWT53Column column; + FDWT53Column boundaryColumn; // only few threads use this + + // Initialize all column info: initialize loaders, compute offset of + // column in shared buffer and initialize loader of column. + const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps; + initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); //has been checked Mar 9th + + + // first 3 threads initialize boundary columns, others do not use them + boundaryColumn.clear(); + if(threadIdx.x < 3) { + // index of boundary column (relative x-axis coordinate of the column) + const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3); + + // initialize the column + initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY); + + } + + + // index of column which will be written into output by this thread + const int outColumnIndex = parityIdx(); + + // offset of column which will be written by this thread into output + const int outColumnOffset = buffer.getColumnOffset(outColumnIndex); + + // initialize output writer for this thread + const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex; + VerticalDWTBandWriter writer; + writer.init(sizeX, sizeY, outputFirstX, firstY); + __syncthreads(); + + + // Sliding window iterations: + // Each iteration assumes that first 3 pixels of each column are loaded. + for(int w = 0; w < winSteps; w++) { + + // For each column (including boundary columns): load and vertically + // transform another WIN_SIZE_Y lines. + loadAndVerticallyTransform(column, in); + if(threadIdx.x < 3) { + loadAndVerticallyTransform(boundaryColumn, in); + } + + // wait for all columns to be vertically transformed and transform all + // output rows horizontally + __syncthreads(); + + + buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict()); + __syncthreads(); + + buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update()); + + // wait for all output rows to be transformed horizontally and write + // them into output buffer + __syncthreads(); + + + for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) { + // Write low coefficients from output column into low band ... + writer.writeLowInto(out, buffer[outColumnOffset + r * STRIDE]); + // ... and high coeficients into the high band. + writer.writeHighInto(out, buffer[outColumnOffset + (r+1) * STRIDE]); + } + + // before proceeding to next iteration, wait for all output columns + // to be written into the output + __syncthreads(); + + } + + } + + + public: + /// Determines, whether this block's pixels touch boundary and selects + /// right version of algorithm according to it - for many threadblocks, it + /// selects version which does not deal with boundary mirroring and thus is + /// slightly faster. + /// @param in input image + /// @param out output buffer + /// @param sx width of the input image + /// @param sy height of the input image + /// @param steps number of sliding window steps + __device__ static void run(const int * const in, int * const out, + const int sx, const int sy, const int steps) { + // if(blockIdx.x==0 && blockIdx.y ==11 && threadIdx.x >=0&&threadIdx.x <64){ + // object with transform buffer in shared memory + __shared__ FDWT53 fdwt53; + + // Compute limits of this threadblock's block of pixels and use them to + // determine, whether this threadblock will have to deal with boundary. + // (1 in next expressions is for radius of impulse response of 9/7 FDWT.) + const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1; + const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1; + const bool atRightBoudary = maxX >= sx; + const bool atBottomBoudary = maxY >= sy; + + // Select specialized version of code according to distance of this + // threadblock's pixels from image boundary. + + // if(threadIdx.x == 0) { + // printf("fdwt53 run"); + // } + if(atBottomBoudary) + { + // near bottom boundary => check both writing and reading + fdwt53.transform(in, out, sx, sy, steps); + } else if(atRightBoudary) + { + // near right boundary only => check writing only + fdwt53.transform(in, out, sx, sy, steps); + } else + { + // no nearby boundary => check nothing + fdwt53.transform(in, out, sx, sy, steps); + } + } + // } + + }; // end of class FDWT53 + + + + /// Main GPU 5/3 FDWT entry point. + /// @tparam WIN_SX width of sliding window to be used + /// @tparam WIN_SY height of sliding window to be used + /// @param input input image + /// @param output output buffer + /// @param sizeX width of the input image + /// @param sizeY height of the input image + /// @param winSteps number of sliding window steps + template + __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53), 8)) + __global__ void fdwt53Kernel(const int * const input, int * const output, + const int sizeX, const int sizeY, + const int winSteps) { + FDWT53::run(input, output, sizeX, sizeY, winSteps); + } + + + + /// Only computes optimal number of sliding window steps, + /// number of threadblocks and then lanches the 5/3 FDWT kernel. + /// @tparam WIN_SX width of sliding window + /// @tparam WIN_SY height of sliding window + /// @param in input image + /// @param out output buffer + /// @param sx width of the input image + /// @param sy height of the input image + template + void launchFDWT53Kernel (int * in, int * out, int sx, int sy) { + // compute optimal number of steps of each sliding window + + const int steps = divRndUp(sy, 15 * WIN_SY); + + int gx = divRndUp(sx, WIN_SX); + int gy = divRndUp(sy, WIN_SY * steps); + + printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy); + + // prepare grid size + dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); + // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX); + + // run kernel, possibly measure time and finally check the call + // PERF_BEGIN + fdwt53Kernel<<>>(in, out, sx, sy, steps); + // PERF_END(" FDWT53", sx, sy) + // CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel"); + printf("fdwt53Kernel in launchFDWT53Kernel has finished"); + + } + + + + /// Forward 5/3 2D DWT. See common rules (above) for more details. + /// @param in Expected to be normalized into range [-128, 127]. + /// Will not be preserved (will be overwritten). + /// @param out output buffer on GPU + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) { + // select right width of kernel for the size of the image + + if(sizeX >= 960) { + launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY); + } else if (sizeX >= 480) { + launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY); + } else { + launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY); + } + + // if this was not the last level, continue recursively with other levels + if(levels > 1) { + // copy output's LL band back into input buffer + const int llSizeX = divRndUp(sizeX, 2); + const int llSizeY = divRndUp(sizeY, 2); + // printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY); + memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238 + + // run remaining levels of FDWT + fdwt53(in, out, llSizeX, llSizeY, levels - 1); + } + } + + + +} // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/fdwt97.cu b/examples/dwt2d/dwt_cuda/fdwt97.cu new file mode 100755 index 0000000..d61f674 --- /dev/null +++ b/examples/dwt2d/dwt_cuda/fdwt97.cu @@ -0,0 +1,383 @@ +/// +/// @file fdwt97.cu +/// @brief CUDA implementation of forward 9/7 2D DWT. +/// @author Martin Jirman (207962@mail.muni.cz) +/// @date 2011-01-20 13:18 +/// +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + + +#include "common.h" +#include "transform_buffer.h" +#include "io.h" + + +namespace dwt_cuda { + + + + /// Wraps a buffer and methods for computing 9/7 FDWT with sliding window + /// of specified size. Template arguments specify this size. + /// @tparam WIN_SIZE_X width of sliding window + /// @tparam WIN_SIZE_Y height of sliding window + template + class FDWT97 { + private: + /// Type of shared memory buffer used for 9/7 DWT. + typedef TransformBuffer FDWT97Buffer; + + /// Actual shared buffer used for forward 9/7 DWT. + FDWT97Buffer buffer; + + /// Difference of indices of two vertically neighboring items in buffer. + enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE }; + + + /// One thread's info about loading input image + /// @tparam CHECKED true if loader should check for image boundaries + template + struct FDWT97ColumnLoadingInfo { + /// Loader of pixels from some input image. + VerticalDWTPixelLoader loader; + + /// Offset of column loaded by loader. (Offset in shared buffer.) + int offset; + }; + + + /// Horizontal 9/7 FDWT on specified lines of transform buffer. + /// @param lines number of lines to be transformed + /// @param firstLine index of the first line to be transformed + __device__ void horizontalFDWT97(const int lines, const int firstLine) { + __syncthreads(); + + buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1)); + __syncthreads(); + buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1)); + __syncthreads(); + buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2)); + __syncthreads(); + buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2)); + __syncthreads(); + + buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines); + + __syncthreads(); + + } + + + /// Initializes one column of shared transform buffer with 7 input pixels. + /// Those 7 pixels will not be transformed. Also initializes given loader. + /// @tparam CHECKED true if loader should check for image boundaries + /// @param column (uninitialized) object for loading input pixels + /// @param columnIndex index (not offset!) of the column to be loaded + /// (relative to threadblock's first column) + /// @param input pointer to input image in GPU memory + /// @param sizeX width of the input image + /// @param sizeY height of the input image + /// @param firstY index of first row to be loaded from image + template + __device__ void initColumn(FDWT97ColumnLoadingInfo & column, + const int columnIndex, const float * const input, + const int sizeX, const int sizeY, + const int firstY) { + // get offset of the column with index 'columnIndex' + column.offset = buffer.getColumnOffset(columnIndex); + + // printf(" offset: %d , threadIdx: %d, blockIdx.y %d\n ", column.offset, threadIdx.x, blockIdx.y); + + // x-coordinate of the first pixel to be loaded by given loader + const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex; + + if(blockIdx.y == 0) { + // topmost block - apply mirroring rules when loading first 7 rows + column.loader.init(sizeX, sizeY, firstX, firstY); + + // load pixels in mirrored way + buffer[column.offset + 4 * STRIDE] = column.loader.loadFrom(input); + buffer[column.offset + 3 * STRIDE] = + buffer[column.offset + 5 * STRIDE] = column.loader.loadFrom(input); + buffer[column.offset + 2 * STRIDE] = + buffer[column.offset + 6 * STRIDE] = column.loader.loadFrom(input); + buffer[column.offset + 1 * STRIDE] = column.loader.loadFrom(input); + buffer[column.offset + 0 * STRIDE] = column.loader.loadFrom(input); + + // reinitialize loader to start with pixel #3 again + column.loader.init(sizeX, sizeY, firstX, firstY + 3); + + } else { + // non-topmost row - regular loading: + column.loader.init(sizeX, sizeY, firstX, firstY - 4); + + // load 7 rows into the transform buffer + for(int i = 0; i < 7; i++) { + buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input); + + } + } + // Now, the next pixel, which will be loaded by loader, is pixel #3. + } + + + /// Loads another WIN_SIZE_Y pixels into given column using given loader. + /// @tparam CHECKED true if loader should check for image boundaries + /// @param input input image to load from + /// @param column loader and offset of loaded column in shared buffer + template + inline __device__ void loadWindowIntoColumn(const float * const input, + FDWT97ColumnLoadingInfo & column) { + for(int i = 7; i < (7 + WIN_SIZE_Y); i++) { + buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input); + } + } + + + /// Main GPU 9/7 FDWT entry point. + /// @tparam CHECK_LOADS true if boundaries should be checked when loading + /// @tparam CHECK_WRITES true if boundaries should be checked when writing + /// @param in input image + /// @param out output buffer + /// @param sizeX width of the input image + /// @param sizeY height of the input image + /// @param winSteps number of steps of sliding window + template + __device__ void transform(const float * const in, float * const out, + const int sizeX, const int sizeY, + const int winSteps) { + // info about columns loaded by this thread: one main column and possibly + // one boundary column. (Only some threads load some boundary column.) + FDWT97ColumnLoadingInfo loadedColumn; + FDWT97ColumnLoadingInfo boundaryColumn; + + // Initialize first 7 lines of transform buffer. + const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps; + initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY); + + // Some threads initialize boundary columns. + boundaryColumn.offset = 0; + boundaryColumn.loader.clear(); + if(threadIdx.x < 7) { + // each thread among first 7 ones gets index of one of boundary columns + const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7); + + // Thread initializes offset of the boundary column (in shared buffer), + // first 7 pixels of the column and a loader for this column. + initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY); + } + + // horizontally transform first 7 rows in all columns + horizontalFDWT97(7, 0); + + // Index of column handled by this thread. (First half of threads handle + // even columns and others handle odd columns.) + const int outColumnIndex = parityIdx(); + + // writer of output linear bands - initialize it + const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex; + VerticalDWTBandWriter writer; + writer.init(sizeX, sizeY, firstX, firstY); + + // transform buffer offset of column transformed and saved by this thread + const int outColumnOffset = buffer.getColumnOffset(outColumnIndex); + + // (Each iteration of this loop assumes that first 7 rows of transform + // buffer are already loaded with horizontally transformed coefficients.) + for(int w = 0; w < winSteps; w++) { + // Load another WIN_SIZE_Y lines of thread's column into the buffer. + loadWindowIntoColumn(in, loadedColumn); + + // some threads also load boundary columns + if(threadIdx.x < 7) { + loadWindowIntoColumn(in, boundaryColumn); + } + + // horizontally transform all newly loaded lines + horizontalFDWT97(WIN_SIZE_Y, 7); + + // Using 7 registers, remember current values of last 7 rows of + // transform buffer. These rows are transformed horizontally only + // and will be used in next iteration. + float last7Lines[7]; + for(int i = 0; i < 7; i++) { + last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE]; + } + + // vertically transform all central columns (do not scale yet) + buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1)); + buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1)); + buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2)); + buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2)); + + // Save all results of current window. Results are in transform buffer + // at rows from #4 to #(4 + WIN_SIZE_Y). Other rows are invalid now. + // (They only served as a boundary for vertical FDWT.) + + for(int i = 4; i < (4 + WIN_SIZE_Y); i += 2) { + const int index = outColumnOffset + i * STRIDE; + // Write low coefficients from column into low band ... + writer.writeLowInto(out, buffer[index] * scale97Div); + // ... and high coeficients into the high band. + writer.writeHighInto(out, buffer[index + STRIDE] * scale97Mul); + } + + // Use last 7 remembered lines as first 7 lines for next iteration. + // As expected, these lines are already horizontally transformed. + for(int i = 0; i < 7; i++) { + buffer[outColumnOffset + i * STRIDE] = last7Lines[i]; + + } + + // Wait for all writing threads before proceeding to loading new + // pixels in next iteration. (Not to overwrite those which + // are not written yet.) + __syncthreads(); + } + + } + + + public: + /// Runs one of specialized variants of 9/7 FDWT according to distance of + /// processed pixels to image boudnary. Some variants do not check for + /// boudnary and thus are slightly faster. + /// @param in input image + /// @param out output buffer + /// @param sx width of the input image + /// @param sy height of the input image + /// @param steps number of steps of sliding window + __device__ static void run(const float * const input, float * const output, + const int sx, const int sy, const int steps) { + // object with transform buffer in shared memory + __shared__ FDWT97 fdwt97; + + // Compute limits of this threadblock's block of pixels and use them to + // determine, whether this threadblock will have to deal with boundary. + // (3 in next expressions is for radius of impulse response of 9/7 FDWT.) + const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3; + const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3; + const bool atRightBoudary = maxX >= sx; + const bool atBottomBoudary = maxY >= sy; + + // Select specialized version of code according to distance of this + // threadblock's pixels from image boundary. + if(atBottomBoudary) { + // near bottom boundary => check both writing and reading + // printf("\n atBottomBoudary \n "); + fdwt97.transform(input, output, sx, sy, steps); + } else if(atRightBoudary) { + + // near right boundary only => check writing only + fdwt97.transform(input, output, sx, sy, steps); + } else { + + // no nearby boundary => check nothing + fdwt97.transform(input, output, sx, sy, steps); + } + } + + }; // end of class FDWT97 + + + + /// Main GPU 9/7 FDWT entry point. + /// @param input input image + /// @parma output output buffer + /// @param sx width of the input image + /// @param sy height of the input image + /// @param steps number of steps of sliding window + template + __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97), 8)) + __global__ void fdwt97Kernel(const float * const input, float * const output, + const int sx, const int sy, const int steps) { + // Excuse me, dear reader of this code - this call have to be here. If you + // try to simply put contents of following method right here, CUDA compiler + // (version 3.2) will spit tons of nonsense messy errors ... + // Hope they will not break it even more in future releases. + FDWT97::run(input, output, sx, sy, steps); + } + + + + /// Only computes optimal number of sliding window steps, + /// number of threadblocks and then lanches the 9/7 FDWT kernel. + /// @tparam WIN_SX width of sliding window + /// @tparam WIN_SY height of sliding window + /// @param in input image + /// @param out output buffer + /// @param sx width of the input image + /// @param sy height of the input image + template + void launchFDWT97Kernel (float * in, float * out, int sx, int sy) { + // compute optimal number of steps of each sliding window + const int steps = divRndUp(sy, 15 * WIN_SY); + + // prepare grid size + dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); + printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX); + + // run kernel, possibly measure time and finally check the call + PERF_BEGIN + fdwt97Kernel<<>>(in, out, sx, sy, steps); + PERF_END(" FDWT97", sx, sy) + CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel"); + } + + + + /// Forward 9/7 2D DWT. See common rules (dwt.h) for more details. + /// @param in Input DWT coefficients. Should be normalized (in range + /// [-0.5, 0.5]). Will not be preserved (will be overwritten). + /// @param out output buffer on GPU - format specified in common rules + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) { + // select right width of kernel for the size of the image + if(sizeX >= 960) { + launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY); + } else if (sizeX >= 480) { + launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY); + } else { + launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY); + } + + // if this was not the last level, continue recursively with other levels + if(levels > 1) { + // copy output's LL band back into input buffer + const int llSizeX = divRndUp(sizeX, 2); + const int llSizeY = divRndUp(sizeY, 2); + memCopy(in, out, llSizeX, llSizeY); + + // run remaining levels of FDWT + fdwt97(in, out, llSizeX, llSizeY, levels - 1); + } + } + + + +} // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/io.h b/examples/dwt2d/dwt_cuda/io.h new file mode 100755 index 0000000..741def0 --- /dev/null +++ b/examples/dwt2d/dwt_cuda/io.h @@ -0,0 +1,483 @@ +/// +/// @file: io.h +/// @brief Manages loading and saving lineary stored bands and input images. +/// @author Martin Jirman (207962@mail.muni.cz) +/// @date 2011-01-20 22:38 +/// +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + + +#ifndef IO_H +#define IO_H + + +#include "common.h" + +namespace dwt_cuda { + + + /// Base for all IO classes - manages mirroring. + class DWTIO { + protected: + /// Handles mirroring of image at edges in a DWT correct way. + /// @param d a position in the image (will be replaced by mirrored d) + /// @param sizeD size of the image along the dimension of 'd' + __device__ static void mirror(int & d, const int & sizeD) { + // TODO: enable multiple mirroring: +// if(sizeD > 1) { +// if(d < 0) { +// const int underflow = -1 - d; +// const int phase = (underflow / (sizeD - 1)) & 1; +// const int remainder = underflow % (sizeD - 1); +// if(phase == 0) { +// d = remainder + 1; +// } else { +// d = sizeD - 2 - remainder; +// } +// } else if(d >= sizeD) { +// const int overflow = d - sizeD; +// const int phase = (overflow / (sizeD - 1)) & 1; +// const int remainder = overflow % (sizeD - 1); +// if(phase == 0) { +// d = sizeD - 2 - remainder; +// } else { +// d = remainder + 1; +// } +// } +// } else { +// d = 0; +// } + //for test the mirror's use Feb 17 + if(d >= sizeD) { + d = 2 * sizeD - 2 - d; + } else if(d < 0) { + d = -d; + } + } + }; + + + /// Base class for pixel loader and writer - manages computing start index, + /// stride and end of image for loading column of pixels. + /// @tparam T type of image pixels + /// @tparam CHECKED true = be prepared to image boundary, false = don't care + template + class VerticalDWTPixelIO : protected DWTIO { + protected: + int end; ///< index of bottom neightbor of last pixel of column + int stride; ///< increment of pointer to get to next pixel + + /// Initializes pixel IO - sets end index and a position of first pixel. + /// @param sizeX width of the image + /// @param sizeY height of the image + /// @param firstX x-coordinate of first pixel to use + /// @param firstY y-coordinate of first pixel to use + /// @return index of pixel at position [x, y] in the image + __device__ int initialize(const int sizeX, const int sizeY, + int firstX, int firstY) { + // initialize all pointers and stride + end = CHECKED ? (sizeY * sizeX + firstX) : 0; + stride = sizeX; + return firstX + sizeX * firstY; + } + }; + + + + /// Writes reverse transformed pixels directly into output image. + /// @tparam T type of output pixels + /// @tparam CHECKED true = be prepared to image boundary, false = don't care + template + class VerticalDWTPixelWriter : VerticalDWTPixelIO { + private: + int next; // index of the next pixel to be loaded + + public: + /// Initializes writer - sets output buffer and a position of first pixel. + /// @param sizeX width of the image + /// @param sizeY height of the image + /// @param firstX x-coordinate of first pixel to write into + /// @param firstY y-coordinate of first pixel to write into + __device__ void init(const int sizeX, const int sizeY, + int firstX, int firstY) { + if(firstX < sizeX) { + next = this->initialize(sizeX, sizeY, firstX, firstY); + } else { + this->end = 0; + this->stride = 0; + next = 0; + } + } + + /// Writes given value at next position and advances internal pointer while + /// correctly handling mirroring. + /// @param output output image to write pixel into + /// @param value value of the pixel to be written + __device__ void writeInto(T * const output, const T & value) { + if((!CHECKED) || (next != this->end)) { + output[next] = value; + next += this->stride; + } + } + }; + + + + /// Loads pixels from input image. + /// @tparam T type of image input pixels + /// @tparam CHECKED true = be prepared to image boundary, false = don't care + template + class VerticalDWTPixelLoader + : protected VerticalDWTPixelIO { + private: + int last; ///< index of last loaded pixel + public: + + + //******************* FOR TEST ********************** + __device__ int getlast(){ + return last; + } + __device__ int getend(){ + return this->end; + } + __device__ int getstride(){ + return this->stride; + } + __device__ void setend(int a){ + this->end=a; + } + //******************* FOR TEST ********************** + + + + /// Initializes loader - sets input size and a position of first pixel. + /// @param sizeX width of the image + /// @param sizeY height of the image + /// @param firstX x-coordinate of first pixel to load + /// @param firstY y-coordinate of first pixel to load + __device__ void init(const int sizeX, const int sizeY, + int firstX, int firstY) { + // correctly mirror x coordinate + this->mirror(firstX, sizeX); + + // 'last' always points to already loaded pixel (subtract sizeX = stride) + last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX; + //last = (FirstX + sizeX * FirstY) - sizeX + } + + /// Sets all fields to zeros, for compiler not to complain about + /// uninitialized stuff. + __device__ void clear() { + this->end = 0; + this->stride = 0; + this->last = 0; + } + + /// Gets another pixel and advancees internal pointer to following one. + /// @param input input image to load next pixel from + /// @return next pixel from given image + __device__ T loadFrom(const T * const input) { + last += this->stride; + if(CHECKED && (last == this->end)) { + last -= 2 * this->stride; + this->stride = -this->stride; // reverse loader's direction + } + // avoid reading from negative indices if loader is checked + // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later + if(last < 0 ) { + return 0; + } + + return input[last]; + // return this->end; + // return last; + // return this->stride; + } + }; + + + + /// Base for band write and loader. Manages computing strides and pointers + /// to first and last pixels in a linearly-stored-bands correct way. + /// @tparam T type of band coefficients + /// @tparam CHECKED true = be prepared to image boundary, false = don't care + template + class VerticalDWTBandIO : protected DWTIO { + protected: + /// index of bottom neighbor of last pixel of loaded column + int end; + + /// increment of index to get from highpass band to the lowpass one + int strideHighToLow; + + /// increment of index to get from the lowpass band to the highpass one + int strideLowToHigh; + + /// Initializes IO - sets size of image and a position of first pixel. + /// @param imageSizeX width of the image + /// @param imageSizeY height of the image + /// @param firstX x-coordinate of first pixel to use + /// (Parity determines vertically low or high band.) + /// @param firstY y-coordinate of first pixel to use + /// (Parity determines horizontally low or high band.) + /// @return index of first item specified by firstX and firstY + __device__ int initialize(const int imageSizeX, const int imageSizeY, + int firstX, int firstY) { + // index of first pixel (topmost one) of the column with index firstX + int columnOffset = firstX / 2; + + // difference between indices of two vertically neighboring pixels + // in the same band + int verticalStride; + + // resolve index of first pixel according to horizontal parity + if(firstX & 1) { + // first pixel in one of right bands + verticalStride = imageSizeX / 2; + columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2); + strideLowToHigh = (imageSizeX * imageSizeY) / 2; + } else { + // first pixel in one of left bands + verticalStride = imageSizeX / 2 + (imageSizeX & 1); + strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX; + } + + // set the other stride + strideHighToLow = verticalStride - strideLowToHigh; + + // compute index of coefficient which indicates end of image + if(CHECKED) { + end = columnOffset // right column + + (imageSizeY / 2) * verticalStride // right row + + (imageSizeY & 1) * strideLowToHigh; // possibly in high band + } else { + end = 0; + } + + + //***********for test************** + // end = CHECKED; + //***********for test************** + + + // finally, return index of the first item + return columnOffset // right column + + (firstY / 2) * verticalStride // right row + + (firstY & 1) * strideLowToHigh; // possibly in high band + } + }; + + + + + /// Directly loads coefficients from four consecutively stored transformed + /// bands. + /// @tparam T type of input band coefficients + /// @tparam CHECKED true = be prepared to image boundary, false = don't care + template + class VerticalDWTBandLoader : public VerticalDWTBandIO { + private: + int last; ///< index of last loaded pixel + + /// Checks internal index and possibly reverses direction of loader. + /// (Handles mirroring at the bottom of the image.) + /// @param input input image to load next coefficient from + /// @param stride stride to use now (one of two loader's strides) + /// @return loaded coefficient + __device__ T updateAndLoad(const T * const input, const int & stride) { + last += stride; + if(CHECKED && (last == this->end)) { + // undo last two updates of index (to get to previous mirrored item) + last -= (this->strideLowToHigh + this->strideHighToLow); + + // swap and reverse strides (to move up in the loaded column now) + const int temp = this->strideLowToHigh; + this->strideLowToHigh = -this->strideHighToLow; + this->strideHighToLow = -temp; + } + if(last < 0 ) { + return 0; + } + // avoid reading from negative indices if loader is checked + // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later + return input[last]; + } + public: + + /// Initializes loader - sets input size and a position of first pixel. + /// @param imageSizeX width of the image + /// @param imageSizeY height of the image + /// @param firstX x-coordinate of first pixel to load + /// (Parity determines vertically low or high band.) + /// @param firstY y-coordinate of first pixel to load + /// (Parity determines horizontally low or high band.) + __device__ void init(const int imageSizeX, const int imageSizeY, + int firstX, const int firstY) { + this->mirror(firstX, imageSizeX); + last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); + + // adjust to point to previous item + last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow; + } + + /// Sets all fields to zeros, for compiler not to complain about + /// uninitialized stuff. + __device__ void clear() { + this->end = 0; + this->strideHighToLow = 0; + this->strideLowToHigh = 0; + this->last = 0; + } + + /// Gets another coefficient from lowpass band and advances internal index. + /// Call this method first if position of first pixel passed to init + /// was in high band. + /// @param input input image to load next coefficient from + /// @return next coefficient from the lowpass band of the given image + __device__ T loadLowFrom(const T * const input) { + return updateAndLoad(input, this->strideHighToLow); + } + + /// Gets another coefficient from the highpass band and advances index. + /// Call this method first if position of first pixel passed to init + /// was in high band. + /// @param input input image to load next coefficient from + /// @return next coefficient from the highbass band of the given image + __device__ T loadHighFrom(const T * const input) { + return updateAndLoad(input, this->strideLowToHigh); + } + + }; + + + + + /// Directly saves coefficients into four transformed bands. + /// @tparam T type of output band coefficients + /// @tparam CHECKED true = be prepared to image boundary, false = don't care + template + class VerticalDWTBandWriter : public VerticalDWTBandIO { + private: + int next; ///< index of last loaded pixel + + /// Checks internal index and possibly stops the writer. + /// (Handles mirroring at edges of the image.) + /// @param output output buffer + /// @param item item to put into the output + /// @param stride increment of the pointer to get to next output index + __device__ int saveAndUpdate(T * const output, const T & item, + const int & stride) { +// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ //test, Mar 20 + if((!CHECKED) || (next != this->end)) { + // if(next == 4) { + // printf(" next: %d stride: %d val: %f \n", next, stride, item ); + // } + output[next] = item; + next += stride; + } +// } + // if((!CHECKED) || (next != this->end)) { //the real one + // output[next] = item; + // next += stride; //stride has been test + // } + return next; + } + public: + + /// Initializes writer - sets output size and a position of first pixel. + /// @param output output image + /// @param imageSizeX width of the image + /// @param imageSizeY height of the image + /// @param firstX x-coordinate of first pixel to write + /// (Parity determines vertically low or high band.) + /// @param firstY y-coordinate of first pixel to write + /// (Parity determines horizontally low or high band.) + __device__ void init(const int imageSizeX, const int imageSizeY, + const int firstX, const int firstY) { + if (firstX < imageSizeX) { + next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); + } else { + clear(); + } + } + + /// Sets all fields to zeros, for compiler not to complain about + /// uninitialized stuff. + __device__ void clear() { + this->end = 0; + this->strideHighToLow = 0; + this->strideLowToHigh = 0; + this->next = 0; + } + + /// Writes another coefficient into the band which was specified using + /// init's firstX and firstY parameters and advances internal pointer. + /// Call this method first if position of first pixel passed to init + /// was in lowpass band. + /// @param output output image + /// @param low lowpass coefficient to save into the lowpass band + __device__ int writeLowInto(T * const output, const T & primary) { + return saveAndUpdate(output, primary, this->strideLowToHigh); + } + + /// Writes another coefficient from the other band and advances pointer. + /// Call this method first if position of first pixel passed to init + /// was in highpass band. + /// @param output output image + /// @param high highpass coefficient to save into the highpass band + __device__ int writeHighInto(T * const output, const T & other) { + return saveAndUpdate(output, other, this->strideHighToLow); + } + + //*******Add three functions to get private values******* + __device__ int getnext(){ + return next; + } + + __device__ int getend(){ + return this->end; + } + + __device__ int getstrideHighToLow(){ + return this->strideHighToLow; + } + + __device__ int getstrideLowToHigh(){ + return this->strideLowToHigh; + } + + //*******Add three functions to get private values******* + }; + + + +} // namespace dwt_cuda + + +#endif // IO_H + diff --git a/examples/dwt2d/dwt_cuda/rdwt53.cu b/examples/dwt2d/dwt_cuda/rdwt53.cu new file mode 100755 index 0000000..c5ecb2b --- /dev/null +++ b/examples/dwt2d/dwt_cuda/rdwt53.cu @@ -0,0 +1,360 @@ +/// +/// @file rdwt53.cu +/// @brief CUDA implementation of reverse 5/3 2D DWT. +/// @author Martin Jirman (207962@mail.muni.cz) +/// @date 2011-02-04 14:19 +/// +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + + +#include "common.h" +#include "transform_buffer.h" +#include "io.h" + + +namespace dwt_cuda { + + + + /// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT + /// using sliding window and lifting schema. + /// @tparam WIN_SIZE_X width of sliding window + /// @tparam WIN_SIZE_Y height of sliding window + template + class RDWT53 { + private: + + /// Shared memory buffer used for 5/3 DWT transforms. + typedef TransformBuffer RDWT53Buffer; + + /// Shared buffer used for reverse 5/3 DWT. + RDWT53Buffer buffer; + + /// Difference between indices of two vertically neighboring items in buffer. + enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE }; + + + /// Info needed for loading of one input column from input image. + /// @tparam CHECKED true if loader should check boundaries + template + struct RDWT53Column { + /// loader of pixels from column in input image + VerticalDWTBandLoader loader; + + /// Offset of corresponding column in shared buffer. + int offset; + + /// Sets all fields to some values to avoid 'uninitialized' warnings. + __device__ void clear() { + offset = 0; + loader.clear(); + } + }; + + + /// 5/3 DWT reverse update operation. + struct Reverse53Update { + __device__ void operator() (const int p, int & c, const int n) const { + c -= (p + n + 2) / 4; // F.3, page 118, ITU-T Rec. T.800 final draft + } + }; + + + /// 5/3 DWT reverse predict operation. + struct Reverse53Predict { + __device__ void operator() (const int p, int & c, const int n) const { + c += (p + n) / 2; // F.4, page 118, ITU-T Rec. T.800 final draft + } + }; + + + /// Horizontal 5/3 RDWT on specified lines of transform buffer. + /// @param lines number of lines to be transformed + /// @param firstLine index of the first line to be transformed + __device__ void horizontalTransform(const int lines, const int firstLine) { + __syncthreads(); + buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update()); + __syncthreads(); + buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict()); + __syncthreads(); + } + + + /// Using given loader, it loads another WIN_SIZE_Y coefficients + /// into specified column. + /// @tparam CHECKED true if loader should check image boundaries + /// @param input input coefficients to load from + /// @param col info about loaded column + template + inline __device__ void loadWindowIntoColumn(const int * const input, + RDWT53Column & col) { + for(int i = 3; i < (3 + WIN_SIZE_Y); i += 2) { + buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input); + buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input); + } + } + + + /// Initializes one column of shared transform buffer with 7 input pixels. + /// Those 7 pixels will not be transformed. Also initializes given loader. + /// @tparam CHECKED true if loader should check image boundaries + /// @param columnX x coordinate of column in shared transform buffer + /// @param input input image + /// @param sizeX width of the input image + /// @param sizeY height of the input image + /// @param loader (uninitialized) info about loaded column + template + __device__ void initColumn(const int columnX, const int * const input, + const int sizeX, const int sizeY, + RDWT53Column & column, + const int firstY) { + // coordinates of the first coefficient to be loaded + const int firstX = blockIdx.x * WIN_SIZE_X + columnX; + + // offset of the column with index 'colIndex' in the transform buffer + column.offset = buffer.getColumnOffset(columnX); + + if(blockIdx.y == 0) { + // topmost block - apply mirroring rules when loading first 3 rows + column.loader.init(sizeX, sizeY, firstX, firstY); + + // load pixels in mirrored way + buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input); + buffer[column.offset + 0 * STRIDE] = + buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input); + } else { + // non-topmost row - regular loading: + column.loader.init(sizeX, sizeY, firstX, firstY - 1); + buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input); + buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input); + buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input); + } + // Now, the next coefficient, which will be loaded by loader, is #2. + } + + + /// Actual GPU 5/3 RDWT implementation. + /// @tparam CHECKED_LOADS true if boundaries must be checked when reading + /// @tparam CHECKED_WRITES true if boundaries must be checked when writing + /// @param in input image (5/3 transformed coefficients) + /// @param out output buffer (for reverse transformed image) + /// @param sizeX width of the output image + /// @param sizeY height of the output image + /// @param winSteps number of sliding window steps + template + __device__ void transform(const int * const in, int * const out, + const int sizeX, const int sizeY, + const int winSteps) { + // info about one main and one boundary column + RDWT53Column column, boundaryColumn; + + // index of first row to be transformed + const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps; + + // some threads initialize boundary columns + boundaryColumn.clear(); + if(threadIdx.x < 3) { + // First 3 threads also handle boundary columns. Thread #0 gets right + // column #0, thread #1 get right column #1 and thread #2 left column. + const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3); + + // Thread initializes offset of the boundary column (in shared + // buffer), first 3 pixels of the column and a loader for this column. + initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY); + } + + // All threads initialize central columns. + initColumn(parityIdx(), in, sizeX, sizeY, column, firstY); + + // horizontally transform first 3 rows + horizontalTransform(3, 0); + + // writer of output pixels - initialize it + const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x; + VerticalDWTPixelWriter writer; + writer.init(sizeX, sizeY, outX, firstY); + + // offset of column (in transform buffer) saved by this thread + const int outputColumnOffset = buffer.getColumnOffset(threadIdx.x); + + // (Each iteration assumes that first 3 rows of transform buffer are + // already loaded with horizontally transformed pixels.) + for(int w = 0; w < winSteps; w++) { + // Load another WIN_SIZE_Y lines of this thread's column + // into the transform buffer. + loadWindowIntoColumn(in, column); + + // possibly load boundary columns + if(threadIdx.x < 3) { + loadWindowIntoColumn(in, boundaryColumn); + } + + // horizontally transform all newly loaded lines + horizontalTransform(WIN_SIZE_Y, 3); + + // Using 3 registers, remember current values of last 3 rows + // of transform buffer. These rows are transformed horizontally + // only and will be used in next iteration. + int last3Lines[3]; + last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE]; + last3Lines[1] = buffer[outputColumnOffset + (WIN_SIZE_Y + 1) * STRIDE]; + last3Lines[2] = buffer[outputColumnOffset + (WIN_SIZE_Y + 2) * STRIDE]; + + // vertically transform all central columns + buffer.forEachVerticalOdd(outputColumnOffset, Reverse53Update()); + buffer.forEachVerticalEven(outputColumnOffset, Reverse53Predict()); + + // Save all results of current window. Results are in transform buffer + // at rows from #1 to #(1 + WIN_SIZE_Y). Other rows are invalid now. + // (They only served as a boundary for vertical RDWT.) + for(int i = 1; i < (1 + WIN_SIZE_Y); i++) { + writer.writeInto(out, buffer[outputColumnOffset + i * STRIDE]); + } + + // Use last 3 remembered lines as first 3 lines for next iteration. + // As expected, these lines are already horizontally transformed. + buffer[outputColumnOffset + 0 * STRIDE] = last3Lines[0]; + buffer[outputColumnOffset + 1 * STRIDE] = last3Lines[1]; + buffer[outputColumnOffset + 2 * STRIDE] = last3Lines[2]; + + // Wait for all writing threads before proceeding to loading new + // coeficients in next iteration. (Not to overwrite those which + // are not written yet.) + __syncthreads(); + } + } + + + public: + /// Main GPU 5/3 RDWT entry point. + /// @param in input image (5/3 transformed coefficients) + /// @param out output buffer (for reverse transformed image) + /// @param sizeX width of the output image + /// @param sizeY height of the output image + /// @param winSteps number of sliding window steps + __device__ static void run(const int * const input, int * const output, + const int sx, const int sy, const int steps) { + // prepare instance with buffer in shared memory + __shared__ RDWT53 rdwt53; + + // Compute limits of this threadblock's block of pixels and use them to + // determine, whether this threadblock will have to deal with boundary. + // (1 in next expressions is for radius of impulse response of 5/3 RDWT.) + const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1; + const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1; + const bool atRightBoudary = maxX >= sx; + const bool atBottomBoudary = maxY >= sy; + + // Select specialized version of code according to distance of this + // threadblock's pixels from image boundary. + if(atBottomBoudary) { + // near bottom boundary => check both writing and reading + rdwt53.transform(input, output, sx, sy, steps); + } else if(atRightBoudary) { + // near right boundary only => check writing only + rdwt53.transform(input, output, sx, sy, steps); + } else { + // no nearby boundary => check nothing + rdwt53.transform(input, output, sx, sy, steps); + } + } + + }; // end of class RDWT53 + + + + /// Main GPU 5/3 RDWT entry point. + /// @param in input image (5/3 transformed coefficients) + /// @param out output buffer (for reverse transformed image) + /// @param sizeX width of the output image + /// @param sizeY height of the output image + /// @param winSteps number of sliding window steps + template + __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53), 8)) + __global__ void rdwt53Kernel(const int * const in, int * const out, + const int sx, const int sy, const int steps) { + RDWT53::run(in, out, sx, sy, steps); + } + + + + /// Only computes optimal number of sliding window steps, + /// number of threadblocks and then lanches the 5/3 RDWT kernel. + /// @tparam WIN_SX width of sliding window + /// @tparam WIN_SY height of sliding window + /// @param in input image + /// @param out output buffer + /// @param sx width of the input image + /// @param sy height of the input image + template + void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) { + // compute optimal number of steps of each sliding window + const int steps = divRndUp(sy, 15 * WIN_SY); + + // prepare grid size + dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); + + // finally transform this level + PERF_BEGIN + rdwt53Kernel<<>>(in, out, sx, sy, steps); + PERF_END(" RDWT53", sx, sy) + CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel"); + } + + + + /// Reverse 5/3 2D DWT. See common rules (above) for more details. + /// @param in Input DWT coefficients. Format described in common rules. + /// Will not be preserved (will be overwritten). + /// @param out output buffer on GPU - will contain original image + /// in normalized range [-128, 127]. + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) { + if(levels > 1) { + // let this function recursively reverse transform deeper levels first + const int llSizeX = divRndUp(sizeX, 2); + const int llSizeY = divRndUp(sizeY, 2); + rdwt53(in, out, llSizeX, llSizeY, levels - 1); + + // copy reverse transformed LL band from output back into the input + memCopy(in, out, llSizeX, llSizeY); + } + + // select right width of kernel for the size of the image + if(sizeX >= 960) { + launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY); + } else if (sizeX >= 480) { + launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY); + } else { + launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY); + } + } + + +} // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/rdwt97.cu b/examples/dwt2d/dwt_cuda/rdwt97.cu new file mode 100755 index 0000000..151d69d --- /dev/null +++ b/examples/dwt2d/dwt_cuda/rdwt97.cu @@ -0,0 +1,363 @@ +/// +/// @file rdwt97.cu +/// @brief CUDA implementation of reverse 9/7 2D DWT. +/// @author Martin Jirman (207962@mail.muni.cz) +/// @date 2011-02-03 21:59 +/// +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + + +#include "common.h" +#include "transform_buffer.h" +#include "io.h" + + +namespace dwt_cuda { + + + /// Wraps shared memory buffer and methods for computing 9/7 RDWT using + /// lifting schema and sliding window. + /// @tparam WIN_SIZE_X width of the sliding window + /// @tparam WIN_SIZE_Y height of the sliding window + template + class RDWT97 { + private: + + /// Info related to loading of one input column. + /// @tparam CHECKED true if boundary chould be checked, + /// false if there is no near boudnary + template + struct RDWT97Column { + /// laoder of input pxels for given column. + VerticalDWTBandLoader loader; + + /// Offset of loaded column in shared memory buffer. + int offset; + + /// Sets all fields to some values to avoid 'uninitialized' warnings. + __device__ void clear() { + loader.clear(); + offset = 0; + } + }; + + + /// Shared memory buffer used for 9/7 DWT transforms. + typedef TransformBuffer RDWT97Buffer; + + /// Shared buffer used for reverse 9/7 DWT. + RDWT97Buffer buffer; + + /// Difference between indices of two vertical neighbors in buffer. + enum { STRIDE = RDWT97Buffer::VERTICAL_STRIDE }; + + + /// Horizontal 9/7 RDWT on specified lines of transform buffer. + /// @param lines number of lines to be transformed + /// @param firstLine index of the first line to be transformed + __device__ void horizontalRDWT97(int lines, int firstLine) { + __syncthreads(); + buffer.scaleHorizontal(scale97Mul, scale97Div, firstLine, lines); + __syncthreads(); + buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update2)); + __syncthreads(); + buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97predict2)); + __syncthreads(); + buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update1)); + __syncthreads(); + buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97Predict1)); + __syncthreads(); + } + + + /// Initializes one column of shared transform buffer with 7 input pixels. + /// Those 7 pixels will not be transformed. Also initializes given loader. + /// @tparam CHECKED true if there are near image boundaries + /// @param colIndex index of column in shared transform buffer + /// @param input input image + /// @param sizeX width of the input image + /// @param sizeY height of the input image + /// @param column (uninitialized) info about loading one column + /// @param firstY index of first image row to be transformed + template + __device__ void initColumn(const int colIndex, const float * const input, + const int sizeX, const int sizeY, + RDWT97Column & column, + const int firstY) { + // coordinates of the first coefficient to be loaded + const int firstX = blockIdx.x * WIN_SIZE_X + colIndex; + + // offset of the column with index 'colIndex' in the transform buffer + column.offset = buffer.getColumnOffset(colIndex); + + if(blockIdx.y == 0) { + // topmost block - apply mirroring rules when loading first 7 rows + column.loader.init(sizeX, sizeY, firstX, firstY); + + // load pixels in mirrored way + buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input); + buffer[column.offset + 4 * STRIDE] = + buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input); + buffer[column.offset + 5 * STRIDE] = + buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input); + buffer[column.offset + 6 * STRIDE] = + buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input); + } else { + // non-topmost row - regular loading: + column.loader.init(sizeX, sizeY, firstX, firstY - 3); + buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input); + buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input); + buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input); + buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input); + buffer[column.offset + 4 * STRIDE] = column.loader.loadHighFrom(input); + buffer[column.offset + 5 * STRIDE] = column.loader.loadLowFrom(input); + buffer[column.offset + 6 * STRIDE] = column.loader.loadHighFrom(input); + } + // Now, the next coefficient, which will be loaded by loader, is #4. + } + + + /// Using given loader, it loads another WIN_SIZE_Y coefficients + /// into specified column. + /// @tparam CHECKED true if there are near image boundaries + /// @param col info about loaded column + /// @param input buffer with input coefficients + template + inline __device__ void loadWindowIntoColumn(RDWT97Column & col, + const float * const input) { + for(int i = 7; i < (7 + WIN_SIZE_Y); i += 2) { + buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input); + buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input); + } + } + + + /// Actual GPU 9/7 RDWT sliding window lifting schema implementation. + /// @tparam CHECKED_LOADS true if loader should check boundaries + /// @tparam CHECKED_WRITES true if boundaries should be taken into account + /// when writing into output buffer + /// @param in input image (9/7 transformed coefficients) + /// @param out output buffer (for reverse transformed image) + /// @param sizeX width of the output image + /// @param sizeY height of the output image + /// @param winSteps number of steps of sliding window + template + __device__ void transform(const float * const in, float * const out, + const int sizeX, const int sizeY, + const int winSteps) { + // info about one main column and one boundary column + RDWT97Column column; + RDWT97Column boundaryColumn; + + // index of first image row to be transformed + const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps; + + // initialize boundary columns + boundaryColumn.clear(); + if(threadIdx.x < 7) { + // each thread among first 7 ones gets index of one of boundary columns + const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7); + + // Thread initializes offset of the boundary column (in shared + // buffer), first 7 pixels of the column and a loader for this column. + initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY); + } + + // All threads initialize central columns. + initColumn(parityIdx(), in, sizeX, sizeY, column, firstY); + + // horizontally transform first 7 rows + horizontalRDWT97(7, 0); + + // writer of output pixels - initialize it + const int outputX = blockIdx.x * WIN_SIZE_X + threadIdx.x; + VerticalDWTPixelWriter writer; + writer.init(sizeX, sizeY, outputX, firstY); + + // offset of column (in transform buffer) saved by this thread + const int outColumnOffset = buffer.getColumnOffset(threadIdx.x); + + // (Each iteration assumes that first 7 rows of transform buffer are + // already loaded with horizontally transformed pixels.) + for(int w = 0; w < winSteps; w++) { + // Load another WIN_SIZE_Y lines of this thread's column + // into the transform buffer. + loadWindowIntoColumn(column, in); + + // possibly load boundary columns + if(threadIdx.x < 7) { + loadWindowIntoColumn(boundaryColumn, in); + } + + // horizontally transform all newly loaded lines + horizontalRDWT97(WIN_SIZE_Y, 7); + + // Using 7 registers, remember current values of last 7 rows + // of transform buffer. These rows are transformed horizontally + // only and will be used in next iteration. + float last7Lines[7]; + for(int i = 0; i < 7; i++) { + last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE]; + } + + // vertically transform all central columns + buffer.scaleVertical(scale97Div, scale97Mul, outColumnOffset, + WIN_SIZE_Y + 7, 0); + buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update2)); + buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97predict2)); + buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update1)); + buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97Predict1)); + + // Save all results of current window. Results are in transform buffer + // at rows from #3 to #(3 + WIN_SIZE_Y). Other rows are invalid now. + // (They only served as a boundary for vertical RDWT.) + for(int i = 3; i < (3 + WIN_SIZE_Y); i++) { + writer.writeInto(out, buffer[outColumnOffset + i * STRIDE]); + } + + // Use last 7 remembered lines as first 7 lines for next iteration. + // As expected, these lines are already horizontally transformed. + for(int i = 0; i < 7; i++) { + buffer[outColumnOffset + i * STRIDE] = last7Lines[i]; + } + + // Wait for all writing threads before proceeding to loading new + // coeficients in next iteration. (Not to overwrite those which + // are not written yet.) + __syncthreads(); + } + } + + + public: + /// Main GPU 9/7 RDWT entry point. + /// @param in input image (9/7 transformed coefficients) + /// @param out output buffer (for reverse transformed image) + /// @param sizeX width of the output image + /// @param sizeY height of the output image + __device__ static void run(const float * const input, float * const output, + const int sx, const int sy, const int steps) { + // prepare instance with buffer in shared memory + __shared__ RDWT97 rdwt97; + + // Compute limits of this threadblock's block of pixels and use them to + // determine, whether this threadblock will have to deal with boundary. + // (3 in next expressions is for radius of impulse response of 9/7 RDWT.) + const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3; + const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3; + const bool atRightBoudary = maxX >= sx; + const bool atBottomBoudary = maxY >= sy; + + // Select specialized version of code according to distance of this + // threadblock's pixels from image boundary. + if(atBottomBoudary) { + // near bottom boundary => check both writing and reading + rdwt97.transform(input, output, sx, sy, steps); + } else if(atRightBoudary) { + // near right boundary only => check writing only + rdwt97.transform(input, output, sx, sy, steps); + } else { + // no nearby boundary => check nothing + rdwt97.transform(input, output, sx, sy, steps); + } + } + + }; // end of class RDWT97 + + + + /// Main GPU 9/7 RDWT entry point. + /// @param in input image (9/7 transformed coefficients) + /// @param out output buffer (for reverse transformed image) + /// @param sizeX width of the output image + /// @param sizeY height of the output image + template + __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97), 8)) + __global__ void rdwt97Kernel(const float * const in, float * const out, + const int sx, const int sy, const int steps) { + RDWT97::run(in, out, sx, sy, steps); + } + + + + /// Only computes optimal number of sliding window steps, + /// number of threadblocks and then lanches the 9/7 RDWT kernel. + /// @tparam WIN_SX width of sliding window + /// @tparam WIN_SY height of sliding window + /// @param in input image + /// @param out output buffer + /// @param sx width of the input image + /// @param sy height of the input image + template + void launchRDWT97Kernel (float * in, float * out, int sx, int sy) { + // compute optimal number of steps of each sliding window + const int steps = divRndUp(sy, 15 * WIN_SY); + + // prepare grid size + dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); + + // finally launch kernel + PERF_BEGIN + rdwt97Kernel<<>>(in, out, sx, sy, steps); + PERF_END(" RDWT97", sx, sy) + CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel"); + } + + + + /// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details. + /// @param in Input DWT coefficients. Format described in common rules. + /// Will not be preserved (will be overwritten). + /// @param out output buffer on GPU - will contain original image + /// in normalized range [-0.5, 0.5]. + /// @param sizeX width of input image (in pixels) + /// @param sizeY height of input image (in pixels) + /// @param levels number of recursive DWT levels + void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels) { + if(levels > 1) { + // let this function recursively reverse transform deeper levels first + const int llSizeX = divRndUp(sizeX, 2); + const int llSizeY = divRndUp(sizeY, 2); + rdwt97(in, out, llSizeX, llSizeY, levels - 1); + + // copy reverse transformed LL band from output back into the input + memCopy(in, out, llSizeX, llSizeY); + } + + // select right width of kernel for the size of the image + if(sizeX >= 960) { + launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY); + } else if (sizeX >= 480) { + launchRDWT97Kernel<128, 6>(in, out, sizeX, sizeY); + } else { + launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY); + } + } + + + +} // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/transform_buffer.h b/examples/dwt2d/dwt_cuda/transform_buffer.h new file mode 100755 index 0000000..69b74e2 --- /dev/null +++ b/examples/dwt2d/dwt_cuda/transform_buffer.h @@ -0,0 +1,373 @@ +/// line 248 the index +/// @file transform_buffer.h +/// @brief Buffer with separated even and odd columns and related algorithms. +/// @author Martin Jirman (207962@mail.muni.cz) +/// @date 2011-01-20 18:33 +/// +/// +/// Copyright (c) 2011 Martin Jirman +/// All rights reserved. +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above copyright +/// notice, this list of conditions and the following disclaimer in the +/// documentation and/or other materials provided with the distribution. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. +/// + + +#ifndef TRANSFORM_BUFFER_H +#define TRANSFORM_BUFFER_H + + +namespace dwt_cuda { + + + /// Buffer (in shared memory of GPU) where block of input image is stored, + /// but odd and even lines are separated. (Generates less bank conflicts when + /// using lifting schema.) All operations expect SIZE_X threads. + /// Also implements basic building blocks of lifting schema. + /// @tparam SIZE_X width of the buffer excluding two boundaries (Also + /// a number of threads participating on all operations.) + /// Must be divisible by 4. + /// @tparam SIZE_Y height of buffer (total number of lines) + /// @tparam BOUNDARY_X number of extra pixels at the left and right side + /// boundary is expected to be smaller than half SIZE_X + /// Must be divisible by 2. + template + class TransformBuffer { + public: + enum { + /// difference between pointers to two vertical neigbors + VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2) + }; + + private: + enum { + /// number of shared memory banks - needed for correct padding + #ifdef __CUDA_ARCH__ + SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16), + #else + SHM_BANKS = 16, // for host code only - can be anything, won't be used + #endif + + /// size of one of two buffers (odd or even) + BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y, + + /// unused space between two buffers + PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS), + + /// offset of the odd columns buffer from the beginning of data buffer + ODD_OFFSET = BUFFER_SIZE + PADDING, + }; + + /// buffer for both even and odd columns + T data[2 * BUFFER_SIZE + PADDING]; + + + + /// Applies specified function to all central elements while also passing + /// previous and next elements as parameters. + /// @param count count of central elements to apply function to + /// @param prevOffset offset of first central element + /// @param midOffset offset of first central element's predecessor + /// @param nextOffset offset of first central element's successor + /// @param function the function itself + template + __device__ void horizontalStep(const int count, const int prevOffset, + const int midOffset, const int nextOffset, + const FUNC & function) { + // number of unchecked iterations + const int STEPS = count / SIZE_X; + + // items remaining after last unchecked iteration + const int finalCount = count % SIZE_X; + + // offset of items processed in last (checked) iteration + const int finalOffset = count - finalCount; + + // all threads perform fixed number of iterations ... + for(int i = 0; i < STEPS; i++) { + // for(int i = 0; i < 3; i++) { + const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; + const T next = data[nextOffset + i * SIZE_X + threadIdx.x]; + T & center = data[midOffset + i * SIZE_X + threadIdx.x]; + // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x)); + function(previous, center, next);// the real one + } + + // ... but not all threads participate on final iteration + if(threadIdx.x < finalCount) { + const T previous = data[prevOffset + finalOffset + threadIdx.x]; + const T next = data[nextOffset + finalOffset + threadIdx.x]; + T & center = data[midOffset + finalOffset + threadIdx.x]; + // function(previous, center, (nextOffset+finalOffset+threadIdx.x)); + // kaixi + function(previous, center, next);//the real one + } + } + + public: + + __device__ void getPrintData() { + // + for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) { + printf(" index: %d data: %f \n ", i ,data[i]); + } + + } + + + /// Gets offset of the column with given index. Central columns have + /// indices from 0 to NUM_LINES - 1, left boundary columns have negative + /// indices and right boundary columns indices start with NUM_LINES. + /// @param columnIndex index of column to get pointer to + /// @return offset of the first item of column with specified index + __device__ int getColumnOffset(int columnIndex) { + columnIndex += BOUNDARY_X; // skip boundary + return columnIndex / 2 // select right column + + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer + } + + + /// Provides access to data of the transform buffer. + /// @param index index of the item to work with + /// @return reference to item at given index + __device__ T & operator[] (const int index) { + return data[index]; + } + + + /// Applies specified function to all horizontally even elements in + /// specified lines. (Including even elements in boundaries except + /// first even element in first left boundary.) SIZE_X threads participate + /// and synchronization is needed before result can be used. + /// @param firstLine index of first line + /// @param numLines count of lines + /// @param func function to be applied on all even elements + /// parameters: previous (odd) element, the even + /// element itself and finally next (odd) element + template + __device__ void forEachHorizontalEven(const int firstLine, + const int numLines, + const FUNC & func) { + // number of even elemens to apply function to + const int count = numLines * VERTICAL_STRIDE - 1; + // offset of first even element + const int centerOffset = firstLine * VERTICAL_STRIDE + 1; + // offset of odd predecessor of first even element + const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET; + // offset of odd successor of first even element + const int nextOffset = prevOffset + 1; + + // if(threadIdx.x == 0) { + + // printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); + // } + + // call generic horizontal step function + horizontalStep(count, prevOffset, centerOffset, nextOffset, func); + } + + + /// Applies given function to all horizontally odd elements in specified + /// lines. (Including odd elements in boundaries except last odd element + /// in last right boundary.) SIZE_X threads participate and synchronization + /// is needed before result can be used. + /// @param firstLine index of first line + /// @param numLines count of lines + /// @param func function to be applied on all odd elements + /// parameters: previous (even) element, the odd + /// element itself and finally next (even) element + template + __device__ void forEachHorizontalOdd(const int firstLine, + const int numLines, + const FUNC & func) { + // numbet of odd elements to apply function to + const int count = numLines * VERTICAL_STRIDE - 1; + // offset of even predecessor of first odd element + const int prevOffset = firstLine * VERTICAL_STRIDE; + // offset of first odd element + const int centerOffset = prevOffset + ODD_OFFSET; + // offset of even successor of first odd element + const int nextOffset = prevOffset + 1; + + // if(threadIdx.x == 0) { + // printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); + // } + + + // call generic horizontal step function + horizontalStep(count, prevOffset, centerOffset, nextOffset, func); + } + + + /// Applies specified function to all even elements (except element #0) + /// of given column. Each thread takes care of one column, so there's + /// no need for synchronization. + /// @param columnOffset offset of thread's column + /// @param f function to be applied on all even elements + /// parameters: previous (odd) element, the even + /// element itself and finally next (odd) element + template + __device__ void forEachVerticalEven(const int columnOffset, const F & f) { + if(SIZE_Y > 3) { // makes no sense otherwise + const int steps = SIZE_Y / 2 - 1; + for(int i = 0; i < steps; i++) { + const int row = 2 + i * 2; + const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; + const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; + f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next); + + //--------------- FOR TEST ----------------- +/* __syncthreads(); + if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ + diffOut[2500]++; + diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE]; + } + __syncthreads(); +*/ //--------------- FOR TEST ----------------- + + + } + } + } + + + /// Applies specified function to all odd elements of given column. + /// Each thread takes care of one column, so there's no need for + /// synchronization. + /// @param columnOffset offset of thread's column + /// @param f function to be applied on all odd elements + /// parameters: previous (even) element, the odd + /// element itself and finally next (even) element + template + __device__ void forEachVerticalOdd(const int columnOffset, const F & f) { + const int steps = (SIZE_Y - 1) / 2; + for(int i = 0; i < steps; i++) { + const int row = i * 2 + 1; + const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; + const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; + + f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); + + + //--------------- FOR TEST ----------------- +/* __syncthreads(); + if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ + diffOut[2500]++; + diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE]; + } + + __syncthreads(); +*/ //--------------- FOR TEST ----------------- + } + } + + + + /// Scales elements at specified lines. + /// @param evenScale scaling factor for horizontally even elements + /// @param oddScale scaling factor for horizontally odd elements + /// @param numLines number of lines, whose elements should be scaled + /// @param firstLine index of first line to scale elements in + __device__ void scaleHorizontal(const T evenScale, const T oddScale, + const int firstLine, const int numLines) { + const int offset = firstLine * VERTICAL_STRIDE; + const int count = numLines * VERTICAL_STRIDE; + const int steps = count / SIZE_X; + const int finalCount = count % SIZE_X; + const int finalOffset = count - finalCount; + + // printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset); + + // run iterations, whete all threads participate + for(int i = 0; i < steps; i++) { + data[threadIdx.x + i * SIZE_X + offset] *= evenScale; + // if(threadIdx.x + i * SIZE_X + offset == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; + } + + // some threads also finish remaining unscaled items + if(threadIdx.x < finalCount) { + data[threadIdx.x + finalOffset + offset] *= evenScale; + // if(threadIdx.x + finalOffset + offset == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + // if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; + } + + } + + + /// Scales elements in specified column. + /// @param evenScale scaling factor for vertically even elements + /// @param oddScale scaling factor for vertically odd elements + /// @param columnOffset offset of the column to work with + /// @param numLines number of lines, whose elements should be scaled + /// @param firstLine index of first line to scale elements in + __device__ void scaleVertical(const T evenScale, const T oddScale, + const int columnOffset, const int numLines, + const int firstLine) { + for(int i = firstLine; i < (numLines + firstLine); i++) { + if(i & 1) { + data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; + } else { + data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; + } + } + } + + + //****************For Test(Feb23), test inter parameters************* + __device__ int getVERTICAL_STRIDE(){ + return VERTICAL_STRIDE; + } + __device__ int getSHM_BANKS(){ + return SHM_BANKS; + } + __device__ int getBuffersize(){ + return BUFFER_SIZE; + } + __device__ int getPADDING(){ + return PADDING; + } + __device__ int getODD_OFFSET(){ + return ODD_OFFSET; + } + + + //****************For Test(Feb23), test inter parameters************* + + + }; // end of class TransformBuffer + + +} // namespace dwt_cuda + + +#endif // TRANSFORM_BUFFER_H + diff --git a/examples/dwt2d/main.cu b/examples/dwt2d/main.cu new file mode 100755 index 0000000..899e38c --- /dev/null +++ b/examples/dwt2d/main.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2009, Jiri Matela + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "components.h" +#include "dwt.h" + +struct dwt { + char * srcFilename; + char * outFilename; + unsigned char *srcImg; + int pixWidth; + int pixHeight; + int components; + int dwtLvls; +}; + +int getImg(char * srcFilename, unsigned char *srcImg, int inputSize) +{ + // printf("Loading ipnput: %s\n", srcFilename); + char *path = "../../data/dwt2d/"; + char *newSrc = NULL; + + if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL) + { + newSrc[0] = '\0'; + strcat(newSrc, path); + strcat(newSrc, srcFilename); + srcFilename= newSrc; + } + printf("Loading ipnput: %s\n", srcFilename); + + //srcFilename = strcat("../../data/dwt2d/",srcFilename); + //read image + int i = open(srcFilename, O_RDONLY, 0644); + if (i == -1) { + error(0,errno,"cannot access %s", srcFilename); + return -1; + } + int ret = read(i, srcImg, inputSize); + printf("precteno %d, inputsize %d\n", ret, inputSize); + close(i); + + return 0; +} + + +void usage() { + printf("dwt [otpions] src_img.rgb \n\ + -d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n\ + -c, --components\t\tnumber of color components, default 3\n\ + -b, --depth\t\t\tbit depth, default 8\n\ + -l, --level\t\t\tDWT level, default 3\n\ + -D, --device\t\t\tcuda device\n\ + -f, --forward\t\t\tforward transform\n\ + -r, --reverse\t\t\treverse transform\n\ + -9, --97\t\t\t9/7 transform\n\ + -5, --53\t\t\t5/3 transform\n\ + -w --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n"); +} + +template +void processDWT(struct dwt *d, int forward, int writeVisual) +{ + int componentSize = d->pixWidth*d->pixHeight*sizeof(T); + + T *c_r_out, *backup ; + cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(c_r_out, 0, componentSize); + cudaCheckError("Memset device memory"); + + cudaMalloc((void**)&backup, componentSize); //< aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(backup, 0, componentSize); + cudaCheckError("Memset device memory"); + + if (d->components == 3) { + /* Alloc two more buffers for G and B */ + T *c_g_out, *c_b_out; + cudaMalloc((void**)&c_g_out, componentSize); //< aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(c_g_out, 0, componentSize); + cudaCheckError("Memset device memory"); + + cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(c_b_out, 0, componentSize); + cudaCheckError("Memset device memory"); + + /* Load components */ + T *c_r, *c_g, *c_b; + cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(c_r, 0, componentSize); + cudaCheckError("Memset device memory"); + + cudaMalloc((void**)&c_g, componentSize); //< G, aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(c_g, 0, componentSize); + cudaCheckError("Memset device memory"); + + cudaMalloc((void**)&c_b, componentSize); //< B, aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(c_b, 0, componentSize); + cudaCheckError("Memset device memory"); + + rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight); + + + /* Compute DWT and always store into file */ + nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); + nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); + nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); + + // -------test---------- + // T *h_r_out=(T*)malloc(componentSize); + // cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost); + // int ii; + // for(ii=0;iipixWidth) == 0) fprintf(stderr, "\n"); + // } + // -------test---------- + + + /* Store DWT to file */ + writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r"); + // writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g"); + // writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b"); +#ifdef OUTPUT + if (writeVisual) { + writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r"); + writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g"); + writeNStage2DDWT(c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b"); + } else { + writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r"); + writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g"); + writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b"); + } +#endif + + + cudaFree(c_r); + cudaCheckError("Cuda free"); + cudaFree(c_g); + cudaCheckError("Cuda free"); + cudaFree(c_b); + cudaCheckError("Cuda free"); + cudaFree(c_g_out); + cudaCheckError("Cuda free"); + cudaFree(c_b_out); + cudaCheckError("Cuda free"); + + } + else if (d->components == 1) { + //Load component + T *c_r; + cudaMalloc((void**)&(c_r), componentSize); //< R, aligned component size + cudaCheckError("Alloc device memory"); + cudaMemset(c_r, 0, componentSize); + cudaCheckError("Memset device memory"); + + bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight); + + // Compute DWT + nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); + + // Store DWT to file +// #ifdef OUTPUT + if (writeVisual) { + writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out"); + } else { + writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".lin.out"); + } +// #endif + cudaFree(c_r); + cudaCheckError("Cuda free"); + } + + cudaFree(c_r_out); + cudaCheckError("Cuda free device"); + cudaFree(backup); + cudaCheckError("Cuda free device"); +} + +int main(int argc, char **argv) +{ + int optindex = 0; + char ch; + struct option longopts[] = { + {"dimension", required_argument, 0, 'd'}, //dimensions of src img + {"components", required_argument, 0, 'c'}, //numger of components of src img + {"depth", required_argument, 0, 'b'}, //bit depth of src img + {"level", required_argument, 0, 'l'}, //level of dwt + {"device", required_argument, 0, 'D'}, //cuda device + {"forward", no_argument, 0, 'f'}, //forward transform + {"reverse", no_argument, 0, 'r'}, //reverse transform + {"97", no_argument, 0, '9'}, //9/7 transform + {"53", no_argument, 0, '5' }, //5/3transform + {"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear + {"help", no_argument, 0, 'h'} + }; + + int pixWidth = 0; //= strlen(optarg))) { + usage(); + return -1; + } + pixHeight = atoi(pos+1); + break; + case 'c': + compCount = atoi(optarg); + break; + case 'b': + bitDepth = atoi(optarg); + break; + case 'l': + dwtLvls = atoi(optarg); + break; + case 'D': + device = atoi(optarg); + break; + case 'f': + forward = 1; + break; + case 'r': + forward = 0; + break; + case '9': + dwt97 = 1; + break; + case '5': + dwt97 = 0; + break; + case 'w': + writeVisual = 1; + break; + case 'h': + usage(); + return 0; + case '?': + return -1; + default : + usage(); + return -1; + } + } + argc -= optind; + argv += optind; + + if (argc == 0) { // at least one filename is expected + printf("Please supply src file name\n"); + usage(); + return -1; + } + + if (pixWidth <= 0 || pixHeight <=0) { + printf("Wrong or missing dimensions\n"); + usage(); + return -1; + } + + if (forward == 0) { + writeVisual = 0; //do not write visual when RDWT + } + + // device init + int devCount; + cudaSetDevice(0); + cudaGetDeviceCount(&devCount); + cudaCheckError("Get device count"); + if (devCount == 0) { + printf("No CUDA enabled device\n"); + return -1; + } + if (device < 0 || device > devCount -1) { + printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n", + device, 0, devCount -1); + return -1; + } + cudaDeviceProp devProp; + cudaGetDeviceProperties(&devProp, device); + cudaCheckError("Get device properties"); + // if (devProp.major < 1) { + // printf("Device %d does not support CUDA\n", device); + // return -1; + // } + printf("Using device %d: %s\n", device, devProp.name); + cudaSetDevice(device); + cudaCheckError("Set selected device"); + + struct dwt *d; + d = (struct dwt *)malloc(sizeof(struct dwt)); + d->srcImg = NULL; + d->pixWidth = pixWidth; + d->pixHeight = pixHeight; + d->components = compCount; + d->dwtLvls = dwtLvls; + + // file names + d->srcFilename = (char *)malloc(strlen(argv[0])); + strcpy(d->srcFilename, argv[0]); + if (argc == 1) { // only one filename supplyed + d->outFilename = (char *)malloc(strlen(d->srcFilename)+4); + strcpy(d->outFilename, d->srcFilename); + strcpy(d->outFilename+strlen(d->srcFilename), ".dwt"); + } else { + d->outFilename = strdup(argv[1]); + } + + //Input review + printf("Source file:\t\t%s\n", d->srcFilename); + printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight); + printf(" Components count:\t%d\n", compCount); + printf(" Bit depth:\t\t%d\n", bitDepth); + printf(" DWT levels:\t\t%d\n", dwtLvls); + printf(" Forward transform:\t%d\n", forward); + printf(" 9/7 transform:\t\t%d\n", dwt97); + + //data sizes + int inputSize = pixWidth*pixHeight*compCount; //srcImg, inputSize); + cudaCheckError("Alloc host memory"); + if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) + return -1; + + /* DWT */ + if (forward == 1) { + if(dwt97 == 1 ) + processDWT(d, forward, writeVisual); + else // 5/3 + processDWT(d, forward, writeVisual); + } + else { // reverse + if(dwt97 == 1 ) + processDWT(d, forward, writeVisual); + else // 5/3 + processDWT(d, forward, writeVisual); + } + + //writeComponent(r_cuda, pixWidth, pixHeight, srcFilename, ".g"); + //writeComponent(g_wave_cuda, 512000, ".g"); + //writeComponent(g_cuda, componentSize, ".g"); + //writeComponent(b_wave_cuda, componentSize, ".b"); + cudaFreeHost(d->srcImg); + cudaCheckError("Cuda free host"); + + return 0; +} diff --git a/examples/dwt2d/run.sh b/examples/dwt2d/run.sh new file mode 100755 index 0000000..ce51838 --- /dev/null +++ b/examples/dwt2d/run.sh @@ -0,0 +1,8 @@ +./dwt2d 4.bmp z.dwt -d 4x4 -f -5 -l 3 +# ./dwt2d 8.bmp -d 8x8 -f -5 -l 3 +# ./dwt2d 16.bmp -d 16x16 -f -5 -l 3 +# ./dwt2d 64.bmp -d 64x64 -f -5 -l 3 + +# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3 +# ls +# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3 diff --git a/examples/dwt2d/run_cpu.sh b/examples/dwt2d/run_cpu.sh new file mode 100755 index 0000000..5a23d74 --- /dev/null +++ b/examples/dwt2d/run_cpu.sh @@ -0,0 +1,8 @@ +# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3 +# ls +# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3 +# ./dwt2d 16.bmp -d 16x16 -f -9 -l 3\ +./dwt2d 4.bmp -d 4x4 -r -5 -l 3 +# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3 +# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3 + diff --git a/examples/dwt2d/run_nvcc.sh b/examples/dwt2d/run_nvcc.sh new file mode 100644 index 0000000..f197cc3 --- /dev/null +++ b/examples/dwt2d/run_nvcc.sh @@ -0,0 +1,14 @@ +# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3 +# ls +# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3 +# ./nvcc_dwt2d 4.bmp -d 4x4 -f -9 -l 3 +./nvcc_dwt2d 4.bmp -d 4x4 -f -5 -l 3 +# ./nvcc_dwt2d 8.bmp -d 8x8 -f -9 -l 3 +# ./nvcc_dwt2d 16.bmp -d 16x16 -f -5 -l 3 +# ./nvcc_dwt2d 16.bmp -d 16x16 -r -5 -l 3 +# ./nvcc_dwt2d 16.bmp -d 16x16 -f -9 -l 3 +# ./nvcc_dwt2d 4.bmp -d 4x4 -r -9 -l 3 +# ./nvcc_dwt2d 64.bmp -d 64x64 -f -5 -l 3 +# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3 +# ls +# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3 diff --git a/examples/dwt2d/test_compile_cpu.sh b/examples/dwt2d/test_compile_cpu.sh new file mode 100644 index 0000000..c84f63c --- /dev/null +++ b/examples/dwt2d/test_compile_cpu.sh @@ -0,0 +1,51 @@ + + +#!/bin/bash + +clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -I. -I/include -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v + +export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH + +../../build/compilation/kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_50.bc common.bc +../../build/compilation/kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_50.bc components.bc +../../build/compilation/kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt53.bc + +../../build/compilation/kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_50.bc dwt.bc + +../../build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc +../../build/compilation/hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc +../../build/compilation/hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc +../../build/compilation/hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc +../../build/compilation/hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc + +../../build/compilation/hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc +../../build/compilation/hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc +../../build/compilation/hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc +../../build/compilation/kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt97.bc +../../build/compilation/kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt97.bc +../../build/compilation/kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt53.bc + +llc --relocation-model=pic --filetype=obj common.bc +llc --relocation-model=pic --filetype=obj components.bc +llc --relocation-model=pic --filetype=obj fdwt53.bc + +llc --relocation-model=pic --filetype=obj dwt.bc + + +llc --relocation-model=pic --filetype=obj host.bc + +llc --relocation-model=pic --filetype=obj common_host.bc +llc --relocation-model=pic --filetype=obj components_host.bc +llc --relocation-model=pic --filetype=obj fdwt53_host.bc + +llc --relocation-model=pic --filetype=obj dwt_host.bc + + +llc --relocation-model=pic --filetype=obj fdwt97_host.bc +llc --relocation-model=pic --filetype=obj rdwt97_host.bc +llc --relocation-model=pic --filetype=obj rdwt53_host.bc +llc --relocation-model=pic --filetype=obj fdwt97.bc +llc --relocation-model=pic --filetype=obj rdwt97.bc +llc --relocation-model=pic --filetype=obj rdwt53.bc + +g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o dwt2d -fPIC -no-pie common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lc -lx86Runtime -lthreadPool -lpthread diff --git a/examples/dwt2d/test_compile_nvcc.sh b/examples/dwt2d/test_compile_nvcc.sh new file mode 100755 index 0000000..5cd6644 --- /dev/null +++ b/examples/dwt2d/test_compile_nvcc.sh @@ -0,0 +1,18 @@ +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c main.cu -o main.cu.o +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt.cu -o dwt.cu.o +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c components.cu -o components.cu.o +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt53.cu -o dwt_cuda/fdwt53.cu.o +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt97.cu -o dwt_cuda/fdwt97.cu.o +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/common.cu -o dwt_cuda/common.cu.o +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o +/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o +g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart + + + + + + + + + diff --git a/runtime/lib/cudaRuntimeImpl.cpp b/runtime/lib/cudaRuntimeImpl.cpp index d15dae1..9dd5277 100644 --- a/runtime/lib/cudaRuntimeImpl.cpp +++ b/runtime/lib/cudaRuntimeImpl.cpp @@ -39,6 +39,12 @@ cudaError_t cudaMalloc(void **devPtr, size_t size) { return cudaErrorMemoryAllocation; return cudaSuccess; } +cudaError_t cudaMallocHost(void **devPtr, size_t size) { + *devPtr = malloc(size); + if (devPtr == NULL) + return cudaErrorMemoryAllocation; + return cudaSuccess; +} cudaError_t cudaMemset(void *devPtr, int value, size_t count) { memset(devPtr, value, count); return cudaSuccess; @@ -58,7 +64,7 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, memcpy(dst, src, count); } else if (kind == cudaMemcpyDeviceToDevice) { - memcpy(dst, dst, count); + memcpy(dst, src, count); } else if (kind == cudaMemcpyDefault) { memcpy(dst, src, count); } From 44c172ae99e6dfefb06a162b146bbea2c847c355 Mon Sep 17 00:00:00 2001 From: jchen706 Date: Sun, 22 May 2022 03:47:43 -0400 Subject: [PATCH 2/3] add dwtd workflow --- .github/workflows/build.yml | 38 +++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 368d98d..f9587b1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -155,3 +155,41 @@ jobs: llc --relocation-model=pic --filetype=obj host.bc g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread ./lavaMD -boxes1d 10 + - name: Execute the dwt2d example + run: | + cd ${{ github.workspace }}/SC_evaluate/rodinia-cox/dwt2d + clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=${{ github.workspace }}/cuda-10.1 --cuda-gpu-arch=sm_61 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v || true + export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH + export PATH=${{ github.workspace }}/build/compilation:$PATH + kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_50.bc common.bc + kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_50.bc components.bc + kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt53.bc + kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_50.bc dwt.bc + kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt97.bc + kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt97.bc + kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt53.bc + hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc + hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc + hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc + hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc + hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc + hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc + hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc + hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc + llc --relocation-model=pic --filetype=obj common.bc + llc --relocation-model=pic --filetype=obj components.bc + llc --relocation-model=pic --filetype=obj fdwt53.bc + llc --relocation-model=pic --filetype=obj dwt.bc + llc --relocation-model=pic --filetype=obj host.bc + llc --relocation-model=pic --filetype=obj common_host.bc + llc --relocation-model=pic --filetype=obj components_host.bc + llc --relocation-model=pic --filetype=obj fdwt53_host.bc + llc --relocation-model=pic --filetype=obj dwt_host.bc + llc --relocation-model=pic --filetype=obj fdwt97_host.bc + llc --relocation-model=pic --filetype=obj rdwt97_host.bc + llc --relocation-model=pic --filetype=obj rdwt53_host.bc + llc --relocation-model=pic --filetype=obj fdwt97.bc + llc --relocation-model=pic --filetype=obj rdwt97.bc + llc --relocation-model=pic --filetype=obj rdwt53.bc + g++ -o dwt2d -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lpthread -lc -lx86Runtime -lthreadPool -pthread + ./dwt2d 4.bmp -d 4x4 -f -5 -l 3 From 82fe95734f57b88f8556958583f27abbf4ec0400 Mon Sep 17 00:00:00 2001 From: jchen706 Date: Sun, 22 May 2022 09:05:46 -0400 Subject: [PATCH 3/3] fix dwt2d worflow command --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f9587b1..49eed29 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -192,4 +192,4 @@ jobs: llc --relocation-model=pic --filetype=obj rdwt97.bc llc --relocation-model=pic --filetype=obj rdwt53.bc g++ -o dwt2d -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lpthread -lc -lx86Runtime -lthreadPool -pthread - ./dwt2d 4.bmp -d 4x4 -f -5 -l 3 + ./dwt2d 192.bmp -d 192x192 -f -5 -l 3