From 308e9284cba5d8092bacae1995274b8fb64c0e72 Mon Sep 17 00:00:00 2001 From: Bhanu Garg Date: Tue, 24 May 2022 20:43:47 -0400 Subject: [PATCH] fix linting issues --- .gitignore | 2 +- README.md | 8 +- compilation/KernelTranslation/lib/init.cpp | 2 +- .../lib/insert_warp_loop.cpp | 3 +- .../lib/memory_hierarchy.cpp | 14 +- examples/dwt2d/common.h | 42 +- examples/dwt2d/components.cu | 24 +- examples/dwt2d/components.h | 17 +- examples/dwt2d/dwt.cu | 76 +- examples/dwt2d/dwt.h | 25 +- examples/dwt2d/dwt_cuda/common.cu | 10 +- examples/dwt2d/dwt_cuda/common.h | 387 ++++----- examples/dwt2d/dwt_cuda/dwt.h | 99 +-- examples/dwt2d/dwt_cuda/fdwt53.cu | 112 +-- examples/dwt2d/dwt_cuda/fdwt97.cu | 72 +- examples/dwt2d/dwt_cuda/io.h | 801 +++++++++--------- examples/dwt2d/dwt_cuda/rdwt53.cu | 64 +- examples/dwt2d/dwt_cuda/rdwt97.cu | 72 +- examples/dwt2d/dwt_cuda/transform_buffer.h | 599 ++++++------- examples/dwt2d/main.cu | 64 +- examples/dwt2d/run_cpu.sh | 1 - examples/dwt2d/test_compile_nvcc.sh | 9 - examples/microbench/cudamemcpy_test.cc | 32 +- examples/microbench/dummy_kernel.cc | 31 +- examples/microbench/kerne_arg.cc | 31 +- examples/microbench/one_thread_kernel.cc | 31 +- runtime/lib/cudaRuntimeImpl.cpp | 2 +- 27 files changed, 1246 insertions(+), 1384 deletions(-) mode change 100755 => 100644 examples/dwt2d/common.h mode change 100755 => 100644 examples/dwt2d/components.h mode change 100755 => 100644 examples/dwt2d/dwt.h mode change 100755 => 100644 examples/dwt2d/dwt_cuda/common.h mode change 100755 => 100644 examples/dwt2d/dwt_cuda/dwt.h mode change 100755 => 100644 examples/dwt2d/dwt_cuda/io.h mode change 100755 => 100644 examples/dwt2d/dwt_cuda/transform_buffer.h diff --git a/.gitignore b/.gitignore index 6893379..0abb42e 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,4 @@ CMakeCache.txt # OS generated files .DS_Store -.DS_Store? \ No newline at end of file +.DS_Store? diff --git a/README.md b/README.md index 6fe1b2b..df76bca 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,11 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS export CuPBoP_PATH=`pwd` export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH ``` -If you are using boson, you can pre-installed llvm 10.0.0 - LLVM_PATH=/opt/llvm-10.0.0 - export PATH=$LLVM_PATH/bin:$PATH - + If you are using boson, you can pre-installed llvm 10.0.0\ + `LLVM_PATH=/opt/llvm-10.0.0`\ + `export PATH=$LLVM_PATH/bin:$PATH` + 2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file ```bash diff --git a/compilation/KernelTranslation/lib/init.cpp b/compilation/KernelTranslation/lib/init.cpp index 4cd5984..62ae6ff 100644 --- a/compilation/KernelTranslation/lib/init.cpp +++ b/compilation/KernelTranslation/lib/init.cpp @@ -396,7 +396,7 @@ void init_block(llvm::Module *M, std::ofstream &fout) { replace_asm_call(M); // replace dynamic shared memory auto dynamic_shared_memory_addr = - M->getGlobalVariable("dynamic_shared_memory"); + M->getGlobalVariable("dynamic_shared_memory"); if (dynamic_shared_memory_addr) { replace_dynamic_shared_memory(M); } diff --git a/compilation/KernelTranslation/lib/insert_warp_loop.cpp b/compilation/KernelTranslation/lib/insert_warp_loop.cpp index 28a5a61..1c090bf 100644 --- a/compilation/KernelTranslation/lib/insert_warp_loop.cpp +++ b/compilation/KernelTranslation/lib/insert_warp_loop.cpp @@ -272,13 +272,12 @@ void AddContextSaveRestore(llvm::Instruction *instruction, std::vector uses; Function *f2 = instruction->getParent()->getParent(); - for (Instruction::use_iterator ui = instruction->use_begin(), ue = instruction->use_end(); ui != ue; ++ui) { llvm::Instruction *user = cast(ui->getUser()); Function *f1 = user->getParent()->getParent(); - if(f2->getName() != f1->getName()) { + if (f2->getName() != f1->getName()) { continue; } if (user == NULL) diff --git a/compilation/KernelTranslation/lib/memory_hierarchy.cpp b/compilation/KernelTranslation/lib/memory_hierarchy.cpp index b05bffd..30fe5ab 100644 --- a/compilation/KernelTranslation/lib/memory_hierarchy.cpp +++ b/compilation/KernelTranslation/lib/memory_hierarchy.cpp @@ -89,20 +89,20 @@ void mem_share2global(llvm::Module *M) { } else if (element_type->isStructTy()) { auto undef = llvm::UndefValue::get(element_type); llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( - *M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef, - new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, - false); + *M, element_type, false, llvm::GlobalValue::ExternalLinkage, + undef, new_name, NULL, + llvm::GlobalValue::GeneralDynamicTLSModel, 0, false); global_memory->setDSOLocal(true); - Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName())); + Comdat *comdat = + M->getOrInsertComdat(StringRef(share_memory->getName())); comdat->setSelectionKind(Comdat::SelectionKind::Any); global_memory->setComdat(comdat); global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage); global_memory->setInitializer(undef); global_memory->setAlignment(share_memory->getAlignment()); corresponding_global_memory.insert( - std::pair(share_memory, - global_memory)); - + std::pair(share_memory, + global_memory)); } else { assert(0 && "The required Share Memory Type is not supported\n"); diff --git a/examples/dwt2d/common.h b/examples/dwt2d/common.h old mode 100755 new mode 100644 index f649804..ac276a1 --- a/examples/dwt2d/common.h +++ b/examples/dwt2d/common.h @@ -1,16 +1,16 @@ -/* +/* * Copyright (c) 2009, Jiri Matela * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -27,9 +27,9 @@ #ifndef _COMMON_H #define _COMMON_H -//24-bit multiplication is faster on G80, -//but we must be sure to multiply integers -//only within [-8M, 8M - 1] range +// 24-bit multiplication is faster on G80, +// but we must be sure to multiply integers +// only within [-8M, 8M - 1] range #define IMUL(a, b) __mul24(a, b) ////cuda timing macros @@ -42,21 +42,23 @@ // cudaEventSynchronize(cstop); \ // cudaEventElapsedTime(&elapsedTime, cstart, cstop) -//divide and round up macro +// divide and round up macro #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b))) -# define cudaCheckError( msg ) { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "%s: %i: %s: %s.\n", \ - __FILE__, __LINE__, msg, cudaGetErrorString( err) ); \ - exit(-1); \ - } } - -# define cudaCheckAsyncError( msg ) { \ - cudaThreadSynchronize(); \ - cudaCheckError( msg ); \ - } +#define cudaCheckError(msg) \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg, \ + cudaGetErrorString(err)); \ + exit(-1); \ + } \ + } +#define cudaCheckAsyncError(msg) \ + { \ + cudaThreadSynchronize(); \ + cudaCheckError(msg); \ + } #endif diff --git a/examples/dwt2d/components.cu b/examples/dwt2d/components.cu index b9721ce..e768b4e 100755 --- a/examples/dwt2d/components.cu +++ b/examples/dwt2d/components.cu @@ -1,16 +1,16 @@ -/* +/* * Copyright (c) 2009, Jiri Matela * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -23,7 +23,7 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ - + #include #include #include @@ -50,7 +50,7 @@ __device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int d_r[pos] = r - 128; d_g[pos] = g - 128; d_b[pos] = b - 128; -} +} /* Store float component */ __device__ void storeComponent(float *d_c, float c, int pos) @@ -66,8 +66,8 @@ __device__ void storeComponent(int *d_c, int c, int pos) /* Copy img src data into three separated component buffers */ template -__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b, - unsigned char * d_src, +__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b, + unsigned char * d_src, int pixels) { int x = threadIdx.x; @@ -75,8 +75,8 @@ __global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b, __shared__ unsigned char sData[THREADS*3]; - /* Copy data to shared mem by 4bytes - other checks are not necessary, since + /* Copy data to shared mem by 4bytes + other checks are not necessary, since d_src buffer is aligned to sharedDataSize */ if ( (x*4) < THREADS*3 ) { float *s = (float *)d_src; @@ -107,8 +107,8 @@ __global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels) __shared__ unsigned char sData[THREADS]; - /* Copy data to shared mem by 4bytes - other checks are not necessary, since + /* Copy data to shared mem by 4bytes + other checks are not necessary, since d_src buffer is aligned to sharedDataSize */ if ( (x*4) < THREADS) { float *s = (float *)d_src; diff --git a/examples/dwt2d/components.h b/examples/dwt2d/components.h old mode 100755 new mode 100644 index 98a2b12..3766cdd --- a/examples/dwt2d/components.h +++ b/examples/dwt2d/components.h @@ -1,16 +1,16 @@ -/* +/* * Copyright (c) 2009, Jiri Matela * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -28,11 +28,12 @@ #define _COMPONENTS_H /* Separate compoents of source 8bit RGB image */ -template -void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height); +template +void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width, + int height); /* Copy a 8bit source image data into a color compoment of type T */ -template -void bwToComponent(T *d_c, unsigned char * src, int width, int height); +template +void bwToComponent(T *d_c, unsigned char *src, int width, int height); #endif diff --git a/examples/dwt2d/dwt.cu b/examples/dwt2d/dwt.cu index f06f2d9..c102bce 100755 --- a/examples/dwt2d/dwt.cu +++ b/examples/dwt2d/dwt.cu @@ -1,16 +1,16 @@ -/* +/* * Copyright (c) 2009, Jiri Matela * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -85,25 +85,25 @@ template int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward) { printf("\n*** %d stages of 2D forward DWT:\n", stages); - + /* create backup of input, because each test iteration overwrites it */ const int size = pixHeight * pixWidth * sizeof(T); cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); - + /* Measure time of individual levels. */ if(forward) fdwt(in, out, pixWidth, pixHeight, stages); else rdwt(in, out, pixWidth, pixHeight, stages); - - // Measure overall time of DWT. + + // Measure overall time of DWT. /* #ifdef GPU_DWT_TESTING_1 - + dwt_cuda::CudaDWTTester tester; for(int i = tester.getNumIterations(); i--; ) { - // Recover input and measure one overall DWT run. - cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); + // Recover input and measure one overall DWT run. + cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); tester.beginTestIteration(); if(forward) @@ -113,8 +113,8 @@ int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int st tester.endTestIteration(); } tester.showPerformance(" Overall DWT", pixWidth, pixHeight); - #endif // GPU_DWT_TESTING - + #endif // GPU_DWT_TESTING + cudaCheckAsyncError("DWT Kernel calls"); */ return 0; } @@ -128,25 +128,25 @@ template int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut) { printf("*** %d stages of 2D forward DWT:\n", stages); - - // create backup of input, because each test iteration overwrites it + + // create backup of input, because each test iteration overwrites it const int size = pixHeight * pixWidth * sizeof(T); cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); - - // Measure time of individual levels. + + // Measure time of individual levels. if(forward) fdwt(in, out, pixWidth, pixHeight, stages, diffOut); else rdwt(in, out, pixWidth, pixHeight, stages); - - // Measure overall time of DWT. + + // Measure overall time of DWT. #ifdef GPU_DWT_TESTING_1 - + dwt_cuda::CudaDWTTester tester; for(int i = tester.getNumIterations(); i--; ) { - // Recover input and measure one overall DWT run. - cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); + // Recover input and measure one overall DWT run. + cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); tester.beginTestIteration(); if(forward) @@ -156,8 +156,8 @@ int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int st tester.endTestIteration(); } tester.showPerformance(" Overall DWT", pixWidth, pixHeight); - #endif // GPU_DWT_TESTING - + #endif // GPU_DWT_TESTING + cudaCheckAsyncError("DWT Kernel calls"); return 0; } @@ -178,8 +178,8 @@ void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char for(i = 0; i < samplesNum; i++) { float r = (src[i]+0.5f) * 255; - if (r > 255) r = 255; - if (r < 0) r = 0; + if (r > 255) r = 255; + if (r < 0) r = 0; dst[i] = (unsigned char)r; outputFile << "index: " << i << " val: "<< r <<" \n"; @@ -199,7 +199,7 @@ void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * for(i = 0; i < samplesNum; i++) { int r = src[i]+128; if (r > 255) r = 255; - if (r < 0) r = 0; + if (r < 0) r = 0; dst[i] = (unsigned char)r; // added this line to output check outputFile << "index: " << i << " val: "<< r <<" \n"; @@ -250,16 +250,16 @@ int writeLinear(T *component_cuda, int pixWidth, int pixHeight, if(x == 0) return 1; return 0; } -template int writeLinear(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); -template int writeLinear(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); +template int writeLinear(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); +template int writeLinear(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); /* Write output visual ordered */ template -int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, - int stages, const char * filename, const char * suffix) +int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, + int stages, const char * filename, const char * suffix) { struct band { - int dimX; + int dimX; int dimY; }; struct dimensions { @@ -309,7 +309,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY); } #endif - + size = samplesNum*sizeof(T); cudaMallocHost((void **)&src, size); cudaCheckError("Malloc host"); @@ -332,7 +332,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY; for (i = 0; i < bandDims[s].HL.dimY; i++) { memcpy(dst+i*pixWidth+bandDims[s].LL.dimX, - src+offset+i*bandDims[s].HL.dimX, + src+offset+i*bandDims[s].HL.dimX, size); } @@ -342,7 +342,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, yOffset = bandDims[s].LL.dimY; for (i = 0; i < bandDims[s].HL.dimY; i++) { memcpy(dst+(yOffset+i)*pixWidth, - src+offset+i*bandDims[s].LH.dimX, + src+offset+i*bandDims[s].LH.dimX, size); } @@ -352,7 +352,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, yOffset = bandDims[s].HL.dimY; for (i = 0; i < bandDims[s].HH.dimY; i++) { memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX, - src+offset+i*bandDims[s].HH.dimX, + src+offset+i*bandDims[s].HH.dimX, size); } } @@ -381,5 +381,5 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, if (x == 0) return 1; return 0; } -template int writeNStage2DDWT(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); -template int writeNStage2DDWT(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); +template int writeNStage2DDWT(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); +template int writeNStage2DDWT(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); diff --git a/examples/dwt2d/dwt.h b/examples/dwt2d/dwt.h old mode 100755 new mode 100644 index fcb4b9a..d84a18e --- a/examples/dwt2d/dwt.h +++ b/examples/dwt2d/dwt.h @@ -1,16 +1,16 @@ -/* +/* * Copyright (c) 2009, Jiri Matela * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -27,14 +27,15 @@ #ifndef _DWT_H #define _DWT_H -template -int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward); +template +int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight, + int stages, bool forward); -template -int writeNStage2DDWT(T *component_cuda, int width, int height, - int stages, const char * filename, const char * suffix); -template -int writeLinear(T *component_cuda, int width, int height, - const char * filename, const char * suffix); +template +int writeNStage2DDWT(T *component_cuda, int width, int height, int stages, + const char *filename, const char *suffix); +template +int writeLinear(T *component_cuda, int width, int height, const char *filename, + const char *suffix); #endif diff --git a/examples/dwt2d/dwt_cuda/common.cu b/examples/dwt2d/dwt_cuda/common.cu index 8ce4984..5936f57 100755 --- a/examples/dwt2d/dwt_cuda/common.cu +++ b/examples/dwt2d/dwt_cuda/common.cu @@ -1,20 +1,20 @@ -/// +/// /// @file common.cu /// @author Martin Jirman (207962@mail.muni.cz) /// @date 2011-01-20 14:37 /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -27,7 +27,7 @@ /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE /// POSSIBILITY OF SUCH DAMAGE. /// - + #include "common.h" namespace dwt_cuda { diff --git a/examples/dwt2d/dwt_cuda/common.h b/examples/dwt2d/dwt_cuda/common.h old mode 100755 new mode 100644 index 37c1979..6fc531e --- a/examples/dwt2d/dwt_cuda/common.h +++ b/examples/dwt2d/dwt_cuda/common.h @@ -1,4 +1,4 @@ -/// +/// /// @file common.h /// @author Martin Jirman (207962@mail.muni.cz) /// @brief Common stuff for all CUDA dwt functions. @@ -6,16 +6,16 @@ /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -29,233 +29,204 @@ /// POSSIBILITY OF SUCH DAMAGE. /// - #ifndef DWT_COMMON_H -#define DWT_COMMON_H +#define DWT_COMMON_H - -#include #include +#include #include - - // compile time minimum macro -#define CTMIN(a,b) (((a) < (b)) ? (a) : (b)) - - +#define CTMIN(a, b) (((a) < (b)) ? (a) : (b)) // performance testing macros #if defined(GPU_DWT_TESTING) - #define PERF_BEGIN \ - { \ - dwt_cuda::CudaDWTTester PERF_TESTER; \ - for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \ - { \ +#define PERF_BEGIN \ + { \ + dwt_cuda::CudaDWTTester PERF_TESTER; \ + for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) { \ PERF_TESTER.beginTestIteration(); - #define PERF_END(PERF_NAME, PERF_W, PERF_H) \ - PERF_TESTER.endTestIteration(); \ - } \ - PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \ +#define PERF_END(PERF_NAME, PERF_W, PERF_H) \ + PERF_TESTER.endTestIteration(); \ + } \ + PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \ } #else // GPU_DWT_TESTING - #define PERF_BEGIN - #define PERF_END(PERF_NAME, PERF_W, PERF_H) +#define PERF_BEGIN +#define PERF_END(PERF_NAME, PERF_W, PERF_H) #endif // GPU_DWT_TESTING - - namespace dwt_cuda { - - - /// Divide and round up. - template - __device__ __host__ inline T divRndUp(const T & n, const T & d) { - return (n / d) + ((n % d) ? 1 : 0); + +/// Divide and round up. +template +__device__ __host__ inline T divRndUp(const T &n, const T &d) { + return (n / d) + ((n % d) ? 1 : 0); +} + +// 9/7 forward DWT lifting schema coefficients +const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1 +const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1 +const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2 +const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2 + +// 9/7 reverse DWT lifting schema coefficients +const float r97update2 = -f97Update2; ///< undo 9/7 update 2 +const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2 +const float r97update1 = -f97Update1; ///< undo 9/7 update 1 +const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1 + +// FDWT 9/7 scaling coefficients +const float scale97Mul = 1.23017410491400f; +const float scale97Div = 1.0 / scale97Mul; + +// 5/3 forward DWT lifting schema coefficients +const float forward53Predict = -0.5f; /// forward 5/3 predict +const float forward53Update = 0.25f; /// forward 5/3 update + +// 5/3 forward DWT lifting schema coefficients +const float reverse53Update = -forward53Update; /// undo 5/3 update +const float reverse53Predict = -forward53Predict; /// undo 5/3 predict + +/// Functor which adds scaled sum of neighbors to given central pixel. +struct AddScaledSum { + const float scale; // scale of neighbors + __device__ AddScaledSum(const float scale) : scale(scale) {} + __device__ void operator()(const float p, float &c, const float n) const { + + // if(threadIdx.x == 0) { + + // printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, + // scale * (p + n) ); + + // } + + c += scale * (p + n); } - - - // 9/7 forward DWT lifting schema coefficients - const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1 - const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1 - const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2 - const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2 +}; +/// Returns index ranging from 0 to num threads, such that first half +/// of threads get even indices and others get odd indices. Each thread +/// gets different index. +/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7 +/// parityIdx: 0 2 4 6 1 3 5 7 +/// @tparam THREADS total count of participating threads +/// @return parity-separated index of thread +template __device__ inline int parityIdx() { + return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); +} - // 9/7 reverse DWT lifting schema coefficients - const float r97update2 = -f97Update2; ///< undo 9/7 update 2 - const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2 - const float r97update1 = -f97Update1; ///< undo 9/7 update 1 - const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1 - - // FDWT 9/7 scaling coefficients - const float scale97Mul = 1.23017410491400f; - const float scale97Div = 1.0 / scale97Mul; - - - // 5/3 forward DWT lifting schema coefficients - const float forward53Predict = -0.5f; /// forward 5/3 predict - const float forward53Update = 0.25f; /// forward 5/3 update - - // 5/3 forward DWT lifting schema coefficients - const float reverse53Update = -forward53Update; /// undo 5/3 update - const float reverse53Predict = -forward53Predict; /// undo 5/3 predict - - - - /// Functor which adds scaled sum of neighbors to given central pixel. - struct AddScaledSum { - const float scale; // scale of neighbors - __device__ AddScaledSum(const float scale) : scale(scale) {} - __device__ void operator()(const float p, float & c, const float n) const { +/// size of shared memory +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) +const int SHM_SIZE = 48 * 1024; +#else +const int SHM_SIZE = 16 * 1024; +#endif - // if(threadIdx.x == 0) { - - // printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) ); - - // } - - c += scale * (p + n); +/// Perrformance and return code tester. +class CudaDWTTester { +private: + static bool testRunning; ///< true if any test is currently running + cudaEvent_t beginEvent; ///< begin CUDA event + cudaEvent_t endEvent; ///< end CUDA event + std::vector times; ///< collected times + const bool disabled; ///< true if this object is disabled +public: + /// Checks CUDA related error. + /// @param status return code to be checked + /// @param message message to be shown if there was an error + /// @return true if there was no error, false otherwise + static bool check(const cudaError_t &status, const char *message) { +#if defined(GPU_DWT_TESTING) + if ((!testRunning) && status != cudaSuccess) { + const char *errorString = cudaGetErrorString(status); + fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString); + fflush(stderr); + return false; } - }; - - - - /// Returns index ranging from 0 to num threads, such that first half - /// of threads get even indices and others get odd indices. Each thread - /// gets different index. - /// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7 - /// parityIdx: 0 2 4 6 1 3 5 7 - /// @tparam THREADS total count of participating threads - /// @return parity-separated index of thread - template - __device__ inline int parityIdx() { - return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); +#endif // GPU_DWT_TESTING + return true; } - - - - /// size of shared memory - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int SHM_SIZE = 48 * 1024; - #else - const int SHM_SIZE = 16 * 1024; - #endif - - - - /// Perrformance and return code tester. - class CudaDWTTester { - private: - static bool testRunning; ///< true if any test is currently running - cudaEvent_t beginEvent; ///< begin CUDA event - cudaEvent_t endEvent; ///< end CUDA event - std::vector times; ///< collected times - const bool disabled; ///< true if this object is disabled - public: - /// Checks CUDA related error. - /// @param status return code to be checked - /// @param message message to be shown if there was an error - /// @return true if there was no error, false otherwise - static bool check(const cudaError_t & status, const char * message) { - #if defined(GPU_DWT_TESTING) - if((!testRunning) && status != cudaSuccess) { - const char * errorString = cudaGetErrorString(status); - fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString); - fflush(stderr); - return false; - } - #endif // GPU_DWT_TESTING - return true; - } - /// Checks last kernel call for errors. - /// @param message description of the kernel call - /// @return true if there was no error, false otherwise - static bool checkLastKernelCall(const char * message) { - #if defined(GPU_DWT_TESTING) - return testRunning ? true : check(cudaThreadSynchronize(), message); - #else // GPU_DWT_TESTING - return true; - #endif // GPU_DWT_TESTING - } - - /// Initializes DWT tester for time measurement - CudaDWTTester() : disabled(testRunning) {} - - /// Gets rpefered number of iterations - int getNumIterations() { - return disabled ? 1 : 31; - } - - /// Starts one test iteration. - void beginTestIteration() { - if(!disabled) { - cudaEventCreate(&beginEvent); - cudaEventCreate(&endEvent); - cudaEventRecord(beginEvent, 0); - testRunning = true; - } - } - - /// Ends on etest iteration. - void endTestIteration() { - if(!disabled) { - float time; - testRunning = false; - cudaEventRecord(endEvent, 0); - cudaEventSynchronize(endEvent); - cudaEventElapsedTime(&time, beginEvent, endEvent); - cudaEventDestroy(beginEvent); - cudaEventDestroy(endEvent); - times.push_back(time); - } - } - - /// Shows brief info about all iterations. - /// @param name name of processing method - /// @param sizeX width of processed image - /// @param sizeY height of processed image - void showPerformance(const char * name, const int sizeX, const int sizeY) { - if(!disabled) { - // compute mean and median - std::sort(times.begin(), times.end()); - double sum = 0; - for(int i = times.size(); i--; ) { - sum += times[i]; - } - const double median = (times[times.size() / 2] - + times[(times.size() - 1) / 2]) * 0.5f; - printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) " - "(%d x %d)\n", name, (sum / times.size()), median, - times[times.size() - 1], sizeX, sizeY); - } - } - }; - - - - /// Simple cudaMemcpy wrapped in performance tester. - /// @param dest destination bufer - /// @param src source buffer - /// @param sx width of copied image - /// @param sy height of copied image - template - inline void memCopy(T * const dest, const T * const src, - const size_t sx, const size_t sy) { - cudaError_t status; - PERF_BEGIN - status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice); - PERF_END(" memcpy", sx, sy) - CudaDWTTester::check(status, "memcpy device > device"); + /// Checks last kernel call for errors. + /// @param message description of the kernel call + /// @return true if there was no error, false otherwise + static bool checkLastKernelCall(const char *message) { +#if defined(GPU_DWT_TESTING) + return testRunning ? true : check(cudaThreadSynchronize(), message); +#else // GPU_DWT_TESTING + return true; +#endif // GPU_DWT_TESTING } - - - + + /// Initializes DWT tester for time measurement + CudaDWTTester() : disabled(testRunning) {} + + /// Gets rpefered number of iterations + int getNumIterations() { return disabled ? 1 : 31; } + + /// Starts one test iteration. + void beginTestIteration() { + if (!disabled) { + cudaEventCreate(&beginEvent); + cudaEventCreate(&endEvent); + cudaEventRecord(beginEvent, 0); + testRunning = true; + } + } + + /// Ends on etest iteration. + void endTestIteration() { + if (!disabled) { + float time; + testRunning = false; + cudaEventRecord(endEvent, 0); + cudaEventSynchronize(endEvent); + cudaEventElapsedTime(&time, beginEvent, endEvent); + cudaEventDestroy(beginEvent); + cudaEventDestroy(endEvent); + times.push_back(time); + } + } + + /// Shows brief info about all iterations. + /// @param name name of processing method + /// @param sizeX width of processed image + /// @param sizeY height of processed image + void showPerformance(const char *name, const int sizeX, const int sizeY) { + if (!disabled) { + // compute mean and median + std::sort(times.begin(), times.end()); + double sum = 0; + for (int i = times.size(); i--;) { + sum += times[i]; + } + const double median = + (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f; + printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) " + "(%d x %d)\n", + name, (sum / times.size()), median, times[times.size() - 1], sizeX, + sizeY); + } + } +}; + +/// Simple cudaMemcpy wrapped in performance tester. +/// @param dest destination bufer +/// @param src source buffer +/// @param sx width of copied image +/// @param sy height of copied image +template +inline void memCopy(T *const dest, const T *const src, const size_t sx, + const size_t sy) { + cudaError_t status; + PERF_BEGIN + status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice); + PERF_END(" memcpy", sx, sy) + CudaDWTTester::check(status, "memcpy device > device"); +} + } // end of namespace dwt_cuda - - -#endif // DWT_COMMON_CUDA_H - +#endif // DWT_COMMON_CUDA_H diff --git a/examples/dwt2d/dwt_cuda/dwt.h b/examples/dwt2d/dwt_cuda/dwt.h old mode 100755 new mode 100644 index 2c76708..d6e2161 --- a/examples/dwt2d/dwt_cuda/dwt.h +++ b/examples/dwt2d/dwt_cuda/dwt.h @@ -1,4 +1,4 @@ -/// +/// /// @file dwt.h /// @author Martin Jirman (207962@mail.muni.cz) /// @brief Entry points for CUDA implementaion of 9/7 and 5/3 DWT. @@ -8,16 +8,16 @@ /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -56,57 +56,48 @@ /// #ifndef DWT_CUDA_H -#define DWT_CUDA_H - +#define DWT_CUDA_H namespace dwt_cuda { - - - /// Forward 5/3 2D DWT. See common rules (above) for more details. - /// @param in Expected to be normalized into range [-128, 127]. - /// Will not be preserved (will be overwritten). - /// @param out output buffer on GPU - /// @param sizeX width of input image (in pixels) - /// @param sizeY height of input image (in pixels) - /// @param levels number of recursive DWT levels - void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels); - - - /// Reverse 5/3 2D DWT. See common rules (above) for more details. - /// @param in Input DWT coefficients. Format described in common rules. - /// Will not be preserved (will be overwritten). - /// @param out output buffer on GPU - will contain original image - /// in normalized range [-128, 127]. - /// @param sizeX width of input image (in pixels) - /// @param sizeY height of input image (in pixels) - /// @param levels number of recursive DWT levels - void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels); - - - /// Forward 9/7 2D DWT. See common rules (above) for more details. - /// @param in Input DWT coefficients. Should be normalized (in range - /// [-0.5, 0.5]). Will not be preserved (will be overwritten). - /// @param out output buffer on GPU - format specified in common rules - /// @param sizeX width of input image (in pixels) - /// @param sizeY height of input image (in pixels) - /// @param levels number of recursive DWT levels - void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels); - - - /// Reverse 9/7 2D DWT. See common rules (above) for more details. - /// @param in Input DWT coefficients. Format described in common rules. - /// Will not be preserved (will be overwritten). - /// @param out output buffer on GPU - will contain original image - /// in normalized range [-0.5, 0.5]. - /// @param sizeX width of input image (in pixels) - /// @param sizeY height of input image (in pixels) - /// @param levels number of recursive DWT levels - void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels); - - + +/// Forward 5/3 2D DWT. See common rules (above) for more details. +/// @param in Expected to be normalized into range [-128, 127]. +/// Will not be preserved (will be overwritten). +/// @param out output buffer on GPU +/// @param sizeX width of input image (in pixels) +/// @param sizeY height of input image (in pixels) +/// @param levels number of recursive DWT levels +void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels); + +/// Reverse 5/3 2D DWT. See common rules (above) for more details. +/// @param in Input DWT coefficients. Format described in common rules. +/// Will not be preserved (will be overwritten). +/// @param out output buffer on GPU - will contain original image +/// in normalized range [-128, 127]. +/// @param sizeX width of input image (in pixels) +/// @param sizeY height of input image (in pixels) +/// @param levels number of recursive DWT levels +void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels); + +/// Forward 9/7 2D DWT. See common rules (above) for more details. +/// @param in Input DWT coefficients. Should be normalized (in range +/// [-0.5, 0.5]). Will not be preserved (will be overwritten). +/// @param out output buffer on GPU - format specified in common rules +/// @param sizeX width of input image (in pixels) +/// @param sizeY height of input image (in pixels) +/// @param levels number of recursive DWT levels +void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels); + +/// Reverse 9/7 2D DWT. See common rules (above) for more details. +/// @param in Input DWT coefficients. Format described in common rules. +/// Will not be preserved (will be overwritten). +/// @param out output buffer on GPU - will contain original image +/// in normalized range [-0.5, 0.5]. +/// @param sizeX width of input image (in pixels) +/// @param sizeY height of input image (in pixels) +/// @param levels number of recursive DWT levels +void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels); + } // namespace dwt_cuda - - -#endif // DWT_CUDA_H - +#endif // DWT_CUDA_H diff --git a/examples/dwt2d/dwt_cuda/fdwt53.cu b/examples/dwt2d/dwt_cuda/fdwt53.cu index 588acf4..c50bdd2 100755 --- a/examples/dwt2d/dwt_cuda/fdwt53.cu +++ b/examples/dwt2d/dwt_cuda/fdwt53.cu @@ -6,16 +6,16 @@ /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -44,7 +44,7 @@ namespace dwt_cuda { template class FDWT53 { private: - + /// Info needed for processing of one input column. /// @tparam CHECKED_LOADER true if column's loader should check boundaries /// false if there are no near boudnaries to check @@ -52,13 +52,13 @@ namespace dwt_cuda { struct FDWT53Column { /// loader for the column VerticalDWTPixelLoader loader; - + /// offset of the column in shared buffer - int offset; - + int offset; + // backup of first 3 loaded pixels (not transformed) int pixel0, pixel1, pixel2; - + /// Sets all fields to anything to prevent 'uninitialized' warnings. __device__ void clear() { offset = pixel0 = pixel1 = pixel2 = 0; @@ -104,7 +104,7 @@ namespace dwt_cuda { /// @param colIndex x-axis coordinate of the column (relative to the left /// side of this threadblock's block of input pixels) /// @param firstY y-axis coordinate of first image row to be transformed - + template __device__ void initColumn(FDWT53Column & column, const int * const input, @@ -137,7 +137,7 @@ namespace dwt_cuda { column.pixel2 = column.loader.loadFrom(input); // Now, the next pixel, which will be loaded by loader, is pixel #1. } - + } @@ -153,14 +153,14 @@ namespace dwt_cuda { buffer[column.offset + 0 * STRIDE] = column.pixel0; buffer[column.offset + 1 * STRIDE] = column.pixel1; buffer[column.offset + 2 * STRIDE] = column.pixel2; - + // load remaining pixels to be able to vertically transform the window - for(int i = 3; i < (3 + WIN_SIZE_Y); i++) + for(int i = 3; i < (3 + WIN_SIZE_Y); i++) { buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input); } - + // remember last 3 pixels for use in next iteration column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE]; column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE]; @@ -169,7 +169,7 @@ namespace dwt_cuda { // vertically transform the column in transform buffer buffer.forEachVerticalOdd(column.offset, Forward53Predict()); buffer.forEachVerticalEven(column.offset, Forward53Update()); - + } @@ -178,7 +178,7 @@ namespace dwt_cuda { /// @tparam CHECK_WRITES true if output writer must check boundaries /// @param in input image /// @param out output buffer - /// @param sizeX width of the input image + /// @param sizeX width of the input image /// @param sizeY height of the input image /// @param winSteps number of sliding window steps template @@ -186,15 +186,15 @@ namespace dwt_cuda { const int sizeX, const int sizeY, const int winSteps) { // info about one main and one boundary columns processed by this thread - FDWT53Column column; + FDWT53Column column; FDWT53Column boundaryColumn; // only few threads use this - // Initialize all column info: initialize loaders, compute offset of + // Initialize all column info: initialize loaders, compute offset of // column in shared buffer and initialize loader of column. const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps; initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); //has been checked Mar 9th - + // first 3 threads initialize boundary columns, others do not use them boundaryColumn.clear(); if(threadIdx.x < 3) { @@ -205,9 +205,9 @@ namespace dwt_cuda { initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY); } - - - // index of column which will be written into output by this thread + + + // index of column which will be written into output by this thread const int outColumnIndex = parityIdx(); // offset of column which will be written by this thread into output @@ -219,7 +219,7 @@ namespace dwt_cuda { writer.init(sizeX, sizeY, outputFirstX, firstY); __syncthreads(); - + // Sliding window iterations: // Each iteration assumes that first 3 pixels of each column are loaded. for(int w = 0; w < winSteps; w++) { @@ -227,23 +227,23 @@ namespace dwt_cuda { // For each column (including boundary columns): load and vertically // transform another WIN_SIZE_Y lines. loadAndVerticallyTransform(column, in); - if(threadIdx.x < 3) { - loadAndVerticallyTransform(boundaryColumn, in); + if(threadIdx.x < 3) { + loadAndVerticallyTransform(boundaryColumn, in); } - + // wait for all columns to be vertically transformed and transform all // output rows horizontally __syncthreads(); - + buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict()); __syncthreads(); - + buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update()); // wait for all output rows to be transformed horizontally and write // them into output buffer - __syncthreads(); + __syncthreads(); for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) { @@ -256,20 +256,20 @@ namespace dwt_cuda { // before proceeding to next iteration, wait for all output columns // to be written into the output __syncthreads(); - + } - + } - + public: /// Determines, whether this block's pixels touch boundary and selects /// right version of algorithm according to it - for many threadblocks, it - /// selects version which does not deal with boundary mirroring and thus is + /// selects version which does not deal with boundary mirroring and thus is /// slightly faster. /// @param in input image /// @param out output buffer - /// @param sx width of the input image + /// @param sx width of the input image /// @param sy height of the input image /// @param steps number of sliding window steps __device__ static void run(const int * const in, int * const out, @@ -292,32 +292,32 @@ namespace dwt_cuda { // if(threadIdx.x == 0) { // printf("fdwt53 run"); // } - if(atBottomBoudary) + if(atBottomBoudary) { // near bottom boundary => check both writing and reading fdwt53.transform(in, out, sx, sy, steps); - } else if(atRightBoudary) + } else if(atRightBoudary) { // near right boundary only => check writing only fdwt53.transform(in, out, sx, sy, steps); - } else + } else { // no nearby boundary => check nothing fdwt53.transform(in, out, sx, sy, steps); } } // } - + }; // end of class FDWT53 - - - + + + /// Main GPU 5/3 FDWT entry point. /// @tparam WIN_SX width of sliding window to be used /// @tparam WIN_SY height of sliding window to be used /// @param input input image /// @param output output buffer - /// @param sizeX width of the input image + /// @param sizeX width of the input image /// @param sizeY height of the input image /// @param winSteps number of sliding window steps template @@ -328,20 +328,20 @@ namespace dwt_cuda { FDWT53::run(input, output, sizeX, sizeY, winSteps); } - - /// Only computes optimal number of sliding window steps, + + /// Only computes optimal number of sliding window steps, /// number of threadblocks and then lanches the 5/3 FDWT kernel. /// @tparam WIN_SX width of sliding window /// @tparam WIN_SY height of sliding window /// @param in input image /// @param out output buffer - /// @param sx width of the input image + /// @param sx width of the input image /// @param sy height of the input image template void launchFDWT53Kernel (int * in, int * out, int sx, int sy) { // compute optimal number of steps of each sliding window - + const int steps = divRndUp(sy, 15 * WIN_SY); int gx = divRndUp(sx, WIN_SX); @@ -352,18 +352,18 @@ namespace dwt_cuda { // prepare grid size dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX); - + // run kernel, possibly measure time and finally check the call // PERF_BEGIN fdwt53Kernel<<>>(in, out, sx, sy, steps); // PERF_END(" FDWT53", sx, sy) // CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel"); printf("fdwt53Kernel in launchFDWT53Kernel has finished"); - + } - - - + + + /// Forward 5/3 2D DWT. See common rules (above) for more details. /// @param in Expected to be normalized into range [-128, 127]. /// Will not be preserved (will be overwritten). @@ -373,7 +373,7 @@ namespace dwt_cuda { /// @param levels number of recursive DWT levels void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) { // select right width of kernel for the size of the image - + if(sizeX >= 960) { launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY); } else if (sizeX >= 480) { @@ -381,20 +381,20 @@ namespace dwt_cuda { } else { launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY); } - + // if this was not the last level, continue recursively with other levels if(levels > 1) { // copy output's LL band back into input buffer - const int llSizeX = divRndUp(sizeX, 2); + const int llSizeX = divRndUp(sizeX, 2); const int llSizeY = divRndUp(sizeY, 2); // printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY); memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238 - + // run remaining levels of FDWT fdwt53(in, out, llSizeX, llSizeY, levels - 1); } } - - + + } // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/fdwt97.cu b/examples/dwt2d/dwt_cuda/fdwt97.cu index d61f674..402f8fe 100755 --- a/examples/dwt2d/dwt_cuda/fdwt97.cu +++ b/examples/dwt2d/dwt_cuda/fdwt97.cu @@ -1,4 +1,4 @@ -/// +/// /// @file fdwt97.cu /// @brief CUDA implementation of forward 9/7 2D DWT. /// @author Martin Jirman (207962@mail.muni.cz) @@ -7,16 +7,16 @@ /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -38,8 +38,8 @@ namespace dwt_cuda { - - + + /// Wraps a buffer and methods for computing 9/7 FDWT with sliding window /// of specified size. Template arguments specify this size. /// @tparam WIN_SIZE_X width of sliding window @@ -62,8 +62,8 @@ namespace dwt_cuda { template struct FDWT97ColumnLoadingInfo { /// Loader of pixels from some input image. - VerticalDWTPixelLoader loader; - + VerticalDWTPixelLoader loader; + /// Offset of column loaded by loader. (Offset in shared buffer.) int offset; }; @@ -103,7 +103,7 @@ namespace dwt_cuda { /// @param firstY index of first row to be loaded from image template __device__ void initColumn(FDWT97ColumnLoadingInfo & column, - const int columnIndex, const float * const input, + const int columnIndex, const float * const input, const int sizeX, const int sizeY, const int firstY) { // get offset of the column with index 'columnIndex' @@ -113,7 +113,7 @@ namespace dwt_cuda { // x-coordinate of the first pixel to be loaded by given loader const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex; - + if(blockIdx.y == 0) { // topmost block - apply mirroring rules when loading first 7 rows column.loader.init(sizeX, sizeY, firstX, firstY); @@ -162,7 +162,7 @@ namespace dwt_cuda { /// @tparam CHECK_WRITES true if boundaries should be checked when writing /// @param in input image /// @param out output buffer - /// @param sizeX width of the input image + /// @param sizeX width of the input image /// @param sizeY height of the input image /// @param winSteps number of steps of sliding window template @@ -205,7 +205,7 @@ namespace dwt_cuda { // transform buffer offset of column transformed and saved by this thread const int outColumnOffset = buffer.getColumnOffset(outColumnIndex); - // (Each iteration of this loop assumes that first 7 rows of transform + // (Each iteration of this loop assumes that first 7 rows of transform // buffer are already loaded with horizontally transformed coefficients.) for(int w = 0; w < winSteps; w++) { // Load another WIN_SIZE_Y lines of thread's column into the buffer. @@ -220,7 +220,7 @@ namespace dwt_cuda { horizontalFDWT97(WIN_SIZE_Y, 7); // Using 7 registers, remember current values of last 7 rows of - // transform buffer. These rows are transformed horizontally only + // transform buffer. These rows are transformed horizontally only // and will be used in next iteration. float last7Lines[7]; for(int i = 0; i < 7; i++) { @@ -249,7 +249,7 @@ namespace dwt_cuda { // As expected, these lines are already horizontally transformed. for(int i = 0; i < 7; i++) { buffer[outColumnOffset + i * STRIDE] = last7Lines[i]; - + } // Wait for all writing threads before proceeding to loading new @@ -259,15 +259,15 @@ namespace dwt_cuda { } } - - + + public: /// Runs one of specialized variants of 9/7 FDWT according to distance of - /// processed pixels to image boudnary. Some variants do not check for + /// processed pixels to image boudnary. Some variants do not check for /// boudnary and thus are slightly faster. /// @param in input image /// @param out output buffer - /// @param sx width of the input image + /// @param sx width of the input image /// @param sy height of the input image /// @param steps number of steps of sliding window __device__ static void run(const float * const input, float * const output, @@ -299,15 +299,15 @@ namespace dwt_cuda { fdwt97.transform(input, output, sx, sy, steps); } } - + }; // end of class FDWT97 - - - + + + /// Main GPU 9/7 FDWT entry point. /// @param input input image /// @parma output output buffer - /// @param sx width of the input image + /// @param sx width of the input image /// @param sy height of the input image /// @param steps number of steps of sliding window template @@ -321,21 +321,21 @@ namespace dwt_cuda { FDWT97::run(input, output, sx, sy, steps); } - - - /// Only computes optimal number of sliding window steps, + + + /// Only computes optimal number of sliding window steps, /// number of threadblocks and then lanches the 9/7 FDWT kernel. /// @tparam WIN_SX width of sliding window /// @tparam WIN_SY height of sliding window /// @param in input image /// @param out output buffer - /// @param sx width of the input image + /// @param sx width of the input image /// @param sy height of the input image template void launchFDWT97Kernel (float * in, float * out, int sx, int sy) { // compute optimal number of steps of each sliding window const int steps = divRndUp(sy, 15 * WIN_SY); - + // prepare grid size dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX); @@ -346,11 +346,11 @@ namespace dwt_cuda { PERF_END(" FDWT97", sx, sy) CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel"); } - - - + + + /// Forward 9/7 2D DWT. See common rules (dwt.h) for more details. - /// @param in Input DWT coefficients. Should be normalized (in range + /// @param in Input DWT coefficients. Should be normalized (in range /// [-0.5, 0.5]). Will not be preserved (will be overwritten). /// @param out output buffer on GPU - format specified in common rules /// @param sizeX width of input image (in pixels) @@ -365,19 +365,19 @@ namespace dwt_cuda { } else { launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY); } - + // if this was not the last level, continue recursively with other levels if(levels > 1) { // copy output's LL band back into input buffer const int llSizeX = divRndUp(sizeX, 2); const int llSizeY = divRndUp(sizeY, 2); memCopy(in, out, llSizeX, llSizeY); - + // run remaining levels of FDWT fdwt97(in, out, llSizeX, llSizeY, levels - 1); } } - - + + } // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/io.h b/examples/dwt2d/dwt_cuda/io.h old mode 100755 new mode 100644 index 741def0..ae57ffc --- a/examples/dwt2d/dwt_cuda/io.h +++ b/examples/dwt2d/dwt_cuda/io.h @@ -3,20 +3,20 @@ /// @brief Manages loading and saving lineary stored bands and input images. /// @author Martin Jirman (207962@mail.muni.cz) /// @date 2011-01-20 22:38 -/// +/// /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -30,454 +30,411 @@ /// POSSIBILITY OF SUCH DAMAGE. /// - #ifndef IO_H -#define IO_H - +#define IO_H #include "common.h" namespace dwt_cuda { - - /// Base for all IO classes - manages mirroring. - class DWTIO { - protected: - /// Handles mirroring of image at edges in a DWT correct way. - /// @param d a position in the image (will be replaced by mirrored d) - /// @param sizeD size of the image along the dimension of 'd' - __device__ static void mirror(int & d, const int & sizeD) { - // TODO: enable multiple mirroring: -// if(sizeD > 1) { -// if(d < 0) { -// const int underflow = -1 - d; -// const int phase = (underflow / (sizeD - 1)) & 1; -// const int remainder = underflow % (sizeD - 1); -// if(phase == 0) { -// d = remainder + 1; -// } else { -// d = sizeD - 2 - remainder; -// } -// } else if(d >= sizeD) { -// const int overflow = d - sizeD; -// const int phase = (overflow / (sizeD - 1)) & 1; -// const int remainder = overflow % (sizeD - 1); -// if(phase == 0) { -// d = sizeD - 2 - remainder; -// } else { -// d = remainder + 1; -// } -// } -// } else { -// d = 0; -// } - //for test the mirror's use Feb 17 - if(d >= sizeD) { - d = 2 * sizeD - 2 - d; - } else if(d < 0) { - d = -d; - } +/// Base for all IO classes - manages mirroring. +class DWTIO { +protected: + /// Handles mirroring of image at edges in a DWT correct way. + /// @param d a position in the image (will be replaced by mirrored d) + /// @param sizeD size of the image along the dimension of 'd' + __device__ static void mirror(int &d, const int &sizeD) { + // TODO: enable multiple mirroring: + // if(sizeD > 1) { + // if(d < 0) { + // const int underflow = -1 - d; + // const int phase = (underflow / (sizeD - 1)) & 1; + // const int remainder = underflow % (sizeD - 1); + // if(phase == 0) { + // d = remainder + 1; + // } else { + // d = sizeD - 2 - remainder; + // } + // } else if(d >= sizeD) { + // const int overflow = d - sizeD; + // const int phase = (overflow / (sizeD - 1)) & 1; + // const int remainder = overflow % (sizeD - 1); + // if(phase == 0) { + // d = sizeD - 2 - remainder; + // } else { + // d = remainder + 1; + // } + // } + // } else { + // d = 0; + // } + // for test the mirror's use Feb 17 + if (d >= sizeD) { + d = 2 * sizeD - 2 - d; + } else if (d < 0) { + d = -d; } - }; + } +}; +/// Base class for pixel loader and writer - manages computing start index, +/// stride and end of image for loading column of pixels. +/// @tparam T type of image pixels +/// @tparam CHECKED true = be prepared to image boundary, false = don't care +template class VerticalDWTPixelIO : protected DWTIO { +protected: + int end; ///< index of bottom neightbor of last pixel of column + int stride; ///< increment of pointer to get to next pixel - /// Base class for pixel loader and writer - manages computing start index, - /// stride and end of image for loading column of pixels. - /// @tparam T type of image pixels - /// @tparam CHECKED true = be prepared to image boundary, false = don't care - template - class VerticalDWTPixelIO : protected DWTIO { - protected: - int end; ///< index of bottom neightbor of last pixel of column - int stride; ///< increment of pointer to get to next pixel + /// Initializes pixel IO - sets end index and a position of first pixel. + /// @param sizeX width of the image + /// @param sizeY height of the image + /// @param firstX x-coordinate of first pixel to use + /// @param firstY y-coordinate of first pixel to use + /// @return index of pixel at position [x, y] in the image + __device__ int initialize(const int sizeX, const int sizeY, int firstX, + int firstY) { + // initialize all pointers and stride + end = CHECKED ? (sizeY * sizeX + firstX) : 0; + stride = sizeX; + return firstX + sizeX * firstY; + } +}; - /// Initializes pixel IO - sets end index and a position of first pixel. - /// @param sizeX width of the image - /// @param sizeY height of the image - /// @param firstX x-coordinate of first pixel to use - /// @param firstY y-coordinate of first pixel to use - /// @return index of pixel at position [x, y] in the image - __device__ int initialize(const int sizeX, const int sizeY, - int firstX, int firstY) { - // initialize all pointers and stride - end = CHECKED ? (sizeY * sizeX + firstX) : 0; - stride = sizeX; - return firstX + sizeX * firstY; - } - }; +/// Writes reverse transformed pixels directly into output image. +/// @tparam T type of output pixels +/// @tparam CHECKED true = be prepared to image boundary, false = don't care +template +class VerticalDWTPixelWriter : VerticalDWTPixelIO { +private: + int next; // index of the next pixel to be loaded - - - /// Writes reverse transformed pixels directly into output image. - /// @tparam T type of output pixels - /// @tparam CHECKED true = be prepared to image boundary, false = don't care - template - class VerticalDWTPixelWriter : VerticalDWTPixelIO { - private: - int next; // index of the next pixel to be loaded - - public: - /// Initializes writer - sets output buffer and a position of first pixel. - /// @param sizeX width of the image - /// @param sizeY height of the image - /// @param firstX x-coordinate of first pixel to write into - /// @param firstY y-coordinate of first pixel to write into - __device__ void init(const int sizeX, const int sizeY, - int firstX, int firstY) { - if(firstX < sizeX) { - next = this->initialize(sizeX, sizeY, firstX, firstY); - } else { - this->end = 0; - this->stride = 0; - next = 0; - } - } - - /// Writes given value at next position and advances internal pointer while - /// correctly handling mirroring. - /// @param output output image to write pixel into - /// @param value value of the pixel to be written - __device__ void writeInto(T * const output, const T & value) { - if((!CHECKED) || (next != this->end)) { - output[next] = value; - next += this->stride; - } - } - }; - - - - /// Loads pixels from input image. - /// @tparam T type of image input pixels - /// @tparam CHECKED true = be prepared to image boundary, false = don't care - template - class VerticalDWTPixelLoader - : protected VerticalDWTPixelIO { - private: - int last; ///< index of last loaded pixel - public: - - - //******************* FOR TEST ********************** - __device__ int getlast(){ - return last; - } - __device__ int getend(){ - return this->end; - } - __device__ int getstride(){ - return this->stride; - } - __device__ void setend(int a){ - this->end=a; - } - //******************* FOR TEST ********************** - - - - /// Initializes loader - sets input size and a position of first pixel. - /// @param sizeX width of the image - /// @param sizeY height of the image - /// @param firstX x-coordinate of first pixel to load - /// @param firstY y-coordinate of first pixel to load - __device__ void init(const int sizeX, const int sizeY, - int firstX, int firstY) { - // correctly mirror x coordinate - this->mirror(firstX, sizeX); - - // 'last' always points to already loaded pixel (subtract sizeX = stride) - last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX; - //last = (FirstX + sizeX * FirstY) - sizeX - } - - /// Sets all fields to zeros, for compiler not to complain about - /// uninitialized stuff. - __device__ void clear() { +public: + /// Initializes writer - sets output buffer and a position of first pixel. + /// @param sizeX width of the image + /// @param sizeY height of the image + /// @param firstX x-coordinate of first pixel to write into + /// @param firstY y-coordinate of first pixel to write into + __device__ void init(const int sizeX, const int sizeY, int firstX, + int firstY) { + if (firstX < sizeX) { + next = this->initialize(sizeX, sizeY, firstX, firstY); + } else { this->end = 0; this->stride = 0; - this->last = 0; + next = 0; + } + } + + /// Writes given value at next position and advances internal pointer while + /// correctly handling mirroring. + /// @param output output image to write pixel into + /// @param value value of the pixel to be written + __device__ void writeInto(T *const output, const T &value) { + if ((!CHECKED) || (next != this->end)) { + output[next] = value; + next += this->stride; + } + } +}; + +/// Loads pixels from input image. +/// @tparam T type of image input pixels +/// @tparam CHECKED true = be prepared to image boundary, false = don't care +template +class VerticalDWTPixelLoader : protected VerticalDWTPixelIO { +private: + int last; ///< index of last loaded pixel +public: + //******************* FOR TEST ********************** + __device__ int getlast() { return last; } + __device__ int getend() { return this->end; } + __device__ int getstride() { return this->stride; } + __device__ void setend(int a) { this->end = a; } + //******************* FOR TEST ********************** + + /// Initializes loader - sets input size and a position of first pixel. + /// @param sizeX width of the image + /// @param sizeY height of the image + /// @param firstX x-coordinate of first pixel to load + /// @param firstY y-coordinate of first pixel to load + __device__ void init(const int sizeX, const int sizeY, int firstX, + int firstY) { + // correctly mirror x coordinate + this->mirror(firstX, sizeX); + + // 'last' always points to already loaded pixel (subtract sizeX = stride) + last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX; + // last = (FirstX + sizeX * FirstY) - sizeX + } + + /// Sets all fields to zeros, for compiler not to complain about + /// uninitialized stuff. + __device__ void clear() { + this->end = 0; + this->stride = 0; + this->last = 0; + } + + /// Gets another pixel and advancees internal pointer to following one. + /// @param input input image to load next pixel from + /// @return next pixel from given image + __device__ T loadFrom(const T *const input) { + last += this->stride; + if (CHECKED && (last == this->end)) { + last -= 2 * this->stride; + this->stride = -this->stride; // reverse loader's direction + } + // avoid reading from negative indices if loader is checked + // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this + // checked variant later + if (last < 0) { + return 0; } - /// Gets another pixel and advancees internal pointer to following one. - /// @param input input image to load next pixel from - /// @return next pixel from given image - __device__ T loadFrom(const T * const input) { - last += this->stride; - if(CHECKED && (last == this->end)) { - last -= 2 * this->stride; - this->stride = -this->stride; // reverse loader's direction - } - // avoid reading from negative indices if loader is checked - // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later - if(last < 0 ) { - return 0; - } - - return input[last]; - // return this->end; - // return last; - // return this->stride; - } - }; + return input[last]; + // return this->end; + // return last; + // return this->stride; + } +}; +/// Base for band write and loader. Manages computing strides and pointers +/// to first and last pixels in a linearly-stored-bands correct way. +/// @tparam T type of band coefficients +/// @tparam CHECKED true = be prepared to image boundary, false = don't care +template class VerticalDWTBandIO : protected DWTIO { +protected: + /// index of bottom neighbor of last pixel of loaded column + int end; + /// increment of index to get from highpass band to the lowpass one + int strideHighToLow; - /// Base for band write and loader. Manages computing strides and pointers - /// to first and last pixels in a linearly-stored-bands correct way. - /// @tparam T type of band coefficients - /// @tparam CHECKED true = be prepared to image boundary, false = don't care - template - class VerticalDWTBandIO : protected DWTIO { - protected: - /// index of bottom neighbor of last pixel of loaded column - int end; - - /// increment of index to get from highpass band to the lowpass one - int strideHighToLow; - - /// increment of index to get from the lowpass band to the highpass one - int strideLowToHigh; + /// increment of index to get from the lowpass band to the highpass one + int strideLowToHigh; - /// Initializes IO - sets size of image and a position of first pixel. - /// @param imageSizeX width of the image - /// @param imageSizeY height of the image - /// @param firstX x-coordinate of first pixel to use - /// (Parity determines vertically low or high band.) - /// @param firstY y-coordinate of first pixel to use - /// (Parity determines horizontally low or high band.) - /// @return index of first item specified by firstX and firstY - __device__ int initialize(const int imageSizeX, const int imageSizeY, - int firstX, int firstY) { - // index of first pixel (topmost one) of the column with index firstX - int columnOffset = firstX / 2; - - // difference between indices of two vertically neighboring pixels - // in the same band - int verticalStride; - - // resolve index of first pixel according to horizontal parity - if(firstX & 1) { - // first pixel in one of right bands - verticalStride = imageSizeX / 2; - columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2); - strideLowToHigh = (imageSizeX * imageSizeY) / 2; - } else { - // first pixel in one of left bands - verticalStride = imageSizeX / 2 + (imageSizeX & 1); - strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX; - } - - // set the other stride - strideHighToLow = verticalStride - strideLowToHigh; + /// Initializes IO - sets size of image and a position of first pixel. + /// @param imageSizeX width of the image + /// @param imageSizeY height of the image + /// @param firstX x-coordinate of first pixel to use + /// (Parity determines vertically low or high band.) + /// @param firstY y-coordinate of first pixel to use + /// (Parity determines horizontally low or high band.) + /// @return index of first item specified by firstX and firstY + __device__ int initialize(const int imageSizeX, const int imageSizeY, + int firstX, int firstY) { + // index of first pixel (topmost one) of the column with index firstX + int columnOffset = firstX / 2; - // compute index of coefficient which indicates end of image - if(CHECKED) { - end = columnOffset // right column - + (imageSizeY / 2) * verticalStride // right row - + (imageSizeY & 1) * strideLowToHigh; // possibly in high band - } else { - end = 0; - } + // difference between indices of two vertically neighboring pixels + // in the same band + int verticalStride; - - //***********for test************** - // end = CHECKED; - //***********for test************** - - - // finally, return index of the first item - return columnOffset // right column - + (firstY / 2) * verticalStride // right row - + (firstY & 1) * strideLowToHigh; // possibly in high band - } - }; - - - - - /// Directly loads coefficients from four consecutively stored transformed - /// bands. - /// @tparam T type of input band coefficients - /// @tparam CHECKED true = be prepared to image boundary, false = don't care - template - class VerticalDWTBandLoader : public VerticalDWTBandIO { - private: - int last; ///< index of last loaded pixel - - /// Checks internal index and possibly reverses direction of loader. - /// (Handles mirroring at the bottom of the image.) - /// @param input input image to load next coefficient from - /// @param stride stride to use now (one of two loader's strides) - /// @return loaded coefficient - __device__ T updateAndLoad(const T * const input, const int & stride) { - last += stride; - if(CHECKED && (last == this->end)) { - // undo last two updates of index (to get to previous mirrored item) - last -= (this->strideLowToHigh + this->strideHighToLow); - - // swap and reverse strides (to move up in the loaded column now) - const int temp = this->strideLowToHigh; - this->strideLowToHigh = -this->strideHighToLow; - this->strideHighToLow = -temp; - } - if(last < 0 ) { - return 0; - } - // avoid reading from negative indices if loader is checked - // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later - return input[last]; - } - public: - - /// Initializes loader - sets input size and a position of first pixel. - /// @param imageSizeX width of the image - /// @param imageSizeY height of the image - /// @param firstX x-coordinate of first pixel to load - /// (Parity determines vertically low or high band.) - /// @param firstY y-coordinate of first pixel to load - /// (Parity determines horizontally low or high band.) - __device__ void init(const int imageSizeX, const int imageSizeY, - int firstX, const int firstY) { - this->mirror(firstX, imageSizeX); - last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); - - // adjust to point to previous item - last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow; - } - - /// Sets all fields to zeros, for compiler not to complain about - /// uninitialized stuff. - __device__ void clear() { - this->end = 0; - this->strideHighToLow = 0; - this->strideLowToHigh = 0; - this->last = 0; + // resolve index of first pixel according to horizontal parity + if (firstX & 1) { + // first pixel in one of right bands + verticalStride = imageSizeX / 2; + columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2); + strideLowToHigh = (imageSizeX * imageSizeY) / 2; + } else { + // first pixel in one of left bands + verticalStride = imageSizeX / 2 + (imageSizeX & 1); + strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX; } - /// Gets another coefficient from lowpass band and advances internal index. - /// Call this method first if position of first pixel passed to init - /// was in high band. - /// @param input input image to load next coefficient from - /// @return next coefficient from the lowpass band of the given image - __device__ T loadLowFrom(const T * const input) { - return updateAndLoad(input, this->strideHighToLow); + // set the other stride + strideHighToLow = verticalStride - strideLowToHigh; + + // compute index of coefficient which indicates end of image + if (CHECKED) { + end = columnOffset // right column + + (imageSizeY / 2) * verticalStride // right row + + (imageSizeY & 1) * strideLowToHigh; // possibly in high band + } else { + end = 0; } - /// Gets another coefficient from the highpass band and advances index. - /// Call this method first if position of first pixel passed to init - /// was in high band. - /// @param input input image to load next coefficient from - /// @return next coefficient from the highbass band of the given image - __device__ T loadHighFrom(const T * const input) { - return updateAndLoad(input, this->strideLowToHigh); + //***********for test************** + // end = CHECKED; + //***********for test************** + + // finally, return index of the first item + return columnOffset // right column + + (firstY / 2) * verticalStride // right row + + (firstY & 1) * strideLowToHigh; // possibly in high band + } +}; + +/// Directly loads coefficients from four consecutively stored transformed +/// bands. +/// @tparam T type of input band coefficients +/// @tparam CHECKED true = be prepared to image boundary, false = don't care +template +class VerticalDWTBandLoader : public VerticalDWTBandIO { +private: + int last; ///< index of last loaded pixel + + /// Checks internal index and possibly reverses direction of loader. + /// (Handles mirroring at the bottom of the image.) + /// @param input input image to load next coefficient from + /// @param stride stride to use now (one of two loader's strides) + /// @return loaded coefficient + __device__ T updateAndLoad(const T *const input, const int &stride) { + last += stride; + if (CHECKED && (last == this->end)) { + // undo last two updates of index (to get to previous mirrored item) + last -= (this->strideLowToHigh + this->strideHighToLow); + + // swap and reverse strides (to move up in the loaded column now) + const int temp = this->strideLowToHigh; + this->strideLowToHigh = -this->strideHighToLow; + this->strideHighToLow = -temp; } - - }; - - - - - /// Directly saves coefficients into four transformed bands. - /// @tparam T type of output band coefficients - /// @tparam CHECKED true = be prepared to image boundary, false = don't care - template - class VerticalDWTBandWriter : public VerticalDWTBandIO { - private: - int next; ///< index of last loaded pixel - - /// Checks internal index and possibly stops the writer. - /// (Handles mirroring at edges of the image.) - /// @param output output buffer - /// @param item item to put into the output - /// @param stride increment of the pointer to get to next output index - __device__ int saveAndUpdate(T * const output, const T & item, - const int & stride) { -// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ //test, Mar 20 - if((!CHECKED) || (next != this->end)) { - // if(next == 4) { - // printf(" next: %d stride: %d val: %f \n", next, stride, item ); - // } - output[next] = item; - next += stride; - } -// } - // if((!CHECKED) || (next != this->end)) { //the real one - // output[next] = item; - // next += stride; //stride has been test - // } - return next; + if (last < 0) { + return 0; } - public: + // avoid reading from negative indices if loader is checked + // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this + // checked variant later + return input[last]; + } - /// Initializes writer - sets output size and a position of first pixel. - /// @param output output image - /// @param imageSizeX width of the image - /// @param imageSizeY height of the image - /// @param firstX x-coordinate of first pixel to write - /// (Parity determines vertically low or high band.) - /// @param firstY y-coordinate of first pixel to write - /// (Parity determines horizontally low or high band.) - __device__ void init(const int imageSizeX, const int imageSizeY, - const int firstX, const int firstY) { - if (firstX < imageSizeX) { - next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); - } else { - clear(); - } - } - - /// Sets all fields to zeros, for compiler not to complain about - /// uninitialized stuff. - __device__ void clear() { - this->end = 0; - this->strideHighToLow = 0; - this->strideLowToHigh = 0; - this->next = 0; - } +public: + /// Initializes loader - sets input size and a position of first pixel. + /// @param imageSizeX width of the image + /// @param imageSizeY height of the image + /// @param firstX x-coordinate of first pixel to load + /// (Parity determines vertically low or high band.) + /// @param firstY y-coordinate of first pixel to load + /// (Parity determines horizontally low or high band.) + __device__ void init(const int imageSizeX, const int imageSizeY, int firstX, + const int firstY) { + this->mirror(firstX, imageSizeX); + last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); - /// Writes another coefficient into the band which was specified using - /// init's firstX and firstY parameters and advances internal pointer. - /// Call this method first if position of first pixel passed to init - /// was in lowpass band. - /// @param output output image - /// @param low lowpass coefficient to save into the lowpass band - __device__ int writeLowInto(T * const output, const T & primary) { - return saveAndUpdate(output, primary, this->strideLowToHigh); - } + // adjust to point to previous item + last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow; + } - /// Writes another coefficient from the other band and advances pointer. - /// Call this method first if position of first pixel passed to init - /// was in highpass band. - /// @param output output image - /// @param high highpass coefficient to save into the highpass band - __device__ int writeHighInto(T * const output, const T & other) { - return saveAndUpdate(output, other, this->strideHighToLow); - } + /// Sets all fields to zeros, for compiler not to complain about + /// uninitialized stuff. + __device__ void clear() { + this->end = 0; + this->strideHighToLow = 0; + this->strideLowToHigh = 0; + this->last = 0; + } + + /// Gets another coefficient from lowpass band and advances internal index. + /// Call this method first if position of first pixel passed to init + /// was in high band. + /// @param input input image to load next coefficient from + /// @return next coefficient from the lowpass band of the given image + __device__ T loadLowFrom(const T *const input) { + return updateAndLoad(input, this->strideHighToLow); + } + + /// Gets another coefficient from the highpass band and advances index. + /// Call this method first if position of first pixel passed to init + /// was in high band. + /// @param input input image to load next coefficient from + /// @return next coefficient from the highbass band of the given image + __device__ T loadHighFrom(const T *const input) { + return updateAndLoad(input, this->strideLowToHigh); + } +}; + +/// Directly saves coefficients into four transformed bands. +/// @tparam T type of output band coefficients +/// @tparam CHECKED true = be prepared to image boundary, false = don't care +template +class VerticalDWTBandWriter : public VerticalDWTBandIO { +private: + int next; ///< index of last loaded pixel + + /// Checks internal index and possibly stops the writer. + /// (Handles mirroring at edges of the image.) + /// @param output output buffer + /// @param item item to put into the output + /// @param stride increment of the pointer to get to next output index + __device__ int saveAndUpdate(T *const output, const T &item, + const int &stride) { + // if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ + ////test, Mar 20 + if ((!CHECKED) || (next != this->end)) { + // if(next == 4) { + // printf(" next: %d stride: %d val: %f \n", next, stride, item ); + // } + output[next] = item; + next += stride; + } + // } + // if((!CHECKED) || (next != this->end)) { //the real one + // output[next] = item; + // next += stride; //stride has been test + // } + return next; + } + +public: + /// Initializes writer - sets output size and a position of first pixel. + /// @param output output image + /// @param imageSizeX width of the image + /// @param imageSizeY height of the image + /// @param firstX x-coordinate of first pixel to write + /// (Parity determines vertically low or high band.) + /// @param firstY y-coordinate of first pixel to write + /// (Parity determines horizontally low or high band.) + __device__ void init(const int imageSizeX, const int imageSizeY, + const int firstX, const int firstY) { + if (firstX < imageSizeX) { + next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); + } else { + clear(); + } + } + + /// Sets all fields to zeros, for compiler not to complain about + /// uninitialized stuff. + __device__ void clear() { + this->end = 0; + this->strideHighToLow = 0; + this->strideLowToHigh = 0; + this->next = 0; + } + + /// Writes another coefficient into the band which was specified using + /// init's firstX and firstY parameters and advances internal pointer. + /// Call this method first if position of first pixel passed to init + /// was in lowpass band. + /// @param output output image + /// @param low lowpass coefficient to save into the lowpass band + __device__ int writeLowInto(T *const output, const T &primary) { + return saveAndUpdate(output, primary, this->strideLowToHigh); + } + + /// Writes another coefficient from the other band and advances pointer. + /// Call this method first if position of first pixel passed to init + /// was in highpass band. + /// @param output output image + /// @param high highpass coefficient to save into the highpass band + __device__ int writeHighInto(T *const output, const T &other) { + return saveAndUpdate(output, other, this->strideHighToLow); + } + + //*******Add three functions to get private values******* + __device__ int getnext() { return next; } + + __device__ int getend() { return this->end; } + + __device__ int getstrideHighToLow() { return this->strideHighToLow; } + + __device__ int getstrideLowToHigh() { return this->strideLowToHigh; } + + //*******Add three functions to get private values******* +}; - //*******Add three functions to get private values******* - __device__ int getnext(){ - return next; - } - - __device__ int getend(){ - return this->end; - } - - __device__ int getstrideHighToLow(){ - return this->strideHighToLow; - } - - __device__ int getstrideLowToHigh(){ - return this->strideLowToHigh; - } - - //*******Add three functions to get private values******* - }; - - - } // namespace dwt_cuda - -#endif // IO_H - +#endif // IO_H diff --git a/examples/dwt2d/dwt_cuda/rdwt53.cu b/examples/dwt2d/dwt_cuda/rdwt53.cu index c5ecb2b..7cdaaf0 100755 --- a/examples/dwt2d/dwt_cuda/rdwt53.cu +++ b/examples/dwt2d/dwt_cuda/rdwt53.cu @@ -1,4 +1,4 @@ -/// +/// /// @file rdwt53.cu /// @brief CUDA implementation of reverse 5/3 2D DWT. /// @author Martin Jirman (207962@mail.muni.cz) @@ -7,16 +7,16 @@ /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -38,7 +38,7 @@ namespace dwt_cuda { - + /// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT /// using sliding window and lifting schema. @@ -46,8 +46,8 @@ namespace dwt_cuda { /// @tparam WIN_SIZE_Y height of sliding window template class RDWT53 { - private: - + private: + /// Shared memory buffer used for 5/3 DWT transforms. typedef TransformBuffer RDWT53Buffer; @@ -64,10 +64,10 @@ namespace dwt_cuda { struct RDWT53Column { /// loader of pixels from column in input image VerticalDWTBandLoader loader; - + /// Offset of corresponding column in shared buffer. int offset; - + /// Sets all fields to some values to avoid 'uninitialized' warnings. __device__ void clear() { offset = 0; @@ -128,7 +128,7 @@ namespace dwt_cuda { /// @param sizeY height of the input image /// @param loader (uninitialized) info about loaded column template - __device__ void initColumn(const int columnX, const int * const input, + __device__ void initColumn(const int columnX, const int * const input, const int sizeX, const int sizeY, RDWT53Column & column, const int firstY) { @@ -162,7 +162,7 @@ namespace dwt_cuda { /// @tparam CHECKED_WRITES true if boundaries must be checked when writing /// @param in input image (5/3 transformed coefficients) /// @param out output buffer (for reverse transformed image) - /// @param sizeX width of the output image + /// @param sizeX width of the output image /// @param sizeY height of the output image /// @param winSteps number of sliding window steps template @@ -182,7 +182,7 @@ namespace dwt_cuda { // column #0, thread #1 get right column #1 and thread #2 left column. const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3); - // Thread initializes offset of the boundary column (in shared + // Thread initializes offset of the boundary column (in shared // buffer), first 3 pixels of the column and a loader for this column. initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY); } @@ -216,8 +216,8 @@ namespace dwt_cuda { // horizontally transform all newly loaded lines horizontalTransform(WIN_SIZE_Y, 3); - // Using 3 registers, remember current values of last 3 rows - // of transform buffer. These rows are transformed horizontally + // Using 3 registers, remember current values of last 3 rows + // of transform buffer. These rows are transformed horizontally // only and will be used in next iteration. int last3Lines[3]; last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE]; @@ -253,7 +253,7 @@ namespace dwt_cuda { /// Main GPU 5/3 RDWT entry point. /// @param in input image (5/3 transformed coefficients) /// @param out output buffer (for reverse transformed image) - /// @param sizeX width of the output image + /// @param sizeX width of the output image /// @param sizeY height of the output image /// @param winSteps number of sliding window steps __device__ static void run(const int * const input, int * const output, @@ -284,13 +284,13 @@ namespace dwt_cuda { } }; // end of class RDWT53 - - - + + + /// Main GPU 5/3 RDWT entry point. /// @param in input image (5/3 transformed coefficients) /// @param out output buffer (for reverse transformed image) - /// @param sizeX width of the output image + /// @param sizeX width of the output image /// @param sizeY height of the output image /// @param winSteps number of sliding window steps template @@ -299,34 +299,34 @@ namespace dwt_cuda { const int sx, const int sy, const int steps) { RDWT53::run(in, out, sx, sy, steps); } - - - - /// Only computes optimal number of sliding window steps, + + + + /// Only computes optimal number of sliding window steps, /// number of threadblocks and then lanches the 5/3 RDWT kernel. /// @tparam WIN_SX width of sliding window /// @tparam WIN_SY height of sliding window /// @param in input image /// @param out output buffer - /// @param sx width of the input image + /// @param sx width of the input image /// @param sy height of the input image template void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) { // compute optimal number of steps of each sliding window const int steps = divRndUp(sy, 15 * WIN_SY); - + // prepare grid size dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); - + // finally transform this level PERF_BEGIN rdwt53Kernel<<>>(in, out, sx, sy, steps); PERF_END(" RDWT53", sx, sy) CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel"); } - - - + + + /// Reverse 5/3 2D DWT. See common rules (above) for more details. /// @param in Input DWT coefficients. Format described in common rules. /// Will not be preserved (will be overwritten). @@ -341,11 +341,11 @@ namespace dwt_cuda { const int llSizeX = divRndUp(sizeX, 2); const int llSizeY = divRndUp(sizeY, 2); rdwt53(in, out, llSizeX, llSizeY, levels - 1); - + // copy reverse transformed LL band from output back into the input memCopy(in, out, llSizeX, llSizeY); } - + // select right width of kernel for the size of the image if(sizeX >= 960) { launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY); @@ -355,6 +355,6 @@ namespace dwt_cuda { launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY); } } - + } // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/rdwt97.cu b/examples/dwt2d/dwt_cuda/rdwt97.cu index 151d69d..40c5221 100755 --- a/examples/dwt2d/dwt_cuda/rdwt97.cu +++ b/examples/dwt2d/dwt_cuda/rdwt97.cu @@ -1,4 +1,4 @@ -/// +/// /// @file rdwt97.cu /// @brief CUDA implementation of reverse 9/7 2D DWT. /// @author Martin Jirman (207962@mail.muni.cz) @@ -7,16 +7,16 @@ /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -38,7 +38,7 @@ namespace dwt_cuda { - + /// Wraps shared memory buffer and methods for computing 9/7 RDWT using /// lifting schema and sliding window. /// @tparam WIN_SIZE_X width of the sliding window @@ -46,7 +46,7 @@ namespace dwt_cuda { template class RDWT97 { private: - + /// Info related to loading of one input column. /// @tparam CHECKED true if boundary chould be checked, /// false if there is no near boudnary @@ -54,10 +54,10 @@ namespace dwt_cuda { struct RDWT97Column { /// laoder of input pxels for given column. VerticalDWTBandLoader loader; - + /// Offset of loaded column in shared memory buffer. int offset; - + /// Sets all fields to some values to avoid 'uninitialized' warnings. __device__ void clear() { loader.clear(); @@ -104,7 +104,7 @@ namespace dwt_cuda { /// @param column (uninitialized) info about loading one column /// @param firstY index of first image row to be transformed template - __device__ void initColumn(const int colIndex, const float * const input, + __device__ void initColumn(const int colIndex, const float * const input, const int sizeX, const int sizeY, RDWT97Column & column, const int firstY) { @@ -124,7 +124,7 @@ namespace dwt_cuda { buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input); buffer[column.offset + 5 * STRIDE] = buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input); - buffer[column.offset + 6 * STRIDE] = + buffer[column.offset + 6 * STRIDE] = buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input); } else { // non-topmost row - regular loading: @@ -162,7 +162,7 @@ namespace dwt_cuda { /// when writing into output buffer /// @param in input image (9/7 transformed coefficients) /// @param out output buffer (for reverse transformed image) - /// @param sizeX width of the output image + /// @param sizeX width of the output image /// @param sizeY height of the output image /// @param winSteps number of steps of sliding window template @@ -182,7 +182,7 @@ namespace dwt_cuda { // each thread among first 7 ones gets index of one of boundary columns const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7); - // Thread initializes offset of the boundary column (in shared + // Thread initializes offset of the boundary column (in shared // buffer), first 7 pixels of the column and a loader for this column. initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY); } @@ -201,7 +201,7 @@ namespace dwt_cuda { // offset of column (in transform buffer) saved by this thread const int outColumnOffset = buffer.getColumnOffset(threadIdx.x); - // (Each iteration assumes that first 7 rows of transform buffer are + // (Each iteration assumes that first 7 rows of transform buffer are // already loaded with horizontally transformed pixels.) for(int w = 0; w < winSteps; w++) { // Load another WIN_SIZE_Y lines of this thread's column @@ -216,8 +216,8 @@ namespace dwt_cuda { // horizontally transform all newly loaded lines horizontalRDWT97(WIN_SIZE_Y, 7); - // Using 7 registers, remember current values of last 7 rows - // of transform buffer. These rows are transformed horizontally + // Using 7 registers, remember current values of last 7 rows + // of transform buffer. These rows are transformed horizontally // only and will be used in next iteration. float last7Lines[7]; for(int i = 0; i < 7; i++) { @@ -257,13 +257,13 @@ namespace dwt_cuda { /// Main GPU 9/7 RDWT entry point. /// @param in input image (9/7 transformed coefficients) /// @param out output buffer (for reverse transformed image) - /// @param sizeX width of the output image + /// @param sizeX width of the output image /// @param sizeY height of the output image __device__ static void run(const float * const input, float * const output, const int sx, const int sy, const int steps) { // prepare instance with buffer in shared memory __shared__ RDWT97 rdwt97; - + // Compute limits of this threadblock's block of pixels and use them to // determine, whether this threadblock will have to deal with boundary. // (3 in next expressions is for radius of impulse response of 9/7 RDWT.) @@ -285,15 +285,15 @@ namespace dwt_cuda { rdwt97.transform(input, output, sx, sy, steps); } } - + }; // end of class RDWT97 - - - + + + /// Main GPU 9/7 RDWT entry point. /// @param in input image (9/7 transformed coefficients) /// @param out output buffer (for reverse transformed image) - /// @param sizeX width of the output image + /// @param sizeX width of the output image /// @param sizeY height of the output image template __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97), 8)) @@ -301,34 +301,34 @@ namespace dwt_cuda { const int sx, const int sy, const int steps) { RDWT97::run(in, out, sx, sy, steps); } - - - - /// Only computes optimal number of sliding window steps, + + + + /// Only computes optimal number of sliding window steps, /// number of threadblocks and then lanches the 9/7 RDWT kernel. /// @tparam WIN_SX width of sliding window /// @tparam WIN_SY height of sliding window /// @param in input image /// @param out output buffer - /// @param sx width of the input image + /// @param sx width of the input image /// @param sy height of the input image template void launchRDWT97Kernel (float * in, float * out, int sx, int sy) { // compute optimal number of steps of each sliding window const int steps = divRndUp(sy, 15 * WIN_SY); - + // prepare grid size dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); - + // finally launch kernel PERF_BEGIN rdwt97Kernel<<>>(in, out, sx, sy, steps); PERF_END(" RDWT97", sx, sy) CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel"); } - - - + + + /// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details. /// @param in Input DWT coefficients. Format described in common rules. /// Will not be preserved (will be overwritten). @@ -343,11 +343,11 @@ namespace dwt_cuda { const int llSizeX = divRndUp(sizeX, 2); const int llSizeY = divRndUp(sizeY, 2); rdwt97(in, out, llSizeX, llSizeY, levels - 1); - + // copy reverse transformed LL band from output back into the input memCopy(in, out, llSizeX, llSizeY); } - + // select right width of kernel for the size of the image if(sizeX >= 960) { launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY); @@ -357,7 +357,7 @@ namespace dwt_cuda { launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY); } } - - + + } // end of namespace dwt_cuda diff --git a/examples/dwt2d/dwt_cuda/transform_buffer.h b/examples/dwt2d/dwt_cuda/transform_buffer.h old mode 100755 new mode 100644 index 69b74e2..ba98b42 --- a/examples/dwt2d/dwt_cuda/transform_buffer.h +++ b/examples/dwt2d/dwt_cuda/transform_buffer.h @@ -7,16 +7,16 @@ /// /// Copyright (c) 2011 Martin Jirman /// All rights reserved. -/// +/// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions are met: -/// +/// /// * Redistributions of source code must retain the above copyright /// notice, this list of conditions and the following disclaimer. /// * Redistributions in binary form must reproduce the above copyright /// notice, this list of conditions and the following disclaimer in the /// documentation and/or other materials provided with the distribution. -/// +/// /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -30,344 +30,309 @@ /// POSSIBILITY OF SUCH DAMAGE. /// - #ifndef TRANSFORM_BUFFER_H -#define TRANSFORM_BUFFER_H - +#define TRANSFORM_BUFFER_H namespace dwt_cuda { - - - /// Buffer (in shared memory of GPU) where block of input image is stored, - /// but odd and even lines are separated. (Generates less bank conflicts when - /// using lifting schema.) All operations expect SIZE_X threads. - /// Also implements basic building blocks of lifting schema. - /// @tparam SIZE_X width of the buffer excluding two boundaries (Also - /// a number of threads participating on all operations.) - /// Must be divisible by 4. - /// @tparam SIZE_Y height of buffer (total number of lines) - /// @tparam BOUNDARY_X number of extra pixels at the left and right side - /// boundary is expected to be smaller than half SIZE_X - /// Must be divisible by 2. - template - class TransformBuffer { - public: - enum { - /// difference between pointers to two vertical neigbors - VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2) - }; - - private: - enum { - /// number of shared memory banks - needed for correct padding - #ifdef __CUDA_ARCH__ - SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16), - #else - SHM_BANKS = 16, // for host code only - can be anything, won't be used - #endif - - /// size of one of two buffers (odd or even) - BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y, - - /// unused space between two buffers - PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS), - - /// offset of the odd columns buffer from the beginning of data buffer - ODD_OFFSET = BUFFER_SIZE + PADDING, - }; - /// buffer for both even and odd columns - T data[2 * BUFFER_SIZE + PADDING]; - - - - /// Applies specified function to all central elements while also passing - /// previous and next elements as parameters. - /// @param count count of central elements to apply function to - /// @param prevOffset offset of first central element - /// @param midOffset offset of first central element's predecessor - /// @param nextOffset offset of first central element's successor - /// @param function the function itself - template - __device__ void horizontalStep(const int count, const int prevOffset, - const int midOffset, const int nextOffset, - const FUNC & function) { - // number of unchecked iterations - const int STEPS = count / SIZE_X; - - // items remaining after last unchecked iteration - const int finalCount = count % SIZE_X; - - // offset of items processed in last (checked) iteration - const int finalOffset = count - finalCount; - - // all threads perform fixed number of iterations ... - for(int i = 0; i < STEPS; i++) { +/// Buffer (in shared memory of GPU) where block of input image is stored, +/// but odd and even lines are separated. (Generates less bank conflicts when +/// using lifting schema.) All operations expect SIZE_X threads. +/// Also implements basic building blocks of lifting schema. +/// @tparam SIZE_X width of the buffer excluding two boundaries (Also +/// a number of threads participating on all operations.) +/// Must be divisible by 4. +/// @tparam SIZE_Y height of buffer (total number of lines) +/// @tparam BOUNDARY_X number of extra pixels at the left and right side +/// boundary is expected to be smaller than half SIZE_X +/// Must be divisible by 2. +template +class TransformBuffer { +public: + enum { + /// difference between pointers to two vertical neigbors + VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2) + }; + +private: + enum { +/// number of shared memory banks - needed for correct padding +#ifdef __CUDA_ARCH__ + SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16), +#else + SHM_BANKS = 16, // for host code only - can be anything, won't be used +#endif + + /// size of one of two buffers (odd or even) + BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y, + + /// unused space between two buffers + PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS), + + /// offset of the odd columns buffer from the beginning of data buffer + ODD_OFFSET = BUFFER_SIZE + PADDING, + }; + + /// buffer for both even and odd columns + T data[2 * BUFFER_SIZE + PADDING]; + + /// Applies specified function to all central elements while also passing + /// previous and next elements as parameters. + /// @param count count of central elements to apply function to + /// @param prevOffset offset of first central element + /// @param midOffset offset of first central element's predecessor + /// @param nextOffset offset of first central element's successor + /// @param function the function itself + template + __device__ void horizontalStep(const int count, const int prevOffset, + const int midOffset, const int nextOffset, + const FUNC &function) { + // number of unchecked iterations + const int STEPS = count / SIZE_X; + + // items remaining after last unchecked iteration + const int finalCount = count % SIZE_X; + + // offset of items processed in last (checked) iteration + const int finalOffset = count - finalCount; + + // all threads perform fixed number of iterations ... + for (int i = 0; i < STEPS; i++) { // for(int i = 0; i < 3; i++) { - const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; - const T next = data[nextOffset + i * SIZE_X + threadIdx.x]; - T & center = data[midOffset + i * SIZE_X + threadIdx.x]; - // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x)); - function(previous, center, next);// the real one - } - - // ... but not all threads participate on final iteration - if(threadIdx.x < finalCount) { - const T previous = data[prevOffset + finalOffset + threadIdx.x]; - const T next = data[nextOffset + finalOffset + threadIdx.x]; - T & center = data[midOffset + finalOffset + threadIdx.x]; - // function(previous, center, (nextOffset+finalOffset+threadIdx.x)); - // kaixi - function(previous, center, next);//the real one - } + const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; + const T next = data[nextOffset + i * SIZE_X + threadIdx.x]; + T ¢er = data[midOffset + i * SIZE_X + threadIdx.x]; + // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x)); + function(previous, center, next); // the real one } - public: - - __device__ void getPrintData() { - // - for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) { - printf(" index: %d data: %f \n ", i ,data[i]); - } - - } - - - /// Gets offset of the column with given index. Central columns have - /// indices from 0 to NUM_LINES - 1, left boundary columns have negative - /// indices and right boundary columns indices start with NUM_LINES. - /// @param columnIndex index of column to get pointer to - /// @return offset of the first item of column with specified index - __device__ int getColumnOffset(int columnIndex) { - columnIndex += BOUNDARY_X; // skip boundary - return columnIndex / 2 // select right column - + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer + // ... but not all threads participate on final iteration + if (threadIdx.x < finalCount) { + const T previous = data[prevOffset + finalOffset + threadIdx.x]; + const T next = data[nextOffset + finalOffset + threadIdx.x]; + T ¢er = data[midOffset + finalOffset + threadIdx.x]; + // function(previous, center, (nextOffset+finalOffset+threadIdx.x)); + // kaixi + function(previous, center, next); // the real one } - - - /// Provides access to data of the transform buffer. - /// @param index index of the item to work with - /// @return reference to item at given index - __device__ T & operator[] (const int index) { - return data[index]; - } - - - /// Applies specified function to all horizontally even elements in - /// specified lines. (Including even elements in boundaries except - /// first even element in first left boundary.) SIZE_X threads participate - /// and synchronization is needed before result can be used. - /// @param firstLine index of first line - /// @param numLines count of lines - /// @param func function to be applied on all even elements - /// parameters: previous (odd) element, the even - /// element itself and finally next (odd) element - template - __device__ void forEachHorizontalEven(const int firstLine, - const int numLines, - const FUNC & func) { - // number of even elemens to apply function to - const int count = numLines * VERTICAL_STRIDE - 1; - // offset of first even element - const int centerOffset = firstLine * VERTICAL_STRIDE + 1; - // offset of odd predecessor of first even element - const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET; - // offset of odd successor of first even element - const int nextOffset = prevOffset + 1; + } - // if(threadIdx.x == 0) { +public: + __device__ void getPrintData() { + // + for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) { + printf(" index: %d data: %f \n ", i, data[i]); + } + } - // printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); - // } - - // call generic horizontal step function - horizontalStep(count, prevOffset, centerOffset, nextOffset, func); - } - - - /// Applies given function to all horizontally odd elements in specified - /// lines. (Including odd elements in boundaries except last odd element - /// in last right boundary.) SIZE_X threads participate and synchronization - /// is needed before result can be used. - /// @param firstLine index of first line - /// @param numLines count of lines - /// @param func function to be applied on all odd elements - /// parameters: previous (even) element, the odd - /// element itself and finally next (even) element - template - __device__ void forEachHorizontalOdd(const int firstLine, - const int numLines, - const FUNC & func) { - // numbet of odd elements to apply function to - const int count = numLines * VERTICAL_STRIDE - 1; - // offset of even predecessor of first odd element - const int prevOffset = firstLine * VERTICAL_STRIDE; - // offset of first odd element - const int centerOffset = prevOffset + ODD_OFFSET; - // offset of even successor of first odd element - const int nextOffset = prevOffset + 1; + /// Gets offset of the column with given index. Central columns have + /// indices from 0 to NUM_LINES - 1, left boundary columns have negative + /// indices and right boundary columns indices start with NUM_LINES. + /// @param columnIndex index of column to get pointer to + /// @return offset of the first item of column with specified index + __device__ int getColumnOffset(int columnIndex) { + columnIndex += BOUNDARY_X; // skip boundary + return columnIndex / 2 // select right column + + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer + } - // if(threadIdx.x == 0) { - // printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); - // } - - - // call generic horizontal step function - horizontalStep(count, prevOffset, centerOffset, nextOffset, func); - } - - - /// Applies specified function to all even elements (except element #0) - /// of given column. Each thread takes care of one column, so there's - /// no need for synchronization. - /// @param columnOffset offset of thread's column - /// @param f function to be applied on all even elements - /// parameters: previous (odd) element, the even - /// element itself and finally next (odd) element - template - __device__ void forEachVerticalEven(const int columnOffset, const F & f) { - if(SIZE_Y > 3) { // makes no sense otherwise - const int steps = SIZE_Y / 2 - 1; - for(int i = 0; i < steps; i++) { - const int row = 2 + i * 2; - const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; - const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; - f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next); - - //--------------- FOR TEST ----------------- -/* __syncthreads(); - if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ - diffOut[2500]++; - diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE]; - } - __syncthreads(); -*/ //--------------- FOR TEST ----------------- - - - } - } - } - - - /// Applies specified function to all odd elements of given column. - /// Each thread takes care of one column, so there's no need for - /// synchronization. - /// @param columnOffset offset of thread's column - /// @param f function to be applied on all odd elements - /// parameters: previous (even) element, the odd - /// element itself and finally next (even) element - template - __device__ void forEachVerticalOdd(const int columnOffset, const F & f) { - const int steps = (SIZE_Y - 1) / 2; - for(int i = 0; i < steps; i++) { - const int row = i * 2 + 1; + /// Provides access to data of the transform buffer. + /// @param index index of the item to work with + /// @return reference to item at given index + __device__ T &operator[](const int index) { return data[index]; } + + /// Applies specified function to all horizontally even elements in + /// specified lines. (Including even elements in boundaries except + /// first even element in first left boundary.) SIZE_X threads participate + /// and synchronization is needed before result can be used. + /// @param firstLine index of first line + /// @param numLines count of lines + /// @param func function to be applied on all even elements + /// parameters: previous (odd) element, the even + /// element itself and finally next (odd) element + template + __device__ void forEachHorizontalEven(const int firstLine, const int numLines, + const FUNC &func) { + // number of even elemens to apply function to + const int count = numLines * VERTICAL_STRIDE - 1; + // offset of first even element + const int centerOffset = firstLine * VERTICAL_STRIDE + 1; + // offset of odd predecessor of first even element + const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET; + // offset of odd successor of first even element + const int nextOffset = prevOffset + 1; + + // if(threadIdx.x == 0) { + + // printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d + // nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); + // } + + // call generic horizontal step function + horizontalStep(count, prevOffset, centerOffset, nextOffset, func); + } + + /// Applies given function to all horizontally odd elements in specified + /// lines. (Including odd elements in boundaries except last odd element + /// in last right boundary.) SIZE_X threads participate and synchronization + /// is needed before result can be used. + /// @param firstLine index of first line + /// @param numLines count of lines + /// @param func function to be applied on all odd elements + /// parameters: previous (even) element, the odd + /// element itself and finally next (even) element + template + __device__ void forEachHorizontalOdd(const int firstLine, const int numLines, + const FUNC &func) { + // numbet of odd elements to apply function to + const int count = numLines * VERTICAL_STRIDE - 1; + // offset of even predecessor of first odd element + const int prevOffset = firstLine * VERTICAL_STRIDE; + // offset of first odd element + const int centerOffset = prevOffset + ODD_OFFSET; + // offset of even successor of first odd element + const int nextOffset = prevOffset + 1; + + // if(threadIdx.x == 0) { + // printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d + // nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); + // } + + // call generic horizontal step function + horizontalStep(count, prevOffset, centerOffset, nextOffset, func); + } + + /// Applies specified function to all even elements (except element #0) + /// of given column. Each thread takes care of one column, so there's + /// no need for synchronization. + /// @param columnOffset offset of thread's column + /// @param f function to be applied on all even elements + /// parameters: previous (odd) element, the even + /// element itself and finally next (odd) element + template + __device__ void forEachVerticalEven(const int columnOffset, const F &f) { + if (SIZE_Y > 3) { // makes no sense otherwise + const int steps = SIZE_Y / 2 - 1; + for (int i = 0; i < steps; i++) { + const int row = 2 + i * 2; const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; + f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); - f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); - - - //--------------- FOR TEST ----------------- -/* __syncthreads(); - if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ - diffOut[2500]++; - diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE]; - } - - __syncthreads(); -*/ //--------------- FOR TEST ----------------- + //--------------- FOR TEST ----------------- + /* __syncthreads(); + if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ + diffOut[2500]++; + diffOut[diffOut[2500]] = 2;//data[columnOffset + + row * VERTICAL_STRIDE]; + } + __syncthreads(); + */ //--------------- FOR TEST ----------------- } } - - - - /// Scales elements at specified lines. - /// @param evenScale scaling factor for horizontally even elements - /// @param oddScale scaling factor for horizontally odd elements - /// @param numLines number of lines, whose elements should be scaled - /// @param firstLine index of first line to scale elements in - __device__ void scaleHorizontal(const T evenScale, const T oddScale, - const int firstLine, const int numLines) { - const int offset = firstLine * VERTICAL_STRIDE; - const int count = numLines * VERTICAL_STRIDE; - const int steps = count / SIZE_X; - const int finalCount = count % SIZE_X; - const int finalOffset = count - finalCount; + } - // printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset); - - // run iterations, whete all threads participate - for(int i = 0; i < steps; i++) { - data[threadIdx.x + i * SIZE_X + offset] *= evenScale; - // if(threadIdx.x + i * SIZE_X + offset == 531) { - // printf("threadidx 531: %d \n", threadIdx.x); - // } - // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) { - // printf("threadidx 531: %d \n", threadIdx.x); - // } - data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; - } - - // some threads also finish remaining unscaled items - if(threadIdx.x < finalCount) { - data[threadIdx.x + finalOffset + offset] *= evenScale; - // if(threadIdx.x + finalOffset + offset == 531) { - // printf("threadidx 531: %d \n", threadIdx.x); - // } - // if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) { - // printf("threadidx 531: %d \n", threadIdx.x); - // } - data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; - } + /// Applies specified function to all odd elements of given column. + /// Each thread takes care of one column, so there's no need for + /// synchronization. + /// @param columnOffset offset of thread's column + /// @param f function to be applied on all odd elements + /// parameters: previous (even) element, the odd + /// element itself and finally next (even) element + template + __device__ void forEachVerticalOdd(const int columnOffset, const F &f) { + const int steps = (SIZE_Y - 1) / 2; + for (int i = 0; i < steps; i++) { + const int row = i * 2 + 1; + const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; + const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; + f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); + + //--------------- FOR TEST ----------------- + /* __syncthreads(); + if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ + diffOut[2500]++; + diffOut[diffOut[2500]] = 1; //data[columnOffset + + row * VERTICAL_STRIDE]; + } + + __syncthreads(); + */ //--------------- FOR TEST ----------------- } - - - /// Scales elements in specified column. - /// @param evenScale scaling factor for vertically even elements - /// @param oddScale scaling factor for vertically odd elements - /// @param columnOffset offset of the column to work with - /// @param numLines number of lines, whose elements should be scaled - /// @param firstLine index of first line to scale elements in - __device__ void scaleVertical(const T evenScale, const T oddScale, - const int columnOffset, const int numLines, - const int firstLine) { - for(int i = firstLine; i < (numLines + firstLine); i++) { - if(i & 1) { - data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; - } else { - data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; - } + } + + /// Scales elements at specified lines. + /// @param evenScale scaling factor for horizontally even elements + /// @param oddScale scaling factor for horizontally odd elements + /// @param numLines number of lines, whose elements should be scaled + /// @param firstLine index of first line to scale elements in + __device__ void scaleHorizontal(const T evenScale, const T oddScale, + const int firstLine, const int numLines) { + const int offset = firstLine * VERTICAL_STRIDE; + const int count = numLines * VERTICAL_STRIDE; + const int steps = count / SIZE_X; + const int finalCount = count % SIZE_X; + const int finalOffset = count - finalCount; + + // printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, + // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, + // finalCount, finalOffset); + + // run iterations, whete all threads participate + for (int i = 0; i < steps; i++) { + data[threadIdx.x + i * SIZE_X + offset] *= evenScale; + // if(threadIdx.x + i * SIZE_X + offset == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; + } + + // some threads also finish remaining unscaled items + if (threadIdx.x < finalCount) { + data[threadIdx.x + finalOffset + offset] *= evenScale; + // if(threadIdx.x + finalOffset + offset == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + // if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) { + // printf("threadidx 531: %d \n", threadIdx.x); + // } + data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; + } + } + + /// Scales elements in specified column. + /// @param evenScale scaling factor for vertically even elements + /// @param oddScale scaling factor for vertically odd elements + /// @param columnOffset offset of the column to work with + /// @param numLines number of lines, whose elements should be scaled + /// @param firstLine index of first line to scale elements in + __device__ void scaleVertical(const T evenScale, const T oddScale, + const int columnOffset, const int numLines, + const int firstLine) { + for (int i = firstLine; i < (numLines + firstLine); i++) { + if (i & 1) { + data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; + } else { + data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; } } - - - //****************For Test(Feb23), test inter parameters************* - __device__ int getVERTICAL_STRIDE(){ - return VERTICAL_STRIDE; - } - __device__ int getSHM_BANKS(){ - return SHM_BANKS; - } - __device__ int getBuffersize(){ - return BUFFER_SIZE; - } - __device__ int getPADDING(){ - return PADDING; - } - __device__ int getODD_OFFSET(){ - return ODD_OFFSET; - } + } + //****************For Test(Feb23), test inter parameters************* + __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; } + __device__ int getSHM_BANKS() { return SHM_BANKS; } + __device__ int getBuffersize() { return BUFFER_SIZE; } + __device__ int getPADDING() { return PADDING; } + __device__ int getODD_OFFSET() { return ODD_OFFSET; } - //****************For Test(Feb23), test inter parameters************* - - - }; // end of class TransformBuffer + //****************For Test(Feb23), test inter parameters************* +}; // end of class TransformBuffer } // namespace dwt_cuda - -#endif // TRANSFORM_BUFFER_H - +#endif // TRANSFORM_BUFFER_H diff --git a/examples/dwt2d/main.cu b/examples/dwt2d/main.cu index 899e38c..212d09e 100755 --- a/examples/dwt2d/main.cu +++ b/examples/dwt2d/main.cu @@ -1,16 +1,16 @@ -/* +/* * Copyright (c) 2009, Jiri Matela * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -54,7 +54,7 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize) // printf("Loading ipnput: %s\n", srcFilename); char *path = "../../data/dwt2d/"; char *newSrc = NULL; - + if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL) { newSrc[0] = '\0'; @@ -67,7 +67,7 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize) //srcFilename = strcat("../../data/dwt2d/",srcFilename); //read image int i = open(srcFilename, O_RDONLY, 0644); - if (i == -1) { + if (i == -1) { error(0,errno,"cannot access %s", srcFilename); return -1; } @@ -97,18 +97,18 @@ template void processDWT(struct dwt *d, int forward, int writeVisual) { int componentSize = d->pixWidth*d->pixHeight*sizeof(T); - + T *c_r_out, *backup ; cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size cudaCheckError("Alloc device memory"); cudaMemset(c_r_out, 0, componentSize); cudaCheckError("Memset device memory"); - + cudaMalloc((void**)&backup, componentSize); //< aligned component size cudaCheckError("Alloc device memory"); cudaMemset(backup, 0, componentSize); cudaCheckError("Memset device memory"); - + if (d->components == 3) { /* Alloc two more buffers for G and B */ T *c_g_out, *c_b_out; @@ -116,12 +116,12 @@ void processDWT(struct dwt *d, int forward, int writeVisual) cudaCheckError("Alloc device memory"); cudaMemset(c_g_out, 0, componentSize); cudaCheckError("Memset device memory"); - + cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size cudaCheckError("Alloc device memory"); cudaMemset(c_b_out, 0, componentSize); cudaCheckError("Memset device memory"); - + /* Load components */ T *c_r, *c_g, *c_b; cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size @@ -140,13 +140,13 @@ void processDWT(struct dwt *d, int forward, int writeVisual) cudaCheckError("Memset device memory"); rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight); - + /* Compute DWT and always store into file */ nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); - + // -------test---------- // T *h_r_out=(T*)malloc(componentSize); // cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost); @@ -156,13 +156,13 @@ void processDWT(struct dwt *d, int forward, int writeVisual) // if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n"); // } // -------test---------- - - + + /* Store DWT to file */ writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r"); // writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g"); // writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b"); -#ifdef OUTPUT +#ifdef OUTPUT if (writeVisual) { writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r"); writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g"); @@ -186,7 +186,7 @@ void processDWT(struct dwt *d, int forward, int writeVisual) cudaFree(c_b_out); cudaCheckError("Cuda free"); - } + } else if (d->components == 1) { //Load component T *c_r; @@ -197,11 +197,11 @@ void processDWT(struct dwt *d, int forward, int writeVisual) bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight); - // Compute DWT + // Compute DWT nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); - // Store DWT to file -// #ifdef OUTPUT + // Store DWT to file +// #ifdef OUTPUT if (writeVisual) { writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out"); } else { @@ -218,7 +218,7 @@ void processDWT(struct dwt *d, int forward, int writeVisual) cudaCheckError("Cuda free device"); } -int main(int argc, char **argv) +int main(int argc, char **argv) { int optindex = 0; char ch; @@ -233,13 +233,13 @@ int main(int argc, char **argv) {"97", no_argument, 0, '9'}, //9/7 transform {"53", no_argument, 0, '5' }, //5/3transform {"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear - {"help", no_argument, 0, 'h'} + {"help", no_argument, 0, 'h'} }; - + int pixWidth = 0; // devCount -1) { - printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n", + printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n", device, 0, devCount -1); return -1; } - cudaDeviceProp devProp; - cudaGetDeviceProperties(&devProp, device); + cudaDeviceProp devProp; + cudaGetDeviceProperties(&devProp, device); cudaCheckError("Get device properties"); - // if (devProp.major < 1) { + // if (devProp.major < 1) { // printf("Device %d does not support CUDA\n", device); // return -1; - // } + // } printf("Using device %d: %s\n", device, devProp.name); cudaSetDevice(device); cudaCheckError("Set selected device"); @@ -366,14 +366,14 @@ int main(int argc, char **argv) printf(" DWT levels:\t\t%d\n", dwtLvls); printf(" Forward transform:\t%d\n", forward); printf(" 9/7 transform:\t\t%d\n", dwt97); - + //data sizes int inputSize = pixWidth*pixHeight*compCount; //srcImg, inputSize); cudaCheckError("Alloc host memory"); - if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) + if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) return -1; /* DWT */ diff --git a/examples/dwt2d/run_cpu.sh b/examples/dwt2d/run_cpu.sh index 5a23d74..028379c 100755 --- a/examples/dwt2d/run_cpu.sh +++ b/examples/dwt2d/run_cpu.sh @@ -5,4 +5,3 @@ ./dwt2d 4.bmp -d 4x4 -r -5 -l 3 # ./dwt2d 4.bmp -d 4x4 -r -9 -l 3 # ./dwt2d 8.bmp -d 8x8 -f -9 -l 3 - diff --git a/examples/dwt2d/test_compile_nvcc.sh b/examples/dwt2d/test_compile_nvcc.sh index 5cd6644..3810261 100755 --- a/examples/dwt2d/test_compile_nvcc.sh +++ b/examples/dwt2d/test_compile_nvcc.sh @@ -7,12 +7,3 @@ /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart - - - - - - - - - diff --git a/examples/microbench/cudamemcpy_test.cc b/examples/microbench/cudamemcpy_test.cc index b723921..b329a32 100644 --- a/examples/microbench/cudamemcpy_test.cc +++ b/examples/microbench/cudamemcpy_test.cc @@ -1,42 +1,40 @@ #include -__global__ -void saxpy(int n, float a, float *x, float *y) -{ - int i = blockIdx.x*blockDim.x + threadIdx.x; - if (i < n) y[i] = a*x[i] + y[i]; +__global__ void saxpy(int n, float a, float *x, float *y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + y[i] = a * x[i] + y[i]; } -int main(void) -{ - int N = 1<<20; +int main(void) { + int N = 1 << 20; float *x, *y, *d_x, *d_y; - x = (float*)malloc(N*sizeof(float)); - y = (float*)malloc(N*sizeof(float)); + x = (float *)malloc(N * sizeof(float)); + y = (float *)malloc(N * sizeof(float)); - cudaMalloc(&d_x, N*sizeof(float)); - cudaMalloc(&d_y, N*sizeof(float)); + cudaMalloc(&d_x, N * sizeof(float)); + cudaMalloc(&d_y, N * sizeof(float)); for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } - cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); // Perform SAXPY on 1M elements // saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y); - cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); float maxError = 0.0f; for (int i = 0; i < N; i++) - maxError = max(maxError, abs(y[i]-4.0f)); + maxError = max(maxError, abs(y[i] - 4.0f)); printf("Max error: %f\n", maxError); cudaFree(d_x); cudaFree(d_y); free(x); free(y); -} \ No newline at end of file +} diff --git a/examples/microbench/dummy_kernel.cc b/examples/microbench/dummy_kernel.cc index 4bb63e9..d9f8673 100644 --- a/examples/microbench/dummy_kernel.cc +++ b/examples/microbench/dummy_kernel.cc @@ -1,42 +1,39 @@ #include -__global__ -void saxpy(void) -{ - int i = blockIdx.x*blockDim.x + threadIdx.x; +__global__ void saxpy(void) { + int i = blockIdx.x * blockDim.x + threadIdx.x; printf("block_id:%d thread_id:%d \n", i) } -int main(void) -{ - int N = 1<<20; +int main(void) { + int N = 1 << 20; float *x, *y, *d_x, *d_y; - x = (float*)malloc(N*sizeof(float)); - y = (float*)malloc(N*sizeof(float)); + x = (float *)malloc(N * sizeof(float)); + y = (float *)malloc(N * sizeof(float)); - cudaMalloc(&d_x, N*sizeof(float)); - cudaMalloc(&d_y, N*sizeof(float)); + cudaMalloc(&d_x, N * sizeof(float)); + cudaMalloc(&d_y, N * sizeof(float)); for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } - cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); // Perform SAXPY on 1M elements - saxpy<<<(1,1)>>>; + saxpy<<<(1, 1)>>>; - cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); float maxError = 0.0f; for (int i = 0; i < N; i++) - maxError = max(maxError, abs(y[i]-4.0f)); + maxError = max(maxError, abs(y[i] - 4.0f)); printf("Max error: %f\n", maxError); cudaFree(d_x); cudaFree(d_y); free(x); free(y); -} \ No newline at end of file +} diff --git a/examples/microbench/kerne_arg.cc b/examples/microbench/kerne_arg.cc index 6a8626f..ce91e63 100644 --- a/examples/microbench/kerne_arg.cc +++ b/examples/microbench/kerne_arg.cc @@ -1,41 +1,36 @@ #include -__global__ -void saxpy(int N) -{ -printf("hello!: %d\n", N); -} +__global__ void saxpy(int N) { printf("hello!: %d\n", N); } -int main(void) -{ - int N = 1<<20; +int main(void) { + int N = 1 << 20; float *x, *y, *d_x, *d_y; - x = (float*)malloc(N*sizeof(float)); - y = (float*)malloc(N*sizeof(float)); + x = (float *)malloc(N * sizeof(float)); + y = (float *)malloc(N * sizeof(float)); - cudaMalloc(&d_x, N*sizeof(float)); - cudaMalloc(&d_y, N*sizeof(float)); + cudaMalloc(&d_x, N * sizeof(float)); + cudaMalloc(&d_y, N * sizeof(float)); for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } - cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); // Perform SAXPY on 1M elements - saxpy<<<(1,1)>>>(N); + saxpy<<<(1, 1)>>>(N); - cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); float maxError = 0.0f; for (int i = 0; i < N; i++) - maxError = max(maxError, abs(y[i]-4.0f)); + maxError = max(maxError, abs(y[i] - 4.0f)); printf("Max error: %f\n", maxError); cudaFree(d_x); cudaFree(d_y); free(x); free(y); -} \ No newline at end of file +} diff --git a/examples/microbench/one_thread_kernel.cc b/examples/microbench/one_thread_kernel.cc index 73736b2..6df23f0 100644 --- a/examples/microbench/one_thread_kernel.cc +++ b/examples/microbench/one_thread_kernel.cc @@ -1,41 +1,36 @@ #include -__global__ -void saxpy(void) -{ -printf("hello!\n"); -} +__global__ void saxpy(void) { printf("hello!\n"); } -int main(void) -{ - int N = 1<<20; +int main(void) { + int N = 1 << 20; float *x, *y, *d_x, *d_y; - x = (float*)malloc(N*sizeof(float)); - y = (float*)malloc(N*sizeof(float)); + x = (float *)malloc(N * sizeof(float)); + y = (float *)malloc(N * sizeof(float)); - cudaMalloc(&d_x, N*sizeof(float)); - cudaMalloc(&d_y, N*sizeof(float)); + cudaMalloc(&d_x, N * sizeof(float)); + cudaMalloc(&d_y, N * sizeof(float)); for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } - cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); // Perform SAXPY on 1M elements - saxpy<<<(1,1)>>>; + saxpy<<<(1, 1)>>>; - cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); float maxError = 0.0f; for (int i = 0; i < N; i++) - maxError = max(maxError, abs(y[i]-4.0f)); + maxError = max(maxError, abs(y[i] - 4.0f)); printf("Max error: %f\n", maxError); cudaFree(d_x); cudaFree(d_y); free(x); free(y); -} \ No newline at end of file +} diff --git a/runtime/lib/cudaRuntimeImpl.cpp b/runtime/lib/cudaRuntimeImpl.cpp index 9dd5277..fb2012a 100644 --- a/runtime/lib/cudaRuntimeImpl.cpp +++ b/runtime/lib/cudaRuntimeImpl.cpp @@ -43,7 +43,7 @@ cudaError_t cudaMallocHost(void **devPtr, size_t size) { *devPtr = malloc(size); if (devPtr == NULL) return cudaErrorMemoryAllocation; - return cudaSuccess; + return cudaSuccess; } cudaError_t cudaMemset(void *devPtr, int value, size_t count) { memset(devPtr, value, count);