fix linting issues

This commit is contained in:
Bhanu Garg 2022-05-24 20:43:47 -04:00
parent d834f31626
commit 308e9284cb
27 changed files with 1246 additions and 1384 deletions

2
.gitignore vendored
View File

@ -44,4 +44,4 @@ CMakeCache.txt
# OS generated files # OS generated files
.DS_Store .DS_Store
.DS_Store? .DS_Store?

View File

@ -27,11 +27,11 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS
export CuPBoP_PATH=`pwd` export CuPBoP_PATH=`pwd`
export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH
``` ```
If you are using boson, you can pre-installed llvm 10.0.0
LLVM_PATH=/opt/llvm-10.0.0 If you are using boson, you can pre-installed llvm 10.0.0\
export PATH=$LLVM_PATH/bin:$PATH `LLVM_PATH=/opt/llvm-10.0.0`\
`export PATH=$LLVM_PATH/bin:$PATH`
2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file 2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file
```bash ```bash

View File

@ -396,7 +396,7 @@ void init_block(llvm::Module *M, std::ofstream &fout) {
replace_asm_call(M); replace_asm_call(M);
// replace dynamic shared memory // replace dynamic shared memory
auto dynamic_shared_memory_addr = auto dynamic_shared_memory_addr =
M->getGlobalVariable("dynamic_shared_memory"); M->getGlobalVariable("dynamic_shared_memory");
if (dynamic_shared_memory_addr) { if (dynamic_shared_memory_addr) {
replace_dynamic_shared_memory(M); replace_dynamic_shared_memory(M);
} }

View File

@ -272,13 +272,12 @@ void AddContextSaveRestore(llvm::Instruction *instruction,
std::vector<Instruction *> uses; std::vector<Instruction *> uses;
Function *f2 = instruction->getParent()->getParent(); Function *f2 = instruction->getParent()->getParent();
for (Instruction::use_iterator ui = instruction->use_begin(), for (Instruction::use_iterator ui = instruction->use_begin(),
ue = instruction->use_end(); ue = instruction->use_end();
ui != ue; ++ui) { ui != ue; ++ui) {
llvm::Instruction *user = cast<Instruction>(ui->getUser()); llvm::Instruction *user = cast<Instruction>(ui->getUser());
Function *f1 = user->getParent()->getParent(); Function *f1 = user->getParent()->getParent();
if(f2->getName() != f1->getName()) { if (f2->getName() != f1->getName()) {
continue; continue;
} }
if (user == NULL) if (user == NULL)

View File

@ -89,20 +89,20 @@ void mem_share2global(llvm::Module *M) {
} else if (element_type->isStructTy()) { } else if (element_type->isStructTy()) {
auto undef = llvm::UndefValue::get(element_type); auto undef = llvm::UndefValue::get(element_type);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef, *M, element_type, false, llvm::GlobalValue::ExternalLinkage,
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, undef, new_name, NULL,
false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
global_memory->setDSOLocal(true); global_memory->setDSOLocal(true);
Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName())); Comdat *comdat =
M->getOrInsertComdat(StringRef(share_memory->getName()));
comdat->setSelectionKind(Comdat::SelectionKind::Any); comdat->setSelectionKind(Comdat::SelectionKind::Any);
global_memory->setComdat(comdat); global_memory->setComdat(comdat);
global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage); global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
global_memory->setInitializer(undef); global_memory->setInitializer(undef);
global_memory->setAlignment(share_memory->getAlignment()); global_memory->setAlignment(share_memory->getAlignment());
corresponding_global_memory.insert( corresponding_global_memory.insert(
std::pair<GlobalVariable *, GlobalVariable *>(share_memory, std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory)); global_memory));
} else { } else {
assert(0 && "The required Share Memory Type is not supported\n"); assert(0 && "The required Share Memory Type is not supported\n");

42
examples/dwt2d/common.h Executable file → Normal file
View File

@ -1,16 +1,16 @@
/* /*
* Copyright (c) 2009, Jiri Matela * Copyright (c) 2009, Jiri Matela
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
* *
* * Redistributions of source code must retain the above copyright * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright * * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the * notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution. * documentation and/or other materials provided with the distribution.
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -27,9 +27,9 @@
#ifndef _COMMON_H #ifndef _COMMON_H
#define _COMMON_H #define _COMMON_H
//24-bit multiplication is faster on G80, // 24-bit multiplication is faster on G80,
//but we must be sure to multiply integers // but we must be sure to multiply integers
//only within [-8M, 8M - 1] range // only within [-8M, 8M - 1] range
#define IMUL(a, b) __mul24(a, b) #define IMUL(a, b) __mul24(a, b)
////cuda timing macros ////cuda timing macros
@ -42,21 +42,23 @@
// cudaEventSynchronize(cstop); \ // cudaEventSynchronize(cstop); \
// cudaEventElapsedTime(&elapsedTime, cstart, cstop) // cudaEventElapsedTime(&elapsedTime, cstart, cstop)
//divide and round up macro // divide and round up macro
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b))) #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
# define cudaCheckError( msg ) { \ #define cudaCheckError(msg) \
cudaError_t err = cudaGetLastError(); \ { \
if( cudaSuccess != err) { \ cudaError_t err = cudaGetLastError(); \
fprintf(stderr, "%s: %i: %s: %s.\n", \ if (cudaSuccess != err) { \
__FILE__, __LINE__, msg, cudaGetErrorString( err) ); \ fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg, \
exit(-1); \ cudaGetErrorString(err)); \
} } exit(-1); \
} \
# define cudaCheckAsyncError( msg ) { \ }
cudaThreadSynchronize(); \
cudaCheckError( msg ); \
}
#define cudaCheckAsyncError(msg) \
{ \
cudaThreadSynchronize(); \
cudaCheckError(msg); \
}
#endif #endif

View File

@ -1,16 +1,16 @@
/* /*
* Copyright (c) 2009, Jiri Matela * Copyright (c) 2009, Jiri Matela
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
* *
* * Redistributions of source code must retain the above copyright * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright * * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the * notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution. * documentation and/or other materials provided with the distribution.
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -23,7 +23,7 @@
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <unistd.h> #include <unistd.h>
#include <error.h> #include <error.h>
#include <stdio.h> #include <stdio.h>
@ -50,7 +50,7 @@ __device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int
d_r[pos] = r - 128; d_r[pos] = r - 128;
d_g[pos] = g - 128; d_g[pos] = g - 128;
d_b[pos] = b - 128; d_b[pos] = b - 128;
} }
/* Store float component */ /* Store float component */
__device__ void storeComponent(float *d_c, float c, int pos) __device__ void storeComponent(float *d_c, float c, int pos)
@ -66,8 +66,8 @@ __device__ void storeComponent(int *d_c, int c, int pos)
/* Copy img src data into three separated component buffers */ /* Copy img src data into three separated component buffers */
template<typename T> template<typename T>
__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b, __global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
unsigned char * d_src, unsigned char * d_src,
int pixels) int pixels)
{ {
int x = threadIdx.x; int x = threadIdx.x;
@ -75,8 +75,8 @@ __global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
__shared__ unsigned char sData[THREADS*3]; __shared__ unsigned char sData[THREADS*3];
/* Copy data to shared mem by 4bytes /* Copy data to shared mem by 4bytes
other checks are not necessary, since other checks are not necessary, since
d_src buffer is aligned to sharedDataSize */ d_src buffer is aligned to sharedDataSize */
if ( (x*4) < THREADS*3 ) { if ( (x*4) < THREADS*3 ) {
float *s = (float *)d_src; float *s = (float *)d_src;
@ -107,8 +107,8 @@ __global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
__shared__ unsigned char sData[THREADS]; __shared__ unsigned char sData[THREADS];
/* Copy data to shared mem by 4bytes /* Copy data to shared mem by 4bytes
other checks are not necessary, since other checks are not necessary, since
d_src buffer is aligned to sharedDataSize */ d_src buffer is aligned to sharedDataSize */
if ( (x*4) < THREADS) { if ( (x*4) < THREADS) {
float *s = (float *)d_src; float *s = (float *)d_src;

17
examples/dwt2d/components.h Executable file → Normal file
View File

@ -1,16 +1,16 @@
/* /*
* Copyright (c) 2009, Jiri Matela * Copyright (c) 2009, Jiri Matela
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
* *
* * Redistributions of source code must retain the above copyright * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright * * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the * notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution. * documentation and/or other materials provided with the distribution.
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -28,11 +28,12 @@
#define _COMPONENTS_H #define _COMPONENTS_H
/* Separate compoents of source 8bit RGB image */ /* Separate compoents of source 8bit RGB image */
template<typename T> template <typename T>
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height); void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
int height);
/* Copy a 8bit source image data into a color compoment of type T */ /* Copy a 8bit source image data into a color compoment of type T */
template<typename T> template <typename T>
void bwToComponent(T *d_c, unsigned char * src, int width, int height); void bwToComponent(T *d_c, unsigned char *src, int width, int height);
#endif #endif

View File

@ -1,16 +1,16 @@
/* /*
* Copyright (c) 2009, Jiri Matela * Copyright (c) 2009, Jiri Matela
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
* *
* * Redistributions of source code must retain the above copyright * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright * * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the * notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution. * documentation and/or other materials provided with the distribution.
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -85,25 +85,25 @@ template<typename T>
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward) int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
{ {
printf("\n*** %d stages of 2D forward DWT:\n", stages); printf("\n*** %d stages of 2D forward DWT:\n", stages);
/* create backup of input, because each test iteration overwrites it */ /* create backup of input, because each test iteration overwrites it */
const int size = pixHeight * pixWidth * sizeof(T); const int size = pixHeight * pixWidth * sizeof(T);
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device"); cudaCheckError("Memcopy device to device");
/* Measure time of individual levels. */ /* Measure time of individual levels. */
if(forward) if(forward)
fdwt(in, out, pixWidth, pixHeight, stages); fdwt(in, out, pixWidth, pixHeight, stages);
else else
rdwt(in, out, pixWidth, pixHeight, stages); rdwt(in, out, pixWidth, pixHeight, stages);
// Measure overall time of DWT. // Measure overall time of DWT.
/* #ifdef GPU_DWT_TESTING_1 /* #ifdef GPU_DWT_TESTING_1
dwt_cuda::CudaDWTTester tester; dwt_cuda::CudaDWTTester tester;
for(int i = tester.getNumIterations(); i--; ) { for(int i = tester.getNumIterations(); i--; ) {
// Recover input and measure one overall DWT run. // Recover input and measure one overall DWT run.
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device"); cudaCheckError("Memcopy device to device");
tester.beginTestIteration(); tester.beginTestIteration();
if(forward) if(forward)
@ -113,8 +113,8 @@ int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int st
tester.endTestIteration(); tester.endTestIteration();
} }
tester.showPerformance(" Overall DWT", pixWidth, pixHeight); tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
#endif // GPU_DWT_TESTING #endif // GPU_DWT_TESTING
cudaCheckAsyncError("DWT Kernel calls"); cudaCheckAsyncError("DWT Kernel calls");
*/ return 0; */ return 0;
} }
@ -128,25 +128,25 @@ template<typename T>
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut) int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
{ {
printf("*** %d stages of 2D forward DWT:\n", stages); printf("*** %d stages of 2D forward DWT:\n", stages);
// create backup of input, because each test iteration overwrites it // create backup of input, because each test iteration overwrites it
const int size = pixHeight * pixWidth * sizeof(T); const int size = pixHeight * pixWidth * sizeof(T);
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device"); cudaCheckError("Memcopy device to device");
// Measure time of individual levels. // Measure time of individual levels.
if(forward) if(forward)
fdwt(in, out, pixWidth, pixHeight, stages, diffOut); fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
else else
rdwt(in, out, pixWidth, pixHeight, stages); rdwt(in, out, pixWidth, pixHeight, stages);
// Measure overall time of DWT. // Measure overall time of DWT.
#ifdef GPU_DWT_TESTING_1 #ifdef GPU_DWT_TESTING_1
dwt_cuda::CudaDWTTester tester; dwt_cuda::CudaDWTTester tester;
for(int i = tester.getNumIterations(); i--; ) { for(int i = tester.getNumIterations(); i--; ) {
// Recover input and measure one overall DWT run. // Recover input and measure one overall DWT run.
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device"); cudaCheckError("Memcopy device to device");
tester.beginTestIteration(); tester.beginTestIteration();
if(forward) if(forward)
@ -156,8 +156,8 @@ int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int st
tester.endTestIteration(); tester.endTestIteration();
} }
tester.showPerformance(" Overall DWT", pixWidth, pixHeight); tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
#endif // GPU_DWT_TESTING #endif // GPU_DWT_TESTING
cudaCheckAsyncError("DWT Kernel calls"); cudaCheckAsyncError("DWT Kernel calls");
return 0; return 0;
} }
@ -178,8 +178,8 @@ void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char
for(i = 0; i < samplesNum; i++) { for(i = 0; i < samplesNum; i++) {
float r = (src[i]+0.5f) * 255; float r = (src[i]+0.5f) * 255;
if (r > 255) r = 255; if (r > 255) r = 255;
if (r < 0) r = 0; if (r < 0) r = 0;
dst[i] = (unsigned char)r; dst[i] = (unsigned char)r;
outputFile << "index: " << i << " val: "<< r <<" \n"; outputFile << "index: " << i << " val: "<< r <<" \n";
@ -199,7 +199,7 @@ void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char *
for(i = 0; i < samplesNum; i++) { for(i = 0; i < samplesNum; i++) {
int r = src[i]+128; int r = src[i]+128;
if (r > 255) r = 255; if (r > 255) r = 255;
if (r < 0) r = 0; if (r < 0) r = 0;
dst[i] = (unsigned char)r; dst[i] = (unsigned char)r;
// added this line to output check // added this line to output check
outputFile << "index: " << i << " val: "<< r <<" \n"; outputFile << "index: " << i << " val: "<< r <<" \n";
@ -250,16 +250,16 @@ int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
if(x == 0) return 1; if(x == 0) return 1;
return 0; return 0;
} }
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
/* Write output visual ordered */ /* Write output visual ordered */
template<typename T> template<typename T>
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
int stages, const char * filename, const char * suffix) int stages, const char * filename, const char * suffix)
{ {
struct band { struct band {
int dimX; int dimX;
int dimY; int dimY;
}; };
struct dimensions { struct dimensions {
@ -309,7 +309,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY); printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
} }
#endif #endif
size = samplesNum*sizeof(T); size = samplesNum*sizeof(T);
cudaMallocHost((void **)&src, size); cudaMallocHost((void **)&src, size);
cudaCheckError("Malloc host"); cudaCheckError("Malloc host");
@ -332,7 +332,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY; offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
for (i = 0; i < bandDims[s].HL.dimY; i++) { for (i = 0; i < bandDims[s].HL.dimY; i++) {
memcpy(dst+i*pixWidth+bandDims[s].LL.dimX, memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
src+offset+i*bandDims[s].HL.dimX, src+offset+i*bandDims[s].HL.dimX,
size); size);
} }
@ -342,7 +342,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
yOffset = bandDims[s].LL.dimY; yOffset = bandDims[s].LL.dimY;
for (i = 0; i < bandDims[s].HL.dimY; i++) { for (i = 0; i < bandDims[s].HL.dimY; i++) {
memcpy(dst+(yOffset+i)*pixWidth, memcpy(dst+(yOffset+i)*pixWidth,
src+offset+i*bandDims[s].LH.dimX, src+offset+i*bandDims[s].LH.dimX,
size); size);
} }
@ -352,7 +352,7 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
yOffset = bandDims[s].HL.dimY; yOffset = bandDims[s].HL.dimY;
for (i = 0; i < bandDims[s].HH.dimY; i++) { for (i = 0; i < bandDims[s].HH.dimY; i++) {
memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX, memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
src+offset+i*bandDims[s].HH.dimX, src+offset+i*bandDims[s].HH.dimX,
size); size);
} }
} }
@ -381,5 +381,5 @@ int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
if (x == 0) return 1; if (x == 0) return 1;
return 0; return 0;
} }
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);

25
examples/dwt2d/dwt.h Executable file → Normal file
View File

@ -1,16 +1,16 @@
/* /*
* Copyright (c) 2009, Jiri Matela * Copyright (c) 2009, Jiri Matela
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
* *
* * Redistributions of source code must retain the above copyright * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright * * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the * notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution. * documentation and/or other materials provided with the distribution.
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -27,14 +27,15 @@
#ifndef _DWT_H #ifndef _DWT_H
#define _DWT_H #define _DWT_H
template<typename T> template <typename T>
int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward); int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
int stages, bool forward);
template<typename T> template <typename T>
int writeNStage2DDWT(T *component_cuda, int width, int height, int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
int stages, const char * filename, const char * suffix); const char *filename, const char *suffix);
template<typename T> template <typename T>
int writeLinear(T *component_cuda, int width, int height, int writeLinear(T *component_cuda, int width, int height, const char *filename,
const char * filename, const char * suffix); const char *suffix);
#endif #endif

View File

@ -1,20 +1,20 @@
/// ///
/// @file common.cu /// @file common.cu
/// @author Martin Jirman (207962@mail.muni.cz) /// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-01-20 14:37 /// @date 2011-01-20 14:37
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -27,7 +27,7 @@
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE /// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE. /// POSSIBILITY OF SUCH DAMAGE.
/// ///
#include "common.h" #include "common.h"
namespace dwt_cuda { namespace dwt_cuda {

387
examples/dwt2d/dwt_cuda/common.h Executable file → Normal file
View File

@ -1,4 +1,4 @@
/// ///
/// @file common.h /// @file common.h
/// @author Martin Jirman (207962@mail.muni.cz) /// @author Martin Jirman (207962@mail.muni.cz)
/// @brief Common stuff for all CUDA dwt functions. /// @brief Common stuff for all CUDA dwt functions.
@ -6,16 +6,16 @@
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -29,233 +29,204 @@
/// POSSIBILITY OF SUCH DAMAGE. /// POSSIBILITY OF SUCH DAMAGE.
/// ///
#ifndef DWT_COMMON_H #ifndef DWT_COMMON_H
#define DWT_COMMON_H #define DWT_COMMON_H
#include <cstdio>
#include <algorithm> #include <algorithm>
#include <cstdio>
#include <vector> #include <vector>
// compile time minimum macro // compile time minimum macro
#define CTMIN(a,b) (((a) < (b)) ? (a) : (b)) #define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
// performance testing macros // performance testing macros
#if defined(GPU_DWT_TESTING) #if defined(GPU_DWT_TESTING)
#define PERF_BEGIN \ #define PERF_BEGIN \
{ \ { \
dwt_cuda::CudaDWTTester PERF_TESTER; \ dwt_cuda::CudaDWTTester PERF_TESTER; \
for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \ for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) { \
{ \
PERF_TESTER.beginTestIteration(); PERF_TESTER.beginTestIteration();
#define PERF_END(PERF_NAME, PERF_W, PERF_H) \ #define PERF_END(PERF_NAME, PERF_W, PERF_H) \
PERF_TESTER.endTestIteration(); \ PERF_TESTER.endTestIteration(); \
} \ } \
PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \ PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
} }
#else // GPU_DWT_TESTING #else // GPU_DWT_TESTING
#define PERF_BEGIN #define PERF_BEGIN
#define PERF_END(PERF_NAME, PERF_W, PERF_H) #define PERF_END(PERF_NAME, PERF_W, PERF_H)
#endif // GPU_DWT_TESTING #endif // GPU_DWT_TESTING
namespace dwt_cuda { namespace dwt_cuda {
/// Divide and round up.
/// Divide and round up. template <typename T>
template <typename T> __device__ __host__ inline T divRndUp(const T &n, const T &d) {
__device__ __host__ inline T divRndUp(const T & n, const T & d) { return (n / d) + ((n % d) ? 1 : 0);
return (n / d) + ((n % d) ? 1 : 0); }
// 9/7 forward DWT lifting schema coefficients
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
// 9/7 reverse DWT lifting schema coefficients
const float r97update2 = -f97Update2; ///< undo 9/7 update 2
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
const float r97update1 = -f97Update1; ///< undo 9/7 update 1
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
// FDWT 9/7 scaling coefficients
const float scale97Mul = 1.23017410491400f;
const float scale97Div = 1.0 / scale97Mul;
// 5/3 forward DWT lifting schema coefficients
const float forward53Predict = -0.5f; /// forward 5/3 predict
const float forward53Update = 0.25f; /// forward 5/3 update
// 5/3 forward DWT lifting schema coefficients
const float reverse53Update = -forward53Update; /// undo 5/3 update
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
/// Functor which adds scaled sum of neighbors to given central pixel.
struct AddScaledSum {
const float scale; // scale of neighbors
__device__ AddScaledSum(const float scale) : scale(scale) {}
__device__ void operator()(const float p, float &c, const float n) const {
// if(threadIdx.x == 0) {
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
// scale * (p + n) );
// }
c += scale * (p + n);
} }
};
// 9/7 forward DWT lifting schema coefficients
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
/// Returns index ranging from 0 to num threads, such that first half
/// of threads get even indices and others get odd indices. Each thread
/// gets different index.
/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7
/// parityIdx: 0 2 4 6 1 3 5 7
/// @tparam THREADS total count of participating threads
/// @return parity-separated index of thread
template <int THREADS> __device__ inline int parityIdx() {
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
}
// 9/7 reverse DWT lifting schema coefficients /// size of shared memory
const float r97update2 = -f97Update2; ///< undo 9/7 update 2 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2 const int SHM_SIZE = 48 * 1024;
const float r97update1 = -f97Update1; ///< undo 9/7 update 1 #else
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1 const int SHM_SIZE = 16 * 1024;
#endif
// FDWT 9/7 scaling coefficients
const float scale97Mul = 1.23017410491400f;
const float scale97Div = 1.0 / scale97Mul;
// 5/3 forward DWT lifting schema coefficients
const float forward53Predict = -0.5f; /// forward 5/3 predict
const float forward53Update = 0.25f; /// forward 5/3 update
// 5/3 forward DWT lifting schema coefficients
const float reverse53Update = -forward53Update; /// undo 5/3 update
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
/// Functor which adds scaled sum of neighbors to given central pixel.
struct AddScaledSum {
const float scale; // scale of neighbors
__device__ AddScaledSum(const float scale) : scale(scale) {}
__device__ void operator()(const float p, float & c, const float n) const {
// if(threadIdx.x == 0) { /// Perrformance and return code tester.
class CudaDWTTester {
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) ); private:
static bool testRunning; ///< true if any test is currently running
// } cudaEvent_t beginEvent; ///< begin CUDA event
cudaEvent_t endEvent; ///< end CUDA event
c += scale * (p + n); std::vector<float> times; ///< collected times
const bool disabled; ///< true if this object is disabled
public:
/// Checks CUDA related error.
/// @param status return code to be checked
/// @param message message to be shown if there was an error
/// @return true if there was no error, false otherwise
static bool check(const cudaError_t &status, const char *message) {
#if defined(GPU_DWT_TESTING)
if ((!testRunning) && status != cudaSuccess) {
const char *errorString = cudaGetErrorString(status);
fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
fflush(stderr);
return false;
} }
}; #endif // GPU_DWT_TESTING
return true;
/// Returns index ranging from 0 to num threads, such that first half
/// of threads get even indices and others get odd indices. Each thread
/// gets different index.
/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7
/// parityIdx: 0 2 4 6 1 3 5 7
/// @tparam THREADS total count of participating threads
/// @return parity-separated index of thread
template <int THREADS>
__device__ inline int parityIdx() {
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
} }
/// size of shared memory
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int SHM_SIZE = 48 * 1024;
#else
const int SHM_SIZE = 16 * 1024;
#endif
/// Perrformance and return code tester.
class CudaDWTTester {
private:
static bool testRunning; ///< true if any test is currently running
cudaEvent_t beginEvent; ///< begin CUDA event
cudaEvent_t endEvent; ///< end CUDA event
std::vector<float> times; ///< collected times
const bool disabled; ///< true if this object is disabled
public:
/// Checks CUDA related error.
/// @param status return code to be checked
/// @param message message to be shown if there was an error
/// @return true if there was no error, false otherwise
static bool check(const cudaError_t & status, const char * message) {
#if defined(GPU_DWT_TESTING)
if((!testRunning) && status != cudaSuccess) {
const char * errorString = cudaGetErrorString(status);
fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
fflush(stderr);
return false;
}
#endif // GPU_DWT_TESTING
return true;
}
/// Checks last kernel call for errors. /// Checks last kernel call for errors.
/// @param message description of the kernel call /// @param message description of the kernel call
/// @return true if there was no error, false otherwise /// @return true if there was no error, false otherwise
static bool checkLastKernelCall(const char * message) { static bool checkLastKernelCall(const char *message) {
#if defined(GPU_DWT_TESTING) #if defined(GPU_DWT_TESTING)
return testRunning ? true : check(cudaThreadSynchronize(), message); return testRunning ? true : check(cudaThreadSynchronize(), message);
#else // GPU_DWT_TESTING #else // GPU_DWT_TESTING
return true; return true;
#endif // GPU_DWT_TESTING #endif // GPU_DWT_TESTING
}
/// Initializes DWT tester for time measurement
CudaDWTTester() : disabled(testRunning) {}
/// Gets rpefered number of iterations
int getNumIterations() {
return disabled ? 1 : 31;
}
/// Starts one test iteration.
void beginTestIteration() {
if(!disabled) {
cudaEventCreate(&beginEvent);
cudaEventCreate(&endEvent);
cudaEventRecord(beginEvent, 0);
testRunning = true;
}
}
/// Ends on etest iteration.
void endTestIteration() {
if(!disabled) {
float time;
testRunning = false;
cudaEventRecord(endEvent, 0);
cudaEventSynchronize(endEvent);
cudaEventElapsedTime(&time, beginEvent, endEvent);
cudaEventDestroy(beginEvent);
cudaEventDestroy(endEvent);
times.push_back(time);
}
}
/// Shows brief info about all iterations.
/// @param name name of processing method
/// @param sizeX width of processed image
/// @param sizeY height of processed image
void showPerformance(const char * name, const int sizeX, const int sizeY) {
if(!disabled) {
// compute mean and median
std::sort(times.begin(), times.end());
double sum = 0;
for(int i = times.size(); i--; ) {
sum += times[i];
}
const double median = (times[times.size() / 2]
+ times[(times.size() - 1) / 2]) * 0.5f;
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
"(%d x %d)\n", name, (sum / times.size()), median,
times[times.size() - 1], sizeX, sizeY);
}
}
};
/// Simple cudaMemcpy wrapped in performance tester.
/// @param dest destination bufer
/// @param src source buffer
/// @param sx width of copied image
/// @param sy height of copied image
template <typename T>
inline void memCopy(T * const dest, const T * const src,
const size_t sx, const size_t sy) {
cudaError_t status;
PERF_BEGIN
status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice);
PERF_END(" memcpy", sx, sy)
CudaDWTTester::check(status, "memcpy device > device");
} }
/// Initializes DWT tester for time measurement
CudaDWTTester() : disabled(testRunning) {}
/// Gets rpefered number of iterations
int getNumIterations() { return disabled ? 1 : 31; }
/// Starts one test iteration.
void beginTestIteration() {
if (!disabled) {
cudaEventCreate(&beginEvent);
cudaEventCreate(&endEvent);
cudaEventRecord(beginEvent, 0);
testRunning = true;
}
}
/// Ends on etest iteration.
void endTestIteration() {
if (!disabled) {
float time;
testRunning = false;
cudaEventRecord(endEvent, 0);
cudaEventSynchronize(endEvent);
cudaEventElapsedTime(&time, beginEvent, endEvent);
cudaEventDestroy(beginEvent);
cudaEventDestroy(endEvent);
times.push_back(time);
}
}
/// Shows brief info about all iterations.
/// @param name name of processing method
/// @param sizeX width of processed image
/// @param sizeY height of processed image
void showPerformance(const char *name, const int sizeX, const int sizeY) {
if (!disabled) {
// compute mean and median
std::sort(times.begin(), times.end());
double sum = 0;
for (int i = times.size(); i--;) {
sum += times[i];
}
const double median =
(times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
"(%d x %d)\n",
name, (sum / times.size()), median, times[times.size() - 1], sizeX,
sizeY);
}
}
};
/// Simple cudaMemcpy wrapped in performance tester.
/// @param dest destination bufer
/// @param src source buffer
/// @param sx width of copied image
/// @param sy height of copied image
template <typename T>
inline void memCopy(T *const dest, const T *const src, const size_t sx,
const size_t sy) {
cudaError_t status;
PERF_BEGIN
status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice);
PERF_END(" memcpy", sx, sy)
CudaDWTTester::check(status, "memcpy device > device");
}
} // end of namespace dwt_cuda } // end of namespace dwt_cuda
#endif // DWT_COMMON_CUDA_H
#endif // DWT_COMMON_CUDA_H

99
examples/dwt2d/dwt_cuda/dwt.h Executable file → Normal file
View File

@ -1,4 +1,4 @@
/// ///
/// @file dwt.h /// @file dwt.h
/// @author Martin Jirman (207962@mail.muni.cz) /// @author Martin Jirman (207962@mail.muni.cz)
/// @brief Entry points for CUDA implementaion of 9/7 and 5/3 DWT. /// @brief Entry points for CUDA implementaion of 9/7 and 5/3 DWT.
@ -8,16 +8,16 @@
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -56,57 +56,48 @@
/// ///
#ifndef DWT_CUDA_H #ifndef DWT_CUDA_H
#define DWT_CUDA_H #define DWT_CUDA_H
namespace dwt_cuda { namespace dwt_cuda {
/// Forward 5/3 2D DWT. See common rules (above) for more details.
/// Forward 5/3 2D DWT. See common rules (above) for more details. /// @param in Expected to be normalized into range [-128, 127].
/// @param in Expected to be normalized into range [-128, 127]. /// Will not be preserved (will be overwritten).
/// Will not be preserved (will be overwritten). /// @param out output buffer on GPU
/// @param out output buffer on GPU /// @param sizeX width of input image (in pixels)
/// @param sizeX width of input image (in pixels) /// @param sizeY height of input image (in pixels)
/// @param sizeY height of input image (in pixels) /// @param levels number of recursive DWT levels
/// @param levels number of recursive DWT levels void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Format described in common rules.
/// Reverse 5/3 2D DWT. See common rules (above) for more details. /// Will not be preserved (will be overwritten).
/// @param in Input DWT coefficients. Format described in common rules. /// @param out output buffer on GPU - will contain original image
/// Will not be preserved (will be overwritten). /// in normalized range [-128, 127].
/// @param out output buffer on GPU - will contain original image /// @param sizeX width of input image (in pixels)
/// in normalized range [-128, 127]. /// @param sizeY height of input image (in pixels)
/// @param sizeX width of input image (in pixels) /// @param levels number of recursive DWT levels
/// @param sizeY height of input image (in pixels) void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
/// @param levels number of recursive DWT levels
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels); /// Forward 9/7 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Should be normalized (in range
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
/// Forward 9/7 2D DWT. See common rules (above) for more details. /// @param out output buffer on GPU - format specified in common rules
/// @param in Input DWT coefficients. Should be normalized (in range /// @param sizeX width of input image (in pixels)
/// [-0.5, 0.5]). Will not be preserved (will be overwritten). /// @param sizeY height of input image (in pixels)
/// @param out output buffer on GPU - format specified in common rules /// @param levels number of recursive DWT levels
/// @param sizeX width of input image (in pixels) void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels /// Reverse 9/7 2D DWT. See common rules (above) for more details.
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels); /// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - will contain original image
/// Reverse 9/7 2D DWT. See common rules (above) for more details. /// in normalized range [-0.5, 0.5].
/// @param in Input DWT coefficients. Format described in common rules. /// @param sizeX width of input image (in pixels)
/// Will not be preserved (will be overwritten). /// @param sizeY height of input image (in pixels)
/// @param out output buffer on GPU - will contain original image /// @param levels number of recursive DWT levels
/// in normalized range [-0.5, 0.5]. void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
} // namespace dwt_cuda } // namespace dwt_cuda
#endif // DWT_CUDA_H
#endif // DWT_CUDA_H

View File

@ -6,16 +6,16 @@
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -44,7 +44,7 @@ namespace dwt_cuda {
template <int WIN_SIZE_X, int WIN_SIZE_Y> template <int WIN_SIZE_X, int WIN_SIZE_Y>
class FDWT53 { class FDWT53 {
private: private:
/// Info needed for processing of one input column. /// Info needed for processing of one input column.
/// @tparam CHECKED_LOADER true if column's loader should check boundaries /// @tparam CHECKED_LOADER true if column's loader should check boundaries
/// false if there are no near boudnaries to check /// false if there are no near boudnaries to check
@ -52,13 +52,13 @@ namespace dwt_cuda {
struct FDWT53Column { struct FDWT53Column {
/// loader for the column /// loader for the column
VerticalDWTPixelLoader<int, CHECKED_LOADER> loader; VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;
/// offset of the column in shared buffer /// offset of the column in shared buffer
int offset; int offset;
// backup of first 3 loaded pixels (not transformed) // backup of first 3 loaded pixels (not transformed)
int pixel0, pixel1, pixel2; int pixel0, pixel1, pixel2;
/// Sets all fields to anything to prevent 'uninitialized' warnings. /// Sets all fields to anything to prevent 'uninitialized' warnings.
__device__ void clear() { __device__ void clear() {
offset = pixel0 = pixel1 = pixel2 = 0; offset = pixel0 = pixel1 = pixel2 = 0;
@ -104,7 +104,7 @@ namespace dwt_cuda {
/// @param colIndex x-axis coordinate of the column (relative to the left /// @param colIndex x-axis coordinate of the column (relative to the left
/// side of this threadblock's block of input pixels) /// side of this threadblock's block of input pixels)
/// @param firstY y-axis coordinate of first image row to be transformed /// @param firstY y-axis coordinate of first image row to be transformed
template <bool CHECKED> template <bool CHECKED>
__device__ void initColumn(FDWT53Column<CHECKED> & column, __device__ void initColumn(FDWT53Column<CHECKED> & column,
const int * const input, const int * const input,
@ -137,7 +137,7 @@ namespace dwt_cuda {
column.pixel2 = column.loader.loadFrom(input); column.pixel2 = column.loader.loadFrom(input);
// Now, the next pixel, which will be loaded by loader, is pixel #1. // Now, the next pixel, which will be loaded by loader, is pixel #1.
} }
} }
@ -153,14 +153,14 @@ namespace dwt_cuda {
buffer[column.offset + 0 * STRIDE] = column.pixel0; buffer[column.offset + 0 * STRIDE] = column.pixel0;
buffer[column.offset + 1 * STRIDE] = column.pixel1; buffer[column.offset + 1 * STRIDE] = column.pixel1;
buffer[column.offset + 2 * STRIDE] = column.pixel2; buffer[column.offset + 2 * STRIDE] = column.pixel2;
// load remaining pixels to be able to vertically transform the window // load remaining pixels to be able to vertically transform the window
for(int i = 3; i < (3 + WIN_SIZE_Y); i++) for(int i = 3; i < (3 + WIN_SIZE_Y); i++)
{ {
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input); buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
} }
// remember last 3 pixels for use in next iteration // remember last 3 pixels for use in next iteration
column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE]; column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE];
column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE]; column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE];
@ -169,7 +169,7 @@ namespace dwt_cuda {
// vertically transform the column in transform buffer // vertically transform the column in transform buffer
buffer.forEachVerticalOdd(column.offset, Forward53Predict()); buffer.forEachVerticalOdd(column.offset, Forward53Predict());
buffer.forEachVerticalEven(column.offset, Forward53Update()); buffer.forEachVerticalEven(column.offset, Forward53Update());
} }
@ -178,7 +178,7 @@ namespace dwt_cuda {
/// @tparam CHECK_WRITES true if output writer must check boundaries /// @tparam CHECK_WRITES true if output writer must check boundaries
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sizeX width of the input image /// @param sizeX width of the input image
/// @param sizeY height of the input image /// @param sizeY height of the input image
/// @param winSteps number of sliding window steps /// @param winSteps number of sliding window steps
template <bool CHECK_LOADS, bool CHECK_WRITES> template <bool CHECK_LOADS, bool CHECK_WRITES>
@ -186,15 +186,15 @@ namespace dwt_cuda {
const int sizeX, const int sizeY, const int sizeX, const int sizeY,
const int winSteps) { const int winSteps) {
// info about one main and one boundary columns processed by this thread // info about one main and one boundary columns processed by this thread
FDWT53Column<CHECK_LOADS> column; FDWT53Column<CHECK_LOADS> column;
FDWT53Column<CHECK_LOADS> boundaryColumn; // only few threads use this FDWT53Column<CHECK_LOADS> boundaryColumn; // only few threads use this
// Initialize all column info: initialize loaders, compute offset of // Initialize all column info: initialize loaders, compute offset of
// column in shared buffer and initialize loader of column. // column in shared buffer and initialize loader of column.
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps; const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); //has been checked Mar 9th initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); //has been checked Mar 9th
// first 3 threads initialize boundary columns, others do not use them // first 3 threads initialize boundary columns, others do not use them
boundaryColumn.clear(); boundaryColumn.clear();
if(threadIdx.x < 3) { if(threadIdx.x < 3) {
@ -205,9 +205,9 @@ namespace dwt_cuda {
initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY); initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
} }
// index of column which will be written into output by this thread // index of column which will be written into output by this thread
const int outColumnIndex = parityIdx<WIN_SIZE_X>(); const int outColumnIndex = parityIdx<WIN_SIZE_X>();
// offset of column which will be written by this thread into output // offset of column which will be written by this thread into output
@ -219,7 +219,7 @@ namespace dwt_cuda {
writer.init(sizeX, sizeY, outputFirstX, firstY); writer.init(sizeX, sizeY, outputFirstX, firstY);
__syncthreads(); __syncthreads();
// Sliding window iterations: // Sliding window iterations:
// Each iteration assumes that first 3 pixels of each column are loaded. // Each iteration assumes that first 3 pixels of each column are loaded.
for(int w = 0; w < winSteps; w++) { for(int w = 0; w < winSteps; w++) {
@ -227,23 +227,23 @@ namespace dwt_cuda {
// For each column (including boundary columns): load and vertically // For each column (including boundary columns): load and vertically
// transform another WIN_SIZE_Y lines. // transform another WIN_SIZE_Y lines.
loadAndVerticallyTransform(column, in); loadAndVerticallyTransform(column, in);
if(threadIdx.x < 3) { if(threadIdx.x < 3) {
loadAndVerticallyTransform(boundaryColumn, in); loadAndVerticallyTransform(boundaryColumn, in);
} }
// wait for all columns to be vertically transformed and transform all // wait for all columns to be vertically transformed and transform all
// output rows horizontally // output rows horizontally
__syncthreads(); __syncthreads();
buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict()); buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
__syncthreads(); __syncthreads();
buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update()); buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());
// wait for all output rows to be transformed horizontally and write // wait for all output rows to be transformed horizontally and write
// them into output buffer // them into output buffer
__syncthreads(); __syncthreads();
for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) { for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
@ -256,20 +256,20 @@ namespace dwt_cuda {
// before proceeding to next iteration, wait for all output columns // before proceeding to next iteration, wait for all output columns
// to be written into the output // to be written into the output
__syncthreads(); __syncthreads();
} }
} }
public: public:
/// Determines, whether this block's pixels touch boundary and selects /// Determines, whether this block's pixels touch boundary and selects
/// right version of algorithm according to it - for many threadblocks, it /// right version of algorithm according to it - for many threadblocks, it
/// selects version which does not deal with boundary mirroring and thus is /// selects version which does not deal with boundary mirroring and thus is
/// slightly faster. /// slightly faster.
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sx width of the input image /// @param sx width of the input image
/// @param sy height of the input image /// @param sy height of the input image
/// @param steps number of sliding window steps /// @param steps number of sliding window steps
__device__ static void run(const int * const in, int * const out, __device__ static void run(const int * const in, int * const out,
@ -292,32 +292,32 @@ namespace dwt_cuda {
// if(threadIdx.x == 0) { // if(threadIdx.x == 0) {
// printf("fdwt53 run"); // printf("fdwt53 run");
// } // }
if(atBottomBoudary) if(atBottomBoudary)
{ {
// near bottom boundary => check both writing and reading // near bottom boundary => check both writing and reading
fdwt53.transform<true, true>(in, out, sx, sy, steps); fdwt53.transform<true, true>(in, out, sx, sy, steps);
} else if(atRightBoudary) } else if(atRightBoudary)
{ {
// near right boundary only => check writing only // near right boundary only => check writing only
fdwt53.transform<false, true>(in, out, sx, sy, steps); fdwt53.transform<false, true>(in, out, sx, sy, steps);
} else } else
{ {
// no nearby boundary => check nothing // no nearby boundary => check nothing
fdwt53.transform<false, false>(in, out, sx, sy, steps); fdwt53.transform<false, false>(in, out, sx, sy, steps);
} }
} }
// } // }
}; // end of class FDWT53 }; // end of class FDWT53
/// Main GPU 5/3 FDWT entry point. /// Main GPU 5/3 FDWT entry point.
/// @tparam WIN_SX width of sliding window to be used /// @tparam WIN_SX width of sliding window to be used
/// @tparam WIN_SY height of sliding window to be used /// @tparam WIN_SY height of sliding window to be used
/// @param input input image /// @param input input image
/// @param output output buffer /// @param output output buffer
/// @param sizeX width of the input image /// @param sizeX width of the input image
/// @param sizeY height of the input image /// @param sizeY height of the input image
/// @param winSteps number of sliding window steps /// @param winSteps number of sliding window steps
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
@ -328,20 +328,20 @@ namespace dwt_cuda {
FDWT53<WIN_SX, WIN_SY>::run(input, output, sizeX, sizeY, winSteps); FDWT53<WIN_SX, WIN_SY>::run(input, output, sizeX, sizeY, winSteps);
} }
/// Only computes optimal number of sliding window steps,
/// Only computes optimal number of sliding window steps,
/// number of threadblocks and then lanches the 5/3 FDWT kernel. /// number of threadblocks and then lanches the 5/3 FDWT kernel.
/// @tparam WIN_SX width of sliding window /// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window /// @tparam WIN_SY height of sliding window
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sx width of the input image /// @param sx width of the input image
/// @param sy height of the input image /// @param sy height of the input image
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
void launchFDWT53Kernel (int * in, int * out, int sx, int sy) { void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
// compute optimal number of steps of each sliding window // compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY); const int steps = divRndUp(sy, 15 * WIN_SY);
int gx = divRndUp(sx, WIN_SX); int gx = divRndUp(sx, WIN_SX);
@ -352,18 +352,18 @@ namespace dwt_cuda {
// prepare grid size // prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
// printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX); // printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
// run kernel, possibly measure time and finally check the call // run kernel, possibly measure time and finally check the call
// PERF_BEGIN // PERF_BEGIN
fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps); fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
// PERF_END(" FDWT53", sx, sy) // PERF_END(" FDWT53", sx, sy)
// CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel"); // CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel");
printf("fdwt53Kernel in launchFDWT53Kernel has finished"); printf("fdwt53Kernel in launchFDWT53Kernel has finished");
} }
/// Forward 5/3 2D DWT. See common rules (above) for more details. /// Forward 5/3 2D DWT. See common rules (above) for more details.
/// @param in Expected to be normalized into range [-128, 127]. /// @param in Expected to be normalized into range [-128, 127].
/// Will not be preserved (will be overwritten). /// Will not be preserved (will be overwritten).
@ -373,7 +373,7 @@ namespace dwt_cuda {
/// @param levels number of recursive DWT levels /// @param levels number of recursive DWT levels
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) { void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
// select right width of kernel for the size of the image // select right width of kernel for the size of the image
if(sizeX >= 960) { if(sizeX >= 960) {
launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY); launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
} else if (sizeX >= 480) { } else if (sizeX >= 480) {
@ -381,20 +381,20 @@ namespace dwt_cuda {
} else { } else {
launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY); launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
} }
// if this was not the last level, continue recursively with other levels // if this was not the last level, continue recursively with other levels
if(levels > 1) { if(levels > 1) {
// copy output's LL band back into input buffer // copy output's LL band back into input buffer
const int llSizeX = divRndUp(sizeX, 2); const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2); const int llSizeY = divRndUp(sizeY, 2);
// printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY); // printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY);
memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238 memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238
// run remaining levels of FDWT // run remaining levels of FDWT
fdwt53(in, out, llSizeX, llSizeY, levels - 1); fdwt53(in, out, llSizeX, llSizeY, levels - 1);
} }
} }
} // end of namespace dwt_cuda } // end of namespace dwt_cuda

View File

@ -1,4 +1,4 @@
/// ///
/// @file fdwt97.cu /// @file fdwt97.cu
/// @brief CUDA implementation of forward 9/7 2D DWT. /// @brief CUDA implementation of forward 9/7 2D DWT.
/// @author Martin Jirman (207962@mail.muni.cz) /// @author Martin Jirman (207962@mail.muni.cz)
@ -7,16 +7,16 @@
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -38,8 +38,8 @@
namespace dwt_cuda { namespace dwt_cuda {
/// Wraps a buffer and methods for computing 9/7 FDWT with sliding window /// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
/// of specified size. Template arguments specify this size. /// of specified size. Template arguments specify this size.
/// @tparam WIN_SIZE_X width of sliding window /// @tparam WIN_SIZE_X width of sliding window
@ -62,8 +62,8 @@ namespace dwt_cuda {
template <bool CHECKED> template <bool CHECKED>
struct FDWT97ColumnLoadingInfo { struct FDWT97ColumnLoadingInfo {
/// Loader of pixels from some input image. /// Loader of pixels from some input image.
VerticalDWTPixelLoader<float, CHECKED> loader; VerticalDWTPixelLoader<float, CHECKED> loader;
/// Offset of column loaded by loader. (Offset in shared buffer.) /// Offset of column loaded by loader. (Offset in shared buffer.)
int offset; int offset;
}; };
@ -103,7 +103,7 @@ namespace dwt_cuda {
/// @param firstY index of first row to be loaded from image /// @param firstY index of first row to be loaded from image
template <bool CHECKED> template <bool CHECKED>
__device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column, __device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
const int columnIndex, const float * const input, const int columnIndex, const float * const input,
const int sizeX, const int sizeY, const int sizeX, const int sizeY,
const int firstY) { const int firstY) {
// get offset of the column with index 'columnIndex' // get offset of the column with index 'columnIndex'
@ -113,7 +113,7 @@ namespace dwt_cuda {
// x-coordinate of the first pixel to be loaded by given loader // x-coordinate of the first pixel to be loaded by given loader
const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex; const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;
if(blockIdx.y == 0) { if(blockIdx.y == 0) {
// topmost block - apply mirroring rules when loading first 7 rows // topmost block - apply mirroring rules when loading first 7 rows
column.loader.init(sizeX, sizeY, firstX, firstY); column.loader.init(sizeX, sizeY, firstX, firstY);
@ -162,7 +162,7 @@ namespace dwt_cuda {
/// @tparam CHECK_WRITES true if boundaries should be checked when writing /// @tparam CHECK_WRITES true if boundaries should be checked when writing
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sizeX width of the input image /// @param sizeX width of the input image
/// @param sizeY height of the input image /// @param sizeY height of the input image
/// @param winSteps number of steps of sliding window /// @param winSteps number of steps of sliding window
template <bool CHECK_LOADS, bool CHECK_WRITES> template <bool CHECK_LOADS, bool CHECK_WRITES>
@ -205,7 +205,7 @@ namespace dwt_cuda {
// transform buffer offset of column transformed and saved by this thread // transform buffer offset of column transformed and saved by this thread
const int outColumnOffset = buffer.getColumnOffset(outColumnIndex); const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
// (Each iteration of this loop assumes that first 7 rows of transform // (Each iteration of this loop assumes that first 7 rows of transform
// buffer are already loaded with horizontally transformed coefficients.) // buffer are already loaded with horizontally transformed coefficients.)
for(int w = 0; w < winSteps; w++) { for(int w = 0; w < winSteps; w++) {
// Load another WIN_SIZE_Y lines of thread's column into the buffer. // Load another WIN_SIZE_Y lines of thread's column into the buffer.
@ -220,7 +220,7 @@ namespace dwt_cuda {
horizontalFDWT97(WIN_SIZE_Y, 7); horizontalFDWT97(WIN_SIZE_Y, 7);
// Using 7 registers, remember current values of last 7 rows of // Using 7 registers, remember current values of last 7 rows of
// transform buffer. These rows are transformed horizontally only // transform buffer. These rows are transformed horizontally only
// and will be used in next iteration. // and will be used in next iteration.
float last7Lines[7]; float last7Lines[7];
for(int i = 0; i < 7; i++) { for(int i = 0; i < 7; i++) {
@ -249,7 +249,7 @@ namespace dwt_cuda {
// As expected, these lines are already horizontally transformed. // As expected, these lines are already horizontally transformed.
for(int i = 0; i < 7; i++) { for(int i = 0; i < 7; i++) {
buffer[outColumnOffset + i * STRIDE] = last7Lines[i]; buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
} }
// Wait for all writing threads before proceeding to loading new // Wait for all writing threads before proceeding to loading new
@ -259,15 +259,15 @@ namespace dwt_cuda {
} }
} }
public: public:
/// Runs one of specialized variants of 9/7 FDWT according to distance of /// Runs one of specialized variants of 9/7 FDWT according to distance of
/// processed pixels to image boudnary. Some variants do not check for /// processed pixels to image boudnary. Some variants do not check for
/// boudnary and thus are slightly faster. /// boudnary and thus are slightly faster.
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sx width of the input image /// @param sx width of the input image
/// @param sy height of the input image /// @param sy height of the input image
/// @param steps number of steps of sliding window /// @param steps number of steps of sliding window
__device__ static void run(const float * const input, float * const output, __device__ static void run(const float * const input, float * const output,
@ -299,15 +299,15 @@ namespace dwt_cuda {
fdwt97.transform<false, false>(input, output, sx, sy, steps); fdwt97.transform<false, false>(input, output, sx, sy, steps);
} }
} }
}; // end of class FDWT97 }; // end of class FDWT97
/// Main GPU 9/7 FDWT entry point. /// Main GPU 9/7 FDWT entry point.
/// @param input input image /// @param input input image
/// @parma output output buffer /// @parma output output buffer
/// @param sx width of the input image /// @param sx width of the input image
/// @param sy height of the input image /// @param sy height of the input image
/// @param steps number of steps of sliding window /// @param steps number of steps of sliding window
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
@ -321,21 +321,21 @@ namespace dwt_cuda {
FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps); FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
} }
/// Only computes optimal number of sliding window steps, /// Only computes optimal number of sliding window steps,
/// number of threadblocks and then lanches the 9/7 FDWT kernel. /// number of threadblocks and then lanches the 9/7 FDWT kernel.
/// @tparam WIN_SX width of sliding window /// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window /// @tparam WIN_SY height of sliding window
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sx width of the input image /// @param sx width of the input image
/// @param sy height of the input image /// @param sy height of the input image
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
void launchFDWT97Kernel (float * in, float * out, int sx, int sy) { void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
// compute optimal number of steps of each sliding window // compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY); const int steps = divRndUp(sy, 15 * WIN_SY);
// prepare grid size // prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX); printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
@ -346,11 +346,11 @@ namespace dwt_cuda {
PERF_END(" FDWT97", sx, sy) PERF_END(" FDWT97", sx, sy)
CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel"); CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
} }
/// Forward 9/7 2D DWT. See common rules (dwt.h) for more details. /// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
/// @param in Input DWT coefficients. Should be normalized (in range /// @param in Input DWT coefficients. Should be normalized (in range
/// [-0.5, 0.5]). Will not be preserved (will be overwritten). /// [-0.5, 0.5]). Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - format specified in common rules /// @param out output buffer on GPU - format specified in common rules
/// @param sizeX width of input image (in pixels) /// @param sizeX width of input image (in pixels)
@ -365,19 +365,19 @@ namespace dwt_cuda {
} else { } else {
launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY); launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
} }
// if this was not the last level, continue recursively with other levels // if this was not the last level, continue recursively with other levels
if(levels > 1) { if(levels > 1) {
// copy output's LL band back into input buffer // copy output's LL band back into input buffer
const int llSizeX = divRndUp(sizeX, 2); const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2); const int llSizeY = divRndUp(sizeY, 2);
memCopy(in, out, llSizeX, llSizeY); memCopy(in, out, llSizeX, llSizeY);
// run remaining levels of FDWT // run remaining levels of FDWT
fdwt97(in, out, llSizeX, llSizeY, levels - 1); fdwt97(in, out, llSizeX, llSizeY, levels - 1);
} }
} }
} // end of namespace dwt_cuda } // end of namespace dwt_cuda

801
examples/dwt2d/dwt_cuda/io.h Executable file → Normal file
View File

@ -3,20 +3,20 @@
/// @brief Manages loading and saving lineary stored bands and input images. /// @brief Manages loading and saving lineary stored bands and input images.
/// @author Martin Jirman (207962@mail.muni.cz) /// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-01-20 22:38 /// @date 2011-01-20 22:38
/// ///
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -30,454 +30,411 @@
/// POSSIBILITY OF SUCH DAMAGE. /// POSSIBILITY OF SUCH DAMAGE.
/// ///
#ifndef IO_H #ifndef IO_H
#define IO_H #define IO_H
#include "common.h" #include "common.h"
namespace dwt_cuda { namespace dwt_cuda {
/// Base for all IO classes - manages mirroring.
/// Base for all IO classes - manages mirroring. class DWTIO {
class DWTIO { protected:
protected: /// Handles mirroring of image at edges in a DWT correct way.
/// Handles mirroring of image at edges in a DWT correct way. /// @param d a position in the image (will be replaced by mirrored d)
/// @param d a position in the image (will be replaced by mirrored d) /// @param sizeD size of the image along the dimension of 'd'
/// @param sizeD size of the image along the dimension of 'd' __device__ static void mirror(int &d, const int &sizeD) {
__device__ static void mirror(int & d, const int & sizeD) { // TODO: enable multiple mirroring:
// TODO: enable multiple mirroring: // if(sizeD > 1) {
// if(sizeD > 1) { // if(d < 0) {
// if(d < 0) { // const int underflow = -1 - d;
// const int underflow = -1 - d; // const int phase = (underflow / (sizeD - 1)) & 1;
// const int phase = (underflow / (sizeD - 1)) & 1; // const int remainder = underflow % (sizeD - 1);
// const int remainder = underflow % (sizeD - 1); // if(phase == 0) {
// if(phase == 0) { // d = remainder + 1;
// d = remainder + 1; // } else {
// } else { // d = sizeD - 2 - remainder;
// d = sizeD - 2 - remainder; // }
// } // } else if(d >= sizeD) {
// } else if(d >= sizeD) { // const int overflow = d - sizeD;
// const int overflow = d - sizeD; // const int phase = (overflow / (sizeD - 1)) & 1;
// const int phase = (overflow / (sizeD - 1)) & 1; // const int remainder = overflow % (sizeD - 1);
// const int remainder = overflow % (sizeD - 1); // if(phase == 0) {
// if(phase == 0) { // d = sizeD - 2 - remainder;
// d = sizeD - 2 - remainder; // } else {
// } else { // d = remainder + 1;
// d = remainder + 1; // }
// } // }
// } // } else {
// } else { // d = 0;
// d = 0; // }
// } // for test the mirror's use Feb 17
//for test the mirror's use Feb 17 if (d >= sizeD) {
if(d >= sizeD) { d = 2 * sizeD - 2 - d;
d = 2 * sizeD - 2 - d; } else if (d < 0) {
} else if(d < 0) { d = -d;
d = -d;
}
} }
}; }
};
/// Base class for pixel loader and writer - manages computing start index,
/// stride and end of image for loading column of pixels.
/// @tparam T type of image pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
protected:
int end; ///< index of bottom neightbor of last pixel of column
int stride; ///< increment of pointer to get to next pixel
/// Base class for pixel loader and writer - manages computing start index, /// Initializes pixel IO - sets end index and a position of first pixel.
/// stride and end of image for loading column of pixels. /// @param sizeX width of the image
/// @tparam T type of image pixels /// @param sizeY height of the image
/// @tparam CHECKED true = be prepared to image boundary, false = don't care /// @param firstX x-coordinate of first pixel to use
template <typename T, bool CHECKED> /// @param firstY y-coordinate of first pixel to use
class VerticalDWTPixelIO : protected DWTIO { /// @return index of pixel at position [x, y] in the image
protected: __device__ int initialize(const int sizeX, const int sizeY, int firstX,
int end; ///< index of bottom neightbor of last pixel of column int firstY) {
int stride; ///< increment of pointer to get to next pixel // initialize all pointers and stride
end = CHECKED ? (sizeY * sizeX + firstX) : 0;
stride = sizeX;
return firstX + sizeX * firstY;
}
};
/// Initializes pixel IO - sets end index and a position of first pixel. /// Writes reverse transformed pixels directly into output image.
/// @param sizeX width of the image /// @tparam T type of output pixels
/// @param sizeY height of the image /// @tparam CHECKED true = be prepared to image boundary, false = don't care
/// @param firstX x-coordinate of first pixel to use template <typename T, bool CHECKED>
/// @param firstY y-coordinate of first pixel to use class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
/// @return index of pixel at position [x, y] in the image private:
__device__ int initialize(const int sizeX, const int sizeY, int next; // index of the next pixel to be loaded
int firstX, int firstY) {
// initialize all pointers and stride
end = CHECKED ? (sizeY * sizeX + firstX) : 0;
stride = sizeX;
return firstX + sizeX * firstY;
}
};
public:
/// Initializes writer - sets output buffer and a position of first pixel.
/// Writes reverse transformed pixels directly into output image. /// @param sizeX width of the image
/// @tparam T type of output pixels /// @param sizeY height of the image
/// @tparam CHECKED true = be prepared to image boundary, false = don't care /// @param firstX x-coordinate of first pixel to write into
template <typename T, bool CHECKED> /// @param firstY y-coordinate of first pixel to write into
class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> { __device__ void init(const int sizeX, const int sizeY, int firstX,
private: int firstY) {
int next; // index of the next pixel to be loaded if (firstX < sizeX) {
next = this->initialize(sizeX, sizeY, firstX, firstY);
public: } else {
/// Initializes writer - sets output buffer and a position of first pixel.
/// @param sizeX width of the image
/// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to write into
/// @param firstY y-coordinate of first pixel to write into
__device__ void init(const int sizeX, const int sizeY,
int firstX, int firstY) {
if(firstX < sizeX) {
next = this->initialize(sizeX, sizeY, firstX, firstY);
} else {
this->end = 0;
this->stride = 0;
next = 0;
}
}
/// Writes given value at next position and advances internal pointer while
/// correctly handling mirroring.
/// @param output output image to write pixel into
/// @param value value of the pixel to be written
__device__ void writeInto(T * const output, const T & value) {
if((!CHECKED) || (next != this->end)) {
output[next] = value;
next += this->stride;
}
}
};
/// Loads pixels from input image.
/// @tparam T type of image input pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTPixelLoader
: protected VerticalDWTPixelIO<const T, CHECKED> {
private:
int last; ///< index of last loaded pixel
public:
//******************* FOR TEST **********************
__device__ int getlast(){
return last;
}
__device__ int getend(){
return this->end;
}
__device__ int getstride(){
return this->stride;
}
__device__ void setend(int a){
this->end=a;
}
//******************* FOR TEST **********************
/// Initializes loader - sets input size and a position of first pixel.
/// @param sizeX width of the image
/// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to load
/// @param firstY y-coordinate of first pixel to load
__device__ void init(const int sizeX, const int sizeY,
int firstX, int firstY) {
// correctly mirror x coordinate
this->mirror(firstX, sizeX);
// 'last' always points to already loaded pixel (subtract sizeX = stride)
last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
//last = (FirstX + sizeX * FirstY) - sizeX
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0; this->end = 0;
this->stride = 0; this->stride = 0;
this->last = 0; next = 0;
}
}
/// Writes given value at next position and advances internal pointer while
/// correctly handling mirroring.
/// @param output output image to write pixel into
/// @param value value of the pixel to be written
__device__ void writeInto(T *const output, const T &value) {
if ((!CHECKED) || (next != this->end)) {
output[next] = value;
next += this->stride;
}
}
};
/// Loads pixels from input image.
/// @tparam T type of image input pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
private:
int last; ///< index of last loaded pixel
public:
//******************* FOR TEST **********************
__device__ int getlast() { return last; }
__device__ int getend() { return this->end; }
__device__ int getstride() { return this->stride; }
__device__ void setend(int a) { this->end = a; }
//******************* FOR TEST **********************
/// Initializes loader - sets input size and a position of first pixel.
/// @param sizeX width of the image
/// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to load
/// @param firstY y-coordinate of first pixel to load
__device__ void init(const int sizeX, const int sizeY, int firstX,
int firstY) {
// correctly mirror x coordinate
this->mirror(firstX, sizeX);
// 'last' always points to already loaded pixel (subtract sizeX = stride)
last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
// last = (FirstX + sizeX * FirstY) - sizeX
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0;
this->stride = 0;
this->last = 0;
}
/// Gets another pixel and advancees internal pointer to following one.
/// @param input input image to load next pixel from
/// @return next pixel from given image
__device__ T loadFrom(const T *const input) {
last += this->stride;
if (CHECKED && (last == this->end)) {
last -= 2 * this->stride;
this->stride = -this->stride; // reverse loader's direction
}
// avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
// checked variant later
if (last < 0) {
return 0;
} }
/// Gets another pixel and advancees internal pointer to following one. return input[last];
/// @param input input image to load next pixel from // return this->end;
/// @return next pixel from given image // return last;
__device__ T loadFrom(const T * const input) { // return this->stride;
last += this->stride; }
if(CHECKED && (last == this->end)) { };
last -= 2 * this->stride;
this->stride = -this->stride; // reverse loader's direction
}
// avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
if(last < 0 ) {
return 0;
}
return input[last];
// return this->end;
// return last;
// return this->stride;
}
};
/// Base for band write and loader. Manages computing strides and pointers
/// to first and last pixels in a linearly-stored-bands correct way.
/// @tparam T type of band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
protected:
/// index of bottom neighbor of last pixel of loaded column
int end;
/// increment of index to get from highpass band to the lowpass one
int strideHighToLow;
/// Base for band write and loader. Manages computing strides and pointers /// increment of index to get from the lowpass band to the highpass one
/// to first and last pixels in a linearly-stored-bands correct way. int strideLowToHigh;
/// @tparam T type of band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandIO : protected DWTIO {
protected:
/// index of bottom neighbor of last pixel of loaded column
int end;
/// increment of index to get from highpass band to the lowpass one
int strideHighToLow;
/// increment of index to get from the lowpass band to the highpass one
int strideLowToHigh;
/// Initializes IO - sets size of image and a position of first pixel. /// Initializes IO - sets size of image and a position of first pixel.
/// @param imageSizeX width of the image /// @param imageSizeX width of the image
/// @param imageSizeY height of the image /// @param imageSizeY height of the image
/// @param firstX x-coordinate of first pixel to use /// @param firstX x-coordinate of first pixel to use
/// (Parity determines vertically low or high band.) /// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to use /// @param firstY y-coordinate of first pixel to use
/// (Parity determines horizontally low or high band.) /// (Parity determines horizontally low or high band.)
/// @return index of first item specified by firstX and firstY /// @return index of first item specified by firstX and firstY
__device__ int initialize(const int imageSizeX, const int imageSizeY, __device__ int initialize(const int imageSizeX, const int imageSizeY,
int firstX, int firstY) { int firstX, int firstY) {
// index of first pixel (topmost one) of the column with index firstX // index of first pixel (topmost one) of the column with index firstX
int columnOffset = firstX / 2; int columnOffset = firstX / 2;
// difference between indices of two vertically neighboring pixels
// in the same band
int verticalStride;
// resolve index of first pixel according to horizontal parity
if(firstX & 1) {
// first pixel in one of right bands
verticalStride = imageSizeX / 2;
columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
strideLowToHigh = (imageSizeX * imageSizeY) / 2;
} else {
// first pixel in one of left bands
verticalStride = imageSizeX / 2 + (imageSizeX & 1);
strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
}
// set the other stride
strideHighToLow = verticalStride - strideLowToHigh;
// compute index of coefficient which indicates end of image // difference between indices of two vertically neighboring pixels
if(CHECKED) { // in the same band
end = columnOffset // right column int verticalStride;
+ (imageSizeY / 2) * verticalStride // right row
+ (imageSizeY & 1) * strideLowToHigh; // possibly in high band
} else {
end = 0;
}
// resolve index of first pixel according to horizontal parity
//***********for test************** if (firstX & 1) {
// end = CHECKED; // first pixel in one of right bands
//***********for test************** verticalStride = imageSizeX / 2;
columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
strideLowToHigh = (imageSizeX * imageSizeY) / 2;
// finally, return index of the first item } else {
return columnOffset // right column // first pixel in one of left bands
+ (firstY / 2) * verticalStride // right row verticalStride = imageSizeX / 2 + (imageSizeX & 1);
+ (firstY & 1) * strideLowToHigh; // possibly in high band strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
}
};
/// Directly loads coefficients from four consecutively stored transformed
/// bands.
/// @tparam T type of input band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
private:
int last; ///< index of last loaded pixel
/// Checks internal index and possibly reverses direction of loader.
/// (Handles mirroring at the bottom of the image.)
/// @param input input image to load next coefficient from
/// @param stride stride to use now (one of two loader's strides)
/// @return loaded coefficient
__device__ T updateAndLoad(const T * const input, const int & stride) {
last += stride;
if(CHECKED && (last == this->end)) {
// undo last two updates of index (to get to previous mirrored item)
last -= (this->strideLowToHigh + this->strideHighToLow);
// swap and reverse strides (to move up in the loaded column now)
const int temp = this->strideLowToHigh;
this->strideLowToHigh = -this->strideHighToLow;
this->strideHighToLow = -temp;
}
if(last < 0 ) {
return 0;
}
// avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
return input[last];
}
public:
/// Initializes loader - sets input size and a position of first pixel.
/// @param imageSizeX width of the image
/// @param imageSizeY height of the image
/// @param firstX x-coordinate of first pixel to load
/// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to load
/// (Parity determines horizontally low or high band.)
__device__ void init(const int imageSizeX, const int imageSizeY,
int firstX, const int firstY) {
this->mirror(firstX, imageSizeX);
last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
// adjust to point to previous item
last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0;
this->strideHighToLow = 0;
this->strideLowToHigh = 0;
this->last = 0;
} }
/// Gets another coefficient from lowpass band and advances internal index. // set the other stride
/// Call this method first if position of first pixel passed to init strideHighToLow = verticalStride - strideLowToHigh;
/// was in high band.
/// @param input input image to load next coefficient from // compute index of coefficient which indicates end of image
/// @return next coefficient from the lowpass band of the given image if (CHECKED) {
__device__ T loadLowFrom(const T * const input) { end = columnOffset // right column
return updateAndLoad(input, this->strideHighToLow); + (imageSizeY / 2) * verticalStride // right row
+ (imageSizeY & 1) * strideLowToHigh; // possibly in high band
} else {
end = 0;
} }
/// Gets another coefficient from the highpass band and advances index. //***********for test**************
/// Call this method first if position of first pixel passed to init // end = CHECKED;
/// was in high band. //***********for test**************
/// @param input input image to load next coefficient from
/// @return next coefficient from the highbass band of the given image // finally, return index of the first item
__device__ T loadHighFrom(const T * const input) { return columnOffset // right column
return updateAndLoad(input, this->strideLowToHigh); + (firstY / 2) * verticalStride // right row
+ (firstY & 1) * strideLowToHigh; // possibly in high band
}
};
/// Directly loads coefficients from four consecutively stored transformed
/// bands.
/// @tparam T type of input band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
private:
int last; ///< index of last loaded pixel
/// Checks internal index and possibly reverses direction of loader.
/// (Handles mirroring at the bottom of the image.)
/// @param input input image to load next coefficient from
/// @param stride stride to use now (one of two loader's strides)
/// @return loaded coefficient
__device__ T updateAndLoad(const T *const input, const int &stride) {
last += stride;
if (CHECKED && (last == this->end)) {
// undo last two updates of index (to get to previous mirrored item)
last -= (this->strideLowToHigh + this->strideHighToLow);
// swap and reverse strides (to move up in the loaded column now)
const int temp = this->strideLowToHigh;
this->strideLowToHigh = -this->strideHighToLow;
this->strideHighToLow = -temp;
} }
if (last < 0) {
}; return 0;
/// Directly saves coefficients into four transformed bands.
/// @tparam T type of output band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
private:
int next; ///< index of last loaded pixel
/// Checks internal index and possibly stops the writer.
/// (Handles mirroring at edges of the image.)
/// @param output output buffer
/// @param item item to put into the output
/// @param stride increment of the pointer to get to next output index
__device__ int saveAndUpdate(T * const output, const T & item,
const int & stride) {
// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ //test, Mar 20
if((!CHECKED) || (next != this->end)) {
// if(next == 4) {
// printf(" next: %d stride: %d val: %f \n", next, stride, item );
// }
output[next] = item;
next += stride;
}
// }
// if((!CHECKED) || (next != this->end)) { //the real one
// output[next] = item;
// next += stride; //stride has been test
// }
return next;
} }
public: // avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
// checked variant later
return input[last];
}
/// Initializes writer - sets output size and a position of first pixel. public:
/// @param output output image /// Initializes loader - sets input size and a position of first pixel.
/// @param imageSizeX width of the image /// @param imageSizeX width of the image
/// @param imageSizeY height of the image /// @param imageSizeY height of the image
/// @param firstX x-coordinate of first pixel to write /// @param firstX x-coordinate of first pixel to load
/// (Parity determines vertically low or high band.) /// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to write /// @param firstY y-coordinate of first pixel to load
/// (Parity determines horizontally low or high band.) /// (Parity determines horizontally low or high band.)
__device__ void init(const int imageSizeX, const int imageSizeY, __device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
const int firstX, const int firstY) { const int firstY) {
if (firstX < imageSizeX) { this->mirror(firstX, imageSizeX);
next = this->initialize(imageSizeX, imageSizeY, firstX, firstY); last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
} else {
clear();
}
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0;
this->strideHighToLow = 0;
this->strideLowToHigh = 0;
this->next = 0;
}
/// Writes another coefficient into the band which was specified using // adjust to point to previous item
/// init's firstX and firstY parameters and advances internal pointer. last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
/// Call this method first if position of first pixel passed to init }
/// was in lowpass band.
/// @param output output image
/// @param low lowpass coefficient to save into the lowpass band
__device__ int writeLowInto(T * const output, const T & primary) {
return saveAndUpdate(output, primary, this->strideLowToHigh);
}
/// Writes another coefficient from the other band and advances pointer. /// Sets all fields to zeros, for compiler not to complain about
/// Call this method first if position of first pixel passed to init /// uninitialized stuff.
/// was in highpass band. __device__ void clear() {
/// @param output output image this->end = 0;
/// @param high highpass coefficient to save into the highpass band this->strideHighToLow = 0;
__device__ int writeHighInto(T * const output, const T & other) { this->strideLowToHigh = 0;
return saveAndUpdate(output, other, this->strideHighToLow); this->last = 0;
} }
/// Gets another coefficient from lowpass band and advances internal index.
/// Call this method first if position of first pixel passed to init
/// was in high band.
/// @param input input image to load next coefficient from
/// @return next coefficient from the lowpass band of the given image
__device__ T loadLowFrom(const T *const input) {
return updateAndLoad(input, this->strideHighToLow);
}
/// Gets another coefficient from the highpass band and advances index.
/// Call this method first if position of first pixel passed to init
/// was in high band.
/// @param input input image to load next coefficient from
/// @return next coefficient from the highbass band of the given image
__device__ T loadHighFrom(const T *const input) {
return updateAndLoad(input, this->strideLowToHigh);
}
};
/// Directly saves coefficients into four transformed bands.
/// @tparam T type of output band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
private:
int next; ///< index of last loaded pixel
/// Checks internal index and possibly stops the writer.
/// (Handles mirroring at edges of the image.)
/// @param output output buffer
/// @param item item to put into the output
/// @param stride increment of the pointer to get to next output index
__device__ int saveAndUpdate(T *const output, const T &item,
const int &stride) {
// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
////test, Mar 20
if ((!CHECKED) || (next != this->end)) {
// if(next == 4) {
// printf(" next: %d stride: %d val: %f \n", next, stride, item );
// }
output[next] = item;
next += stride;
}
// }
// if((!CHECKED) || (next != this->end)) { //the real one
// output[next] = item;
// next += stride; //stride has been test
// }
return next;
}
public:
/// Initializes writer - sets output size and a position of first pixel.
/// @param output output image
/// @param imageSizeX width of the image
/// @param imageSizeY height of the image
/// @param firstX x-coordinate of first pixel to write
/// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to write
/// (Parity determines horizontally low or high band.)
__device__ void init(const int imageSizeX, const int imageSizeY,
const int firstX, const int firstY) {
if (firstX < imageSizeX) {
next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
} else {
clear();
}
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0;
this->strideHighToLow = 0;
this->strideLowToHigh = 0;
this->next = 0;
}
/// Writes another coefficient into the band which was specified using
/// init's firstX and firstY parameters and advances internal pointer.
/// Call this method first if position of first pixel passed to init
/// was in lowpass band.
/// @param output output image
/// @param low lowpass coefficient to save into the lowpass band
__device__ int writeLowInto(T *const output, const T &primary) {
return saveAndUpdate(output, primary, this->strideLowToHigh);
}
/// Writes another coefficient from the other band and advances pointer.
/// Call this method first if position of first pixel passed to init
/// was in highpass band.
/// @param output output image
/// @param high highpass coefficient to save into the highpass band
__device__ int writeHighInto(T *const output, const T &other) {
return saveAndUpdate(output, other, this->strideHighToLow);
}
//*******Add three functions to get private values*******
__device__ int getnext() { return next; }
__device__ int getend() { return this->end; }
__device__ int getstrideHighToLow() { return this->strideHighToLow; }
__device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
//*******Add three functions to get private values*******
};
//*******Add three functions to get private values*******
__device__ int getnext(){
return next;
}
__device__ int getend(){
return this->end;
}
__device__ int getstrideHighToLow(){
return this->strideHighToLow;
}
__device__ int getstrideLowToHigh(){
return this->strideLowToHigh;
}
//*******Add three functions to get private values*******
};
} // namespace dwt_cuda } // namespace dwt_cuda
#endif // IO_H
#endif // IO_H

View File

@ -1,4 +1,4 @@
/// ///
/// @file rdwt53.cu /// @file rdwt53.cu
/// @brief CUDA implementation of reverse 5/3 2D DWT. /// @brief CUDA implementation of reverse 5/3 2D DWT.
/// @author Martin Jirman (207962@mail.muni.cz) /// @author Martin Jirman (207962@mail.muni.cz)
@ -7,16 +7,16 @@
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -38,7 +38,7 @@
namespace dwt_cuda { namespace dwt_cuda {
/// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT /// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT
/// using sliding window and lifting schema. /// using sliding window and lifting schema.
@ -46,8 +46,8 @@ namespace dwt_cuda {
/// @tparam WIN_SIZE_Y height of sliding window /// @tparam WIN_SIZE_Y height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y> template <int WIN_SIZE_X, int WIN_SIZE_Y>
class RDWT53 { class RDWT53 {
private: private:
/// Shared memory buffer used for 5/3 DWT transforms. /// Shared memory buffer used for 5/3 DWT transforms.
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer; typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
@ -64,10 +64,10 @@ namespace dwt_cuda {
struct RDWT53Column { struct RDWT53Column {
/// loader of pixels from column in input image /// loader of pixels from column in input image
VerticalDWTBandLoader<int, CHECKED> loader; VerticalDWTBandLoader<int, CHECKED> loader;
/// Offset of corresponding column in shared buffer. /// Offset of corresponding column in shared buffer.
int offset; int offset;
/// Sets all fields to some values to avoid 'uninitialized' warnings. /// Sets all fields to some values to avoid 'uninitialized' warnings.
__device__ void clear() { __device__ void clear() {
offset = 0; offset = 0;
@ -128,7 +128,7 @@ namespace dwt_cuda {
/// @param sizeY height of the input image /// @param sizeY height of the input image
/// @param loader (uninitialized) info about loaded column /// @param loader (uninitialized) info about loaded column
template <bool CHECKED> template <bool CHECKED>
__device__ void initColumn(const int columnX, const int * const input, __device__ void initColumn(const int columnX, const int * const input,
const int sizeX, const int sizeY, const int sizeX, const int sizeY,
RDWT53Column<CHECKED> & column, RDWT53Column<CHECKED> & column,
const int firstY) { const int firstY) {
@ -162,7 +162,7 @@ namespace dwt_cuda {
/// @tparam CHECKED_WRITES true if boundaries must be checked when writing /// @tparam CHECKED_WRITES true if boundaries must be checked when writing
/// @param in input image (5/3 transformed coefficients) /// @param in input image (5/3 transformed coefficients)
/// @param out output buffer (for reverse transformed image) /// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image /// @param sizeX width of the output image
/// @param sizeY height of the output image /// @param sizeY height of the output image
/// @param winSteps number of sliding window steps /// @param winSteps number of sliding window steps
template<bool CHECKED_LOADS, bool CHECKED_WRITES> template<bool CHECKED_LOADS, bool CHECKED_WRITES>
@ -182,7 +182,7 @@ namespace dwt_cuda {
// column #0, thread #1 get right column #1 and thread #2 left column. // column #0, thread #1 get right column #1 and thread #2 left column.
const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3); const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
// Thread initializes offset of the boundary column (in shared // Thread initializes offset of the boundary column (in shared
// buffer), first 3 pixels of the column and a loader for this column. // buffer), first 3 pixels of the column and a loader for this column.
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY); initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
} }
@ -216,8 +216,8 @@ namespace dwt_cuda {
// horizontally transform all newly loaded lines // horizontally transform all newly loaded lines
horizontalTransform(WIN_SIZE_Y, 3); horizontalTransform(WIN_SIZE_Y, 3);
// Using 3 registers, remember current values of last 3 rows // Using 3 registers, remember current values of last 3 rows
// of transform buffer. These rows are transformed horizontally // of transform buffer. These rows are transformed horizontally
// only and will be used in next iteration. // only and will be used in next iteration.
int last3Lines[3]; int last3Lines[3];
last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE]; last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
@ -253,7 +253,7 @@ namespace dwt_cuda {
/// Main GPU 5/3 RDWT entry point. /// Main GPU 5/3 RDWT entry point.
/// @param in input image (5/3 transformed coefficients) /// @param in input image (5/3 transformed coefficients)
/// @param out output buffer (for reverse transformed image) /// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image /// @param sizeX width of the output image
/// @param sizeY height of the output image /// @param sizeY height of the output image
/// @param winSteps number of sliding window steps /// @param winSteps number of sliding window steps
__device__ static void run(const int * const input, int * const output, __device__ static void run(const int * const input, int * const output,
@ -284,13 +284,13 @@ namespace dwt_cuda {
} }
}; // end of class RDWT53 }; // end of class RDWT53
/// Main GPU 5/3 RDWT entry point. /// Main GPU 5/3 RDWT entry point.
/// @param in input image (5/3 transformed coefficients) /// @param in input image (5/3 transformed coefficients)
/// @param out output buffer (for reverse transformed image) /// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image /// @param sizeX width of the output image
/// @param sizeY height of the output image /// @param sizeY height of the output image
/// @param winSteps number of sliding window steps /// @param winSteps number of sliding window steps
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
@ -299,34 +299,34 @@ namespace dwt_cuda {
const int sx, const int sy, const int steps) { const int sx, const int sy, const int steps) {
RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps); RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
} }
/// Only computes optimal number of sliding window steps, /// Only computes optimal number of sliding window steps,
/// number of threadblocks and then lanches the 5/3 RDWT kernel. /// number of threadblocks and then lanches the 5/3 RDWT kernel.
/// @tparam WIN_SX width of sliding window /// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window /// @tparam WIN_SY height of sliding window
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sx width of the input image /// @param sx width of the input image
/// @param sy height of the input image /// @param sy height of the input image
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) { void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
// compute optimal number of steps of each sliding window // compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY); const int steps = divRndUp(sy, 15 * WIN_SY);
// prepare grid size // prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
// finally transform this level // finally transform this level
PERF_BEGIN PERF_BEGIN
rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps); rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
PERF_END(" RDWT53", sx, sy) PERF_END(" RDWT53", sx, sy)
CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel"); CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
} }
/// Reverse 5/3 2D DWT. See common rules (above) for more details. /// Reverse 5/3 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Format described in common rules. /// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten). /// Will not be preserved (will be overwritten).
@ -341,11 +341,11 @@ namespace dwt_cuda {
const int llSizeX = divRndUp(sizeX, 2); const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2); const int llSizeY = divRndUp(sizeY, 2);
rdwt53(in, out, llSizeX, llSizeY, levels - 1); rdwt53(in, out, llSizeX, llSizeY, levels - 1);
// copy reverse transformed LL band from output back into the input // copy reverse transformed LL band from output back into the input
memCopy(in, out, llSizeX, llSizeY); memCopy(in, out, llSizeX, llSizeY);
} }
// select right width of kernel for the size of the image // select right width of kernel for the size of the image
if(sizeX >= 960) { if(sizeX >= 960) {
launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY); launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
@ -355,6 +355,6 @@ namespace dwt_cuda {
launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY); launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
} }
} }
} // end of namespace dwt_cuda } // end of namespace dwt_cuda

View File

@ -1,4 +1,4 @@
/// ///
/// @file rdwt97.cu /// @file rdwt97.cu
/// @brief CUDA implementation of reverse 9/7 2D DWT. /// @brief CUDA implementation of reverse 9/7 2D DWT.
/// @author Martin Jirman (207962@mail.muni.cz) /// @author Martin Jirman (207962@mail.muni.cz)
@ -7,16 +7,16 @@
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -38,7 +38,7 @@
namespace dwt_cuda { namespace dwt_cuda {
/// Wraps shared memory buffer and methods for computing 9/7 RDWT using /// Wraps shared memory buffer and methods for computing 9/7 RDWT using
/// lifting schema and sliding window. /// lifting schema and sliding window.
/// @tparam WIN_SIZE_X width of the sliding window /// @tparam WIN_SIZE_X width of the sliding window
@ -46,7 +46,7 @@ namespace dwt_cuda {
template <int WIN_SIZE_X, int WIN_SIZE_Y> template <int WIN_SIZE_X, int WIN_SIZE_Y>
class RDWT97 { class RDWT97 {
private: private:
/// Info related to loading of one input column. /// Info related to loading of one input column.
/// @tparam CHECKED true if boundary chould be checked, /// @tparam CHECKED true if boundary chould be checked,
/// false if there is no near boudnary /// false if there is no near boudnary
@ -54,10 +54,10 @@ namespace dwt_cuda {
struct RDWT97Column { struct RDWT97Column {
/// laoder of input pxels for given column. /// laoder of input pxels for given column.
VerticalDWTBandLoader<float, CHECKED> loader; VerticalDWTBandLoader<float, CHECKED> loader;
/// Offset of loaded column in shared memory buffer. /// Offset of loaded column in shared memory buffer.
int offset; int offset;
/// Sets all fields to some values to avoid 'uninitialized' warnings. /// Sets all fields to some values to avoid 'uninitialized' warnings.
__device__ void clear() { __device__ void clear() {
loader.clear(); loader.clear();
@ -104,7 +104,7 @@ namespace dwt_cuda {
/// @param column (uninitialized) info about loading one column /// @param column (uninitialized) info about loading one column
/// @param firstY index of first image row to be transformed /// @param firstY index of first image row to be transformed
template <bool CHECKED> template <bool CHECKED>
__device__ void initColumn(const int colIndex, const float * const input, __device__ void initColumn(const int colIndex, const float * const input,
const int sizeX, const int sizeY, const int sizeX, const int sizeY,
RDWT97Column<CHECKED> & column, RDWT97Column<CHECKED> & column,
const int firstY) { const int firstY) {
@ -124,7 +124,7 @@ namespace dwt_cuda {
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input); buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
buffer[column.offset + 5 * STRIDE] = buffer[column.offset + 5 * STRIDE] =
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input); buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 6 * STRIDE] = buffer[column.offset + 6 * STRIDE] =
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input); buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
} else { } else {
// non-topmost row - regular loading: // non-topmost row - regular loading:
@ -162,7 +162,7 @@ namespace dwt_cuda {
/// when writing into output buffer /// when writing into output buffer
/// @param in input image (9/7 transformed coefficients) /// @param in input image (9/7 transformed coefficients)
/// @param out output buffer (for reverse transformed image) /// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image /// @param sizeX width of the output image
/// @param sizeY height of the output image /// @param sizeY height of the output image
/// @param winSteps number of steps of sliding window /// @param winSteps number of steps of sliding window
template <bool CHECKED_LOADS, bool CHECKED_WRITES> template <bool CHECKED_LOADS, bool CHECKED_WRITES>
@ -182,7 +182,7 @@ namespace dwt_cuda {
// each thread among first 7 ones gets index of one of boundary columns // each thread among first 7 ones gets index of one of boundary columns
const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7); const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
// Thread initializes offset of the boundary column (in shared // Thread initializes offset of the boundary column (in shared
// buffer), first 7 pixels of the column and a loader for this column. // buffer), first 7 pixels of the column and a loader for this column.
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY); initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
} }
@ -201,7 +201,7 @@ namespace dwt_cuda {
// offset of column (in transform buffer) saved by this thread // offset of column (in transform buffer) saved by this thread
const int outColumnOffset = buffer.getColumnOffset(threadIdx.x); const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
// (Each iteration assumes that first 7 rows of transform buffer are // (Each iteration assumes that first 7 rows of transform buffer are
// already loaded with horizontally transformed pixels.) // already loaded with horizontally transformed pixels.)
for(int w = 0; w < winSteps; w++) { for(int w = 0; w < winSteps; w++) {
// Load another WIN_SIZE_Y lines of this thread's column // Load another WIN_SIZE_Y lines of this thread's column
@ -216,8 +216,8 @@ namespace dwt_cuda {
// horizontally transform all newly loaded lines // horizontally transform all newly loaded lines
horizontalRDWT97(WIN_SIZE_Y, 7); horizontalRDWT97(WIN_SIZE_Y, 7);
// Using 7 registers, remember current values of last 7 rows // Using 7 registers, remember current values of last 7 rows
// of transform buffer. These rows are transformed horizontally // of transform buffer. These rows are transformed horizontally
// only and will be used in next iteration. // only and will be used in next iteration.
float last7Lines[7]; float last7Lines[7];
for(int i = 0; i < 7; i++) { for(int i = 0; i < 7; i++) {
@ -257,13 +257,13 @@ namespace dwt_cuda {
/// Main GPU 9/7 RDWT entry point. /// Main GPU 9/7 RDWT entry point.
/// @param in input image (9/7 transformed coefficients) /// @param in input image (9/7 transformed coefficients)
/// @param out output buffer (for reverse transformed image) /// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image /// @param sizeX width of the output image
/// @param sizeY height of the output image /// @param sizeY height of the output image
__device__ static void run(const float * const input, float * const output, __device__ static void run(const float * const input, float * const output,
const int sx, const int sy, const int steps) { const int sx, const int sy, const int steps) {
// prepare instance with buffer in shared memory // prepare instance with buffer in shared memory
__shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97; __shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;
// Compute limits of this threadblock's block of pixels and use them to // Compute limits of this threadblock's block of pixels and use them to
// determine, whether this threadblock will have to deal with boundary. // determine, whether this threadblock will have to deal with boundary.
// (3 in next expressions is for radius of impulse response of 9/7 RDWT.) // (3 in next expressions is for radius of impulse response of 9/7 RDWT.)
@ -285,15 +285,15 @@ namespace dwt_cuda {
rdwt97.transform<false, false>(input, output, sx, sy, steps); rdwt97.transform<false, false>(input, output, sx, sy, steps);
} }
} }
}; // end of class RDWT97 }; // end of class RDWT97
/// Main GPU 9/7 RDWT entry point. /// Main GPU 9/7 RDWT entry point.
/// @param in input image (9/7 transformed coefficients) /// @param in input image (9/7 transformed coefficients)
/// @param out output buffer (for reverse transformed image) /// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image /// @param sizeX width of the output image
/// @param sizeY height of the output image /// @param sizeY height of the output image
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8)) __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
@ -301,34 +301,34 @@ namespace dwt_cuda {
const int sx, const int sy, const int steps) { const int sx, const int sy, const int steps) {
RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps); RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
} }
/// Only computes optimal number of sliding window steps, /// Only computes optimal number of sliding window steps,
/// number of threadblocks and then lanches the 9/7 RDWT kernel. /// number of threadblocks and then lanches the 9/7 RDWT kernel.
/// @tparam WIN_SX width of sliding window /// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window /// @tparam WIN_SY height of sliding window
/// @param in input image /// @param in input image
/// @param out output buffer /// @param out output buffer
/// @param sx width of the input image /// @param sx width of the input image
/// @param sy height of the input image /// @param sy height of the input image
template <int WIN_SX, int WIN_SY> template <int WIN_SX, int WIN_SY>
void launchRDWT97Kernel (float * in, float * out, int sx, int sy) { void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
// compute optimal number of steps of each sliding window // compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY); const int steps = divRndUp(sy, 15 * WIN_SY);
// prepare grid size // prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps)); dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
// finally launch kernel // finally launch kernel
PERF_BEGIN PERF_BEGIN
rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps); rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
PERF_END(" RDWT97", sx, sy) PERF_END(" RDWT97", sx, sy)
CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel"); CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
} }
/// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details. /// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
/// @param in Input DWT coefficients. Format described in common rules. /// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten). /// Will not be preserved (will be overwritten).
@ -343,11 +343,11 @@ namespace dwt_cuda {
const int llSizeX = divRndUp(sizeX, 2); const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2); const int llSizeY = divRndUp(sizeY, 2);
rdwt97(in, out, llSizeX, llSizeY, levels - 1); rdwt97(in, out, llSizeX, llSizeY, levels - 1);
// copy reverse transformed LL band from output back into the input // copy reverse transformed LL band from output back into the input
memCopy(in, out, llSizeX, llSizeY); memCopy(in, out, llSizeX, llSizeY);
} }
// select right width of kernel for the size of the image // select right width of kernel for the size of the image
if(sizeX >= 960) { if(sizeX >= 960) {
launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY); launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
@ -357,7 +357,7 @@ namespace dwt_cuda {
launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY); launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
} }
} }
} // end of namespace dwt_cuda } // end of namespace dwt_cuda

599
examples/dwt2d/dwt_cuda/transform_buffer.h Executable file → Normal file
View File

@ -7,16 +7,16 @@
/// ///
/// Copyright (c) 2011 Martin Jirman /// Copyright (c) 2011 Martin Jirman
/// All rights reserved. /// All rights reserved.
/// ///
/// Redistribution and use in source and binary forms, with or without /// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met: /// modification, are permitted provided that the following conditions are met:
/// ///
/// * Redistributions of source code must retain the above copyright /// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer. /// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright /// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the /// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution. /// documentation and/or other materials provided with the distribution.
/// ///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" /// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE /// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE /// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -30,344 +30,309 @@
/// POSSIBILITY OF SUCH DAMAGE. /// POSSIBILITY OF SUCH DAMAGE.
/// ///
#ifndef TRANSFORM_BUFFER_H #ifndef TRANSFORM_BUFFER_H
#define TRANSFORM_BUFFER_H #define TRANSFORM_BUFFER_H
namespace dwt_cuda { namespace dwt_cuda {
/// Buffer (in shared memory of GPU) where block of input image is stored,
/// but odd and even lines are separated. (Generates less bank conflicts when
/// using lifting schema.) All operations expect SIZE_X threads.
/// Also implements basic building blocks of lifting schema.
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
/// a number of threads participating on all operations.)
/// Must be divisible by 4.
/// @tparam SIZE_Y height of buffer (total number of lines)
/// @tparam BOUNDARY_X number of extra pixels at the left and right side
/// boundary is expected to be smaller than half SIZE_X
/// Must be divisible by 2.
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
class TransformBuffer {
public:
enum {
/// difference between pointers to two vertical neigbors
VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
};
private:
enum {
/// number of shared memory banks - needed for correct padding
#ifdef __CUDA_ARCH__
SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
#else
SHM_BANKS = 16, // for host code only - can be anything, won't be used
#endif
/// size of one of two buffers (odd or even)
BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
/// unused space between two buffers
PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
/// offset of the odd columns buffer from the beginning of data buffer
ODD_OFFSET = BUFFER_SIZE + PADDING,
};
/// buffer for both even and odd columns /// Buffer (in shared memory of GPU) where block of input image is stored,
T data[2 * BUFFER_SIZE + PADDING]; /// but odd and even lines are separated. (Generates less bank conflicts when
/// using lifting schema.) All operations expect SIZE_X threads.
/// Also implements basic building blocks of lifting schema.
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
/// Applies specified function to all central elements while also passing /// a number of threads participating on all operations.)
/// previous and next elements as parameters. /// Must be divisible by 4.
/// @param count count of central elements to apply function to /// @tparam SIZE_Y height of buffer (total number of lines)
/// @param prevOffset offset of first central element /// @tparam BOUNDARY_X number of extra pixels at the left and right side
/// @param midOffset offset of first central element's predecessor /// boundary is expected to be smaller than half SIZE_X
/// @param nextOffset offset of first central element's successor /// Must be divisible by 2.
/// @param function the function itself template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
template <typename FUNC> class TransformBuffer {
__device__ void horizontalStep(const int count, const int prevOffset, public:
const int midOffset, const int nextOffset, enum {
const FUNC & function) { /// difference between pointers to two vertical neigbors
// number of unchecked iterations VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
const int STEPS = count / SIZE_X; };
// items remaining after last unchecked iteration private:
const int finalCount = count % SIZE_X; enum {
/// number of shared memory banks - needed for correct padding
// offset of items processed in last (checked) iteration #ifdef __CUDA_ARCH__
const int finalOffset = count - finalCount; SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
#else
// all threads perform fixed number of iterations ... SHM_BANKS = 16, // for host code only - can be anything, won't be used
for(int i = 0; i < STEPS; i++) { #endif
/// size of one of two buffers (odd or even)
BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
/// unused space between two buffers
PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
/// offset of the odd columns buffer from the beginning of data buffer
ODD_OFFSET = BUFFER_SIZE + PADDING,
};
/// buffer for both even and odd columns
T data[2 * BUFFER_SIZE + PADDING];
/// Applies specified function to all central elements while also passing
/// previous and next elements as parameters.
/// @param count count of central elements to apply function to
/// @param prevOffset offset of first central element
/// @param midOffset offset of first central element's predecessor
/// @param nextOffset offset of first central element's successor
/// @param function the function itself
template <typename FUNC>
__device__ void horizontalStep(const int count, const int prevOffset,
const int midOffset, const int nextOffset,
const FUNC &function) {
// number of unchecked iterations
const int STEPS = count / SIZE_X;
// items remaining after last unchecked iteration
const int finalCount = count % SIZE_X;
// offset of items processed in last (checked) iteration
const int finalOffset = count - finalCount;
// all threads perform fixed number of iterations ...
for (int i = 0; i < STEPS; i++) {
// for(int i = 0; i < 3; i++) { // for(int i = 0; i < 3; i++) {
const T previous = data[prevOffset + i * SIZE_X + threadIdx.x]; const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
const T next = data[nextOffset + i * SIZE_X + threadIdx.x]; const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
T & center = data[midOffset + i * SIZE_X + threadIdx.x]; T &center = data[midOffset + i * SIZE_X + threadIdx.x];
// function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x)); // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
function(previous, center, next);// the real one function(previous, center, next); // the real one
}
// ... but not all threads participate on final iteration
if(threadIdx.x < finalCount) {
const T previous = data[prevOffset + finalOffset + threadIdx.x];
const T next = data[nextOffset + finalOffset + threadIdx.x];
T & center = data[midOffset + finalOffset + threadIdx.x];
// function(previous, center, (nextOffset+finalOffset+threadIdx.x));
// kaixi
function(previous, center, next);//the real one
}
} }
public: // ... but not all threads participate on final iteration
if (threadIdx.x < finalCount) {
__device__ void getPrintData() { const T previous = data[prevOffset + finalOffset + threadIdx.x];
// const T next = data[nextOffset + finalOffset + threadIdx.x];
for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) { T &center = data[midOffset + finalOffset + threadIdx.x];
printf(" index: %d data: %f \n ", i ,data[i]); // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
} // kaixi
function(previous, center, next); // the real one
}
/// Gets offset of the column with given index. Central columns have
/// indices from 0 to NUM_LINES - 1, left boundary columns have negative
/// indices and right boundary columns indices start with NUM_LINES.
/// @param columnIndex index of column to get pointer to
/// @return offset of the first item of column with specified index
__device__ int getColumnOffset(int columnIndex) {
columnIndex += BOUNDARY_X; // skip boundary
return columnIndex / 2 // select right column
+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
} }
}
/// Provides access to data of the transform buffer.
/// @param index index of the item to work with
/// @return reference to item at given index
__device__ T & operator[] (const int index) {
return data[index];
}
/// Applies specified function to all horizontally even elements in
/// specified lines. (Including even elements in boundaries except
/// first even element in first left boundary.) SIZE_X threads participate
/// and synchronization is needed before result can be used.
/// @param firstLine index of first line
/// @param numLines count of lines
/// @param func function to be applied on all even elements
/// parameters: previous (odd) element, the even
/// element itself and finally next (odd) element
template <typename FUNC>
__device__ void forEachHorizontalEven(const int firstLine,
const int numLines,
const FUNC & func) {
// number of even elemens to apply function to
const int count = numLines * VERTICAL_STRIDE - 1;
// offset of first even element
const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
// offset of odd predecessor of first even element
const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
// offset of odd successor of first even element
const int nextOffset = prevOffset + 1;
// if(threadIdx.x == 0) { public:
__device__ void getPrintData() {
//
for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
printf(" index: %d data: %f \n ", i, data[i]);
}
}
// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); /// Gets offset of the column with given index. Central columns have
// } /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
/// indices and right boundary columns indices start with NUM_LINES.
// call generic horizontal step function /// @param columnIndex index of column to get pointer to
horizontalStep(count, prevOffset, centerOffset, nextOffset, func); /// @return offset of the first item of column with specified index
} __device__ int getColumnOffset(int columnIndex) {
columnIndex += BOUNDARY_X; // skip boundary
return columnIndex / 2 // select right column
/// Applies given function to all horizontally odd elements in specified + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
/// lines. (Including odd elements in boundaries except last odd element }
/// in last right boundary.) SIZE_X threads participate and synchronization
/// is needed before result can be used.
/// @param firstLine index of first line
/// @param numLines count of lines
/// @param func function to be applied on all odd elements
/// parameters: previous (even) element, the odd
/// element itself and finally next (even) element
template <typename FUNC>
__device__ void forEachHorizontalOdd(const int firstLine,
const int numLines,
const FUNC & func) {
// numbet of odd elements to apply function to
const int count = numLines * VERTICAL_STRIDE - 1;
// offset of even predecessor of first odd element
const int prevOffset = firstLine * VERTICAL_STRIDE;
// offset of first odd element
const int centerOffset = prevOffset + ODD_OFFSET;
// offset of even successor of first odd element
const int nextOffset = prevOffset + 1;
// if(threadIdx.x == 0) { /// Provides access to data of the transform buffer.
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); /// @param index index of the item to work with
// } /// @return reference to item at given index
__device__ T &operator[](const int index) { return data[index]; }
// call generic horizontal step function /// Applies specified function to all horizontally even elements in
horizontalStep(count, prevOffset, centerOffset, nextOffset, func); /// specified lines. (Including even elements in boundaries except
} /// first even element in first left boundary.) SIZE_X threads participate
/// and synchronization is needed before result can be used.
/// @param firstLine index of first line
/// Applies specified function to all even elements (except element #0) /// @param numLines count of lines
/// of given column. Each thread takes care of one column, so there's /// @param func function to be applied on all even elements
/// no need for synchronization. /// parameters: previous (odd) element, the even
/// @param columnOffset offset of thread's column /// element itself and finally next (odd) element
/// @param f function to be applied on all even elements template <typename FUNC>
/// parameters: previous (odd) element, the even __device__ void forEachHorizontalEven(const int firstLine, const int numLines,
/// element itself and finally next (odd) element const FUNC &func) {
template <typename F> // number of even elemens to apply function to
__device__ void forEachVerticalEven(const int columnOffset, const F & f) { const int count = numLines * VERTICAL_STRIDE - 1;
if(SIZE_Y > 3) { // makes no sense otherwise // offset of first even element
const int steps = SIZE_Y / 2 - 1; const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
for(int i = 0; i < steps; i++) { // offset of odd predecessor of first even element
const int row = 2 + i * 2; const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; // offset of odd successor of first even element
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; const int nextOffset = prevOffset + 1;
f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next);
// if(threadIdx.x == 0) {
//--------------- FOR TEST -----------------
/* __syncthreads(); // printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ // nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
diffOut[2500]++; // }
diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
} // call generic horizontal step function
__syncthreads(); horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
*/ //--------------- FOR TEST ----------------- }
/// Applies given function to all horizontally odd elements in specified
} /// lines. (Including odd elements in boundaries except last odd element
} /// in last right boundary.) SIZE_X threads participate and synchronization
} /// is needed before result can be used.
/// @param firstLine index of first line
/// @param numLines count of lines
/// Applies specified function to all odd elements of given column. /// @param func function to be applied on all odd elements
/// Each thread takes care of one column, so there's no need for /// parameters: previous (even) element, the odd
/// synchronization. /// element itself and finally next (even) element
/// @param columnOffset offset of thread's column template <typename FUNC>
/// @param f function to be applied on all odd elements __device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
/// parameters: previous (even) element, the odd const FUNC &func) {
/// element itself and finally next (even) element // numbet of odd elements to apply function to
template <typename F> const int count = numLines * VERTICAL_STRIDE - 1;
__device__ void forEachVerticalOdd(const int columnOffset, const F & f) { // offset of even predecessor of first odd element
const int steps = (SIZE_Y - 1) / 2; const int prevOffset = firstLine * VERTICAL_STRIDE;
for(int i = 0; i < steps; i++) { // offset of first odd element
const int row = i * 2 + 1; const int centerOffset = prevOffset + ODD_OFFSET;
// offset of even successor of first odd element
const int nextOffset = prevOffset + 1;
// if(threadIdx.x == 0) {
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d
// nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
// }
// call generic horizontal step function
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
}
/// Applies specified function to all even elements (except element #0)
/// of given column. Each thread takes care of one column, so there's
/// no need for synchronization.
/// @param columnOffset offset of thread's column
/// @param f function to be applied on all even elements
/// parameters: previous (odd) element, the even
/// element itself and finally next (odd) element
template <typename F>
__device__ void forEachVerticalEven(const int columnOffset, const F &f) {
if (SIZE_Y > 3) { // makes no sense otherwise
const int steps = SIZE_Y / 2 - 1;
for (int i = 0; i < steps; i++) {
const int row = 2 + i * 2;
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE]; const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE]; const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); //--------------- FOR TEST -----------------
/* __syncthreads();
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
//--------------- FOR TEST ----------------- diffOut[2500]++;
/* __syncthreads(); diffOut[diffOut[2500]] = 2;//data[columnOffset +
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ row * VERTICAL_STRIDE];
diffOut[2500]++; }
diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE]; __syncthreads();
} */ //--------------- FOR TEST -----------------
__syncthreads();
*/ //--------------- FOR TEST -----------------
} }
} }
}
/// Scales elements at specified lines.
/// @param evenScale scaling factor for horizontally even elements
/// @param oddScale scaling factor for horizontally odd elements
/// @param numLines number of lines, whose elements should be scaled
/// @param firstLine index of first line to scale elements in
__device__ void scaleHorizontal(const T evenScale, const T oddScale,
const int firstLine, const int numLines) {
const int offset = firstLine * VERTICAL_STRIDE;
const int count = numLines * VERTICAL_STRIDE;
const int steps = count / SIZE_X;
const int finalCount = count % SIZE_X;
const int finalOffset = count - finalCount;
// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset); /// Applies specified function to all odd elements of given column.
/// Each thread takes care of one column, so there's no need for
// run iterations, whete all threads participate /// synchronization.
for(int i = 0; i < steps; i++) { /// @param columnOffset offset of thread's column
data[threadIdx.x + i * SIZE_X + offset] *= evenScale; /// @param f function to be applied on all odd elements
// if(threadIdx.x + i * SIZE_X + offset == 531) { /// parameters: previous (even) element, the odd
// printf("threadidx 531: %d \n", threadIdx.x); /// element itself and finally next (even) element
// } template <typename F>
// if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) { __device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
// printf("threadidx 531: %d \n", threadIdx.x); const int steps = (SIZE_Y - 1) / 2;
// } for (int i = 0; i < steps; i++) {
data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale; const int row = i * 2 + 1;
} const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
// some threads also finish remaining unscaled items
if(threadIdx.x < finalCount) {
data[threadIdx.x + finalOffset + offset] *= evenScale;
// if(threadIdx.x + finalOffset + offset == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
// if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
}
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
//--------------- FOR TEST -----------------
/* __syncthreads();
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
diffOut[2500]++;
diffOut[diffOut[2500]] = 1; //data[columnOffset +
row * VERTICAL_STRIDE];
}
__syncthreads();
*/ //--------------- FOR TEST -----------------
} }
}
/// Scales elements in specified column. /// Scales elements at specified lines.
/// @param evenScale scaling factor for vertically even elements /// @param evenScale scaling factor for horizontally even elements
/// @param oddScale scaling factor for vertically odd elements /// @param oddScale scaling factor for horizontally odd elements
/// @param columnOffset offset of the column to work with /// @param numLines number of lines, whose elements should be scaled
/// @param numLines number of lines, whose elements should be scaled /// @param firstLine index of first line to scale elements in
/// @param firstLine index of first line to scale elements in __device__ void scaleHorizontal(const T evenScale, const T oddScale,
__device__ void scaleVertical(const T evenScale, const T oddScale, const int firstLine, const int numLines) {
const int columnOffset, const int numLines, const int offset = firstLine * VERTICAL_STRIDE;
const int firstLine) { const int count = numLines * VERTICAL_STRIDE;
for(int i = firstLine; i < (numLines + firstLine); i++) { const int steps = count / SIZE_X;
if(i & 1) { const int finalCount = count % SIZE_X;
data[columnOffset + i * VERTICAL_STRIDE] *= oddScale; const int finalOffset = count - finalCount;
} else {
data[columnOffset + i * VERTICAL_STRIDE] *= evenScale; // printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d,
} // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
// finalCount, finalOffset);
// run iterations, whete all threads participate
for (int i = 0; i < steps; i++) {
data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
// if(threadIdx.x + i * SIZE_X + offset == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
// if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
}
// some threads also finish remaining unscaled items
if (threadIdx.x < finalCount) {
data[threadIdx.x + finalOffset + offset] *= evenScale;
// if(threadIdx.x + finalOffset + offset == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
// if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
}
}
/// Scales elements in specified column.
/// @param evenScale scaling factor for vertically even elements
/// @param oddScale scaling factor for vertically odd elements
/// @param columnOffset offset of the column to work with
/// @param numLines number of lines, whose elements should be scaled
/// @param firstLine index of first line to scale elements in
__device__ void scaleVertical(const T evenScale, const T oddScale,
const int columnOffset, const int numLines,
const int firstLine) {
for (int i = firstLine; i < (numLines + firstLine); i++) {
if (i & 1) {
data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
} else {
data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
} }
} }
}
//****************For Test(Feb23), test inter parameters*************
__device__ int getVERTICAL_STRIDE(){
return VERTICAL_STRIDE;
}
__device__ int getSHM_BANKS(){
return SHM_BANKS;
}
__device__ int getBuffersize(){
return BUFFER_SIZE;
}
__device__ int getPADDING(){
return PADDING;
}
__device__ int getODD_OFFSET(){
return ODD_OFFSET;
}
//****************For Test(Feb23), test inter parameters*************
__device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
__device__ int getSHM_BANKS() { return SHM_BANKS; }
__device__ int getBuffersize() { return BUFFER_SIZE; }
__device__ int getPADDING() { return PADDING; }
__device__ int getODD_OFFSET() { return ODD_OFFSET; }
//****************For Test(Feb23), test inter parameters************* //****************For Test(Feb23), test inter parameters*************
}; // end of class TransformBuffer
}; // end of class TransformBuffer
} // namespace dwt_cuda } // namespace dwt_cuda
#endif // TRANSFORM_BUFFER_H
#endif // TRANSFORM_BUFFER_H

View File

@ -1,16 +1,16 @@
/* /*
* Copyright (c) 2009, Jiri Matela * Copyright (c) 2009, Jiri Matela
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
* *
* * Redistributions of source code must retain the above copyright * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright * * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the * notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution. * documentation and/or other materials provided with the distribution.
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ -54,7 +54,7 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
// printf("Loading ipnput: %s\n", srcFilename); // printf("Loading ipnput: %s\n", srcFilename);
char *path = "../../data/dwt2d/"; char *path = "../../data/dwt2d/";
char *newSrc = NULL; char *newSrc = NULL;
if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL) if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
{ {
newSrc[0] = '\0'; newSrc[0] = '\0';
@ -67,7 +67,7 @@ int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
//srcFilename = strcat("../../data/dwt2d/",srcFilename); //srcFilename = strcat("../../data/dwt2d/",srcFilename);
//read image //read image
int i = open(srcFilename, O_RDONLY, 0644); int i = open(srcFilename, O_RDONLY, 0644);
if (i == -1) { if (i == -1) {
error(0,errno,"cannot access %s", srcFilename); error(0,errno,"cannot access %s", srcFilename);
return -1; return -1;
} }
@ -97,18 +97,18 @@ template <typename T>
void processDWT(struct dwt *d, int forward, int writeVisual) void processDWT(struct dwt *d, int forward, int writeVisual)
{ {
int componentSize = d->pixWidth*d->pixHeight*sizeof(T); int componentSize = d->pixWidth*d->pixHeight*sizeof(T);
T *c_r_out, *backup ; T *c_r_out, *backup ;
cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size
cudaCheckError("Alloc device memory"); cudaCheckError("Alloc device memory");
cudaMemset(c_r_out, 0, componentSize); cudaMemset(c_r_out, 0, componentSize);
cudaCheckError("Memset device memory"); cudaCheckError("Memset device memory");
cudaMalloc((void**)&backup, componentSize); //< aligned component size cudaMalloc((void**)&backup, componentSize); //< aligned component size
cudaCheckError("Alloc device memory"); cudaCheckError("Alloc device memory");
cudaMemset(backup, 0, componentSize); cudaMemset(backup, 0, componentSize);
cudaCheckError("Memset device memory"); cudaCheckError("Memset device memory");
if (d->components == 3) { if (d->components == 3) {
/* Alloc two more buffers for G and B */ /* Alloc two more buffers for G and B */
T *c_g_out, *c_b_out; T *c_g_out, *c_b_out;
@ -116,12 +116,12 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
cudaCheckError("Alloc device memory"); cudaCheckError("Alloc device memory");
cudaMemset(c_g_out, 0, componentSize); cudaMemset(c_g_out, 0, componentSize);
cudaCheckError("Memset device memory"); cudaCheckError("Memset device memory");
cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size
cudaCheckError("Alloc device memory"); cudaCheckError("Alloc device memory");
cudaMemset(c_b_out, 0, componentSize); cudaMemset(c_b_out, 0, componentSize);
cudaCheckError("Memset device memory"); cudaCheckError("Memset device memory");
/* Load components */ /* Load components */
T *c_r, *c_g, *c_b; T *c_r, *c_g, *c_b;
cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size
@ -140,13 +140,13 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
cudaCheckError("Memset device memory"); cudaCheckError("Memset device memory");
rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight); rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);
/* Compute DWT and always store into file */ /* Compute DWT and always store into file */
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
// -------test---------- // -------test----------
// T *h_r_out=(T*)malloc(componentSize); // T *h_r_out=(T*)malloc(componentSize);
// cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost); // cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost);
@ -156,13 +156,13 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
// if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n"); // if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
// } // }
// -------test---------- // -------test----------
/* Store DWT to file */ /* Store DWT to file */
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r"); writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
// writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g"); // writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
// writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b"); // writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
#ifdef OUTPUT #ifdef OUTPUT
if (writeVisual) { if (writeVisual) {
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r"); writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g"); writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
@ -186,7 +186,7 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
cudaFree(c_b_out); cudaFree(c_b_out);
cudaCheckError("Cuda free"); cudaCheckError("Cuda free");
} }
else if (d->components == 1) { else if (d->components == 1) {
//Load component //Load component
T *c_r; T *c_r;
@ -197,11 +197,11 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight); bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);
// Compute DWT // Compute DWT
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward); nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
// Store DWT to file // Store DWT to file
// #ifdef OUTPUT // #ifdef OUTPUT
if (writeVisual) { if (writeVisual) {
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out"); writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
} else { } else {
@ -218,7 +218,7 @@ void processDWT(struct dwt *d, int forward, int writeVisual)
cudaCheckError("Cuda free device"); cudaCheckError("Cuda free device");
} }
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int optindex = 0; int optindex = 0;
char ch; char ch;
@ -233,13 +233,13 @@ int main(int argc, char **argv)
{"97", no_argument, 0, '9'}, //9/7 transform {"97", no_argument, 0, '9'}, //9/7 transform
{"53", no_argument, 0, '5' }, //5/3transform {"53", no_argument, 0, '5' }, //5/3transform
{"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear {"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
{"help", no_argument, 0, 'h'} {"help", no_argument, 0, 'h'}
}; };
int pixWidth = 0; //<real pixWidth int pixWidth = 0; //<real pixWidth
int pixHeight = 0; //<real pixHeight int pixHeight = 0; //<real pixHeight
int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
int bitDepth = 8; int bitDepth = 8;
int dwtLvls = 3; //default numuber of DWT levels int dwtLvls = 3; //default numuber of DWT levels
int device = 0; int device = 0;
int forward = 1; //forward transform int forward = 1; //forward transform
@ -322,19 +322,19 @@ int main(int argc, char **argv)
if (devCount == 0) { if (devCount == 0) {
printf("No CUDA enabled device\n"); printf("No CUDA enabled device\n");
return -1; return -1;
} }
if (device < 0 || device > devCount -1) { if (device < 0 || device > devCount -1) {
printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n", printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
device, 0, devCount -1); device, 0, devCount -1);
return -1; return -1;
} }
cudaDeviceProp devProp; cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, device); cudaGetDeviceProperties(&devProp, device);
cudaCheckError("Get device properties"); cudaCheckError("Get device properties");
// if (devProp.major < 1) { // if (devProp.major < 1) {
// printf("Device %d does not support CUDA\n", device); // printf("Device %d does not support CUDA\n", device);
// return -1; // return -1;
// } // }
printf("Using device %d: %s\n", device, devProp.name); printf("Using device %d: %s\n", device, devProp.name);
cudaSetDevice(device); cudaSetDevice(device);
cudaCheckError("Set selected device"); cudaCheckError("Set selected device");
@ -366,14 +366,14 @@ int main(int argc, char **argv)
printf(" DWT levels:\t\t%d\n", dwtLvls); printf(" DWT levels:\t\t%d\n", dwtLvls);
printf(" Forward transform:\t%d\n", forward); printf(" Forward transform:\t%d\n", forward);
printf(" 9/7 transform:\t\t%d\n", dwt97); printf(" 9/7 transform:\t\t%d\n", dwt97);
//data sizes //data sizes
int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
//load img source image //load img source image
cudaMallocHost((void **)&d->srcImg, inputSize); cudaMallocHost((void **)&d->srcImg, inputSize);
cudaCheckError("Alloc host memory"); cudaCheckError("Alloc host memory");
if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
return -1; return -1;
/* DWT */ /* DWT */

View File

@ -5,4 +5,3 @@
./dwt2d 4.bmp -d 4x4 -r -5 -l 3 ./dwt2d 4.bmp -d 4x4 -r -5 -l 3
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3 # ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3 # ./dwt2d 8.bmp -d 8x8 -f -9 -l 3

View File

@ -7,12 +7,3 @@
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart

View File

@ -1,42 +1,40 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(int n, float a, float *x, float *y) {
void saxpy(int n, float a, float *x, float *y) int i = blockIdx.x * blockDim.x + threadIdx.x;
{ if (i < n)
int i = blockIdx.x*blockDim.x + threadIdx.x; y[i] = a * x[i] + y[i];
if (i < n) y[i] = a*x[i] + y[i];
} }
int main(void) int main(void) {
{ int N = 1 << 20;
int N = 1<<20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float*)malloc(N*sizeof(float)); x = (float *)malloc(N * sizeof(float));
y = (float*)malloc(N*sizeof(float)); y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N*sizeof(float)); cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N*sizeof(float)); cudaMalloc(&d_y, N * sizeof(float));
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
x[i] = 1.0f; x[i] = 1.0f;
y[i] = 2.0f; y[i] = 2.0f;
} }
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements // Perform SAXPY on 1M elements
// saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y); // saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
float maxError = 0.0f; float maxError = 0.0f;
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
maxError = max(maxError, abs(y[i]-4.0f)); maxError = max(maxError, abs(y[i] - 4.0f));
printf("Max error: %f\n", maxError); printf("Max error: %f\n", maxError);
cudaFree(d_x); cudaFree(d_x);
cudaFree(d_y); cudaFree(d_y);
free(x); free(x);
free(y); free(y);
} }

View File

@ -1,42 +1,39 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(void) {
void saxpy(void) int i = blockIdx.x * blockDim.x + threadIdx.x;
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
printf("block_id:%d thread_id:%d \n", i) printf("block_id:%d thread_id:%d \n", i)
} }
int main(void) int main(void) {
{ int N = 1 << 20;
int N = 1<<20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float*)malloc(N*sizeof(float)); x = (float *)malloc(N * sizeof(float));
y = (float*)malloc(N*sizeof(float)); y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N*sizeof(float)); cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N*sizeof(float)); cudaMalloc(&d_y, N * sizeof(float));
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
x[i] = 1.0f; x[i] = 1.0f;
y[i] = 2.0f; y[i] = 2.0f;
} }
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements // Perform SAXPY on 1M elements
saxpy<<<(1,1)>>>; saxpy<<<(1, 1)>>>;
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
float maxError = 0.0f; float maxError = 0.0f;
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
maxError = max(maxError, abs(y[i]-4.0f)); maxError = max(maxError, abs(y[i] - 4.0f));
printf("Max error: %f\n", maxError); printf("Max error: %f\n", maxError);
cudaFree(d_x); cudaFree(d_x);
cudaFree(d_y); cudaFree(d_y);
free(x); free(x);
free(y); free(y);
} }

View File

@ -1,41 +1,36 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(int N) { printf("hello!: %d\n", N); }
void saxpy(int N)
{
printf("hello!: %d\n", N);
}
int main(void) int main(void) {
{ int N = 1 << 20;
int N = 1<<20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float*)malloc(N*sizeof(float)); x = (float *)malloc(N * sizeof(float));
y = (float*)malloc(N*sizeof(float)); y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N*sizeof(float)); cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N*sizeof(float)); cudaMalloc(&d_y, N * sizeof(float));
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
x[i] = 1.0f; x[i] = 1.0f;
y[i] = 2.0f; y[i] = 2.0f;
} }
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements // Perform SAXPY on 1M elements
saxpy<<<(1,1)>>>(N); saxpy<<<(1, 1)>>>(N);
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
float maxError = 0.0f; float maxError = 0.0f;
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
maxError = max(maxError, abs(y[i]-4.0f)); maxError = max(maxError, abs(y[i] - 4.0f));
printf("Max error: %f\n", maxError); printf("Max error: %f\n", maxError);
cudaFree(d_x); cudaFree(d_x);
cudaFree(d_y); cudaFree(d_y);
free(x); free(x);
free(y); free(y);
} }

View File

@ -1,41 +1,36 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(void) { printf("hello!\n"); }
void saxpy(void)
{
printf("hello!\n");
}
int main(void) int main(void) {
{ int N = 1 << 20;
int N = 1<<20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float*)malloc(N*sizeof(float)); x = (float *)malloc(N * sizeof(float));
y = (float*)malloc(N*sizeof(float)); y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N*sizeof(float)); cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N*sizeof(float)); cudaMalloc(&d_y, N * sizeof(float));
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
x[i] = 1.0f; x[i] = 1.0f;
y[i] = 2.0f; y[i] = 2.0f;
} }
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements // Perform SAXPY on 1M elements
saxpy<<<(1,1)>>>; saxpy<<<(1, 1)>>>;
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
float maxError = 0.0f; float maxError = 0.0f;
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
maxError = max(maxError, abs(y[i]-4.0f)); maxError = max(maxError, abs(y[i] - 4.0f));
printf("Max error: %f\n", maxError); printf("Max error: %f\n", maxError);
cudaFree(d_x); cudaFree(d_x);
cudaFree(d_y); cudaFree(d_y);
free(x); free(x);
free(y); free(y);
} }

View File

@ -43,7 +43,7 @@ cudaError_t cudaMallocHost(void **devPtr, size_t size) {
*devPtr = malloc(size); *devPtr = malloc(size);
if (devPtr == NULL) if (devPtr == NULL)
return cudaErrorMemoryAllocation; return cudaErrorMemoryAllocation;
return cudaSuccess; return cudaSuccess;
} }
cudaError_t cudaMemset(void *devPtr, int value, size_t count) { cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
memset(devPtr, value, count); memset(devPtr, value, count);