add dwt2d example and fixes
This commit is contained in:
parent
1d0a0a228d
commit
1add53fa86
|
@ -395,5 +395,9 @@ void init_block(llvm::Module *M, std::ofstream &fout) {
|
|||
// replace asm Inline
|
||||
replace_asm_call(M);
|
||||
// replace dynamic shared memory
|
||||
replace_dynamic_shared_memory(M);
|
||||
auto dynamic_shared_memory_addr =
|
||||
M->getGlobalVariable("dynamic_shared_memory");
|
||||
if (dynamic_shared_memory_addr) {
|
||||
replace_dynamic_shared_memory(M);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -270,11 +270,17 @@ void AddContextSaveRestore(llvm::Instruction *instruction,
|
|||
AddContextSave(instruction, alloca, intra_warp_loop);
|
||||
|
||||
std::vector<Instruction *> uses;
|
||||
Function *f2 = instruction->getParent()->getParent();
|
||||
|
||||
|
||||
for (Instruction::use_iterator ui = instruction->use_begin(),
|
||||
ue = instruction->use_end();
|
||||
ui != ue; ++ui) {
|
||||
llvm::Instruction *user = cast<Instruction>(ui->getUser());
|
||||
Function *f1 = user->getParent()->getParent();
|
||||
if(f2->getName() != f1->getName()) {
|
||||
continue;
|
||||
}
|
||||
if (user == NULL)
|
||||
continue;
|
||||
if (user == theStore)
|
||||
|
|
|
@ -86,6 +86,24 @@ void mem_share2global(llvm::Module *M) {
|
|||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
} else if (element_type->isStructTy()) {
|
||||
auto undef = llvm::UndefValue::get(element_type);
|
||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||
*M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef,
|
||||
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
|
||||
false);
|
||||
global_memory->setDSOLocal(true);
|
||||
Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName()));
|
||||
comdat->setSelectionKind(Comdat::SelectionKind::Any);
|
||||
global_memory->setComdat(comdat);
|
||||
global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
|
||||
global_memory->setInitializer(undef);
|
||||
global_memory->setAlignment(share_memory->getAlignment());
|
||||
corresponding_global_memory.insert(
|
||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||
global_memory));
|
||||
|
||||
|
||||
} else {
|
||||
assert(0 && "The required Share Memory Type is not supported\n");
|
||||
}
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _COMMON_H
|
||||
#define _COMMON_H
|
||||
|
||||
//24-bit multiplication is faster on G80,
|
||||
//but we must be sure to multiply integers
|
||||
//only within [-8M, 8M - 1] range
|
||||
#define IMUL(a, b) __mul24(a, b)
|
||||
|
||||
////cuda timing macros
|
||||
//#define CTIMERINIT cudaEvent_t cstart, cstop; \
|
||||
// cudaEventCreate(&cstart); \
|
||||
// cudaEventCreate(&cstop); \
|
||||
// float elapsedTime
|
||||
//#define CTIMERSTART(cstart) cudaEventRecord(cstart,0)
|
||||
//#define CTIMERSTOP(cstop) cudaEventRecord(cstop,0); \
|
||||
// cudaEventSynchronize(cstop); \
|
||||
// cudaEventElapsedTime(&elapsedTime, cstart, cstop)
|
||||
|
||||
//divide and round up macro
|
||||
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
|
||||
|
||||
# define cudaCheckError( msg ) { \
|
||||
cudaError_t err = cudaGetLastError(); \
|
||||
if( cudaSuccess != err) { \
|
||||
fprintf(stderr, "%s: %i: %s: %s.\n", \
|
||||
__FILE__, __LINE__, msg, cudaGetErrorString( err) ); \
|
||||
exit(-1); \
|
||||
} }
|
||||
|
||||
# define cudaCheckAsyncError( msg ) { \
|
||||
cudaThreadSynchronize(); \
|
||||
cudaCheckError( msg ); \
|
||||
}
|
||||
|
||||
|
||||
#endif
|
|
@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "components.h"
|
||||
#include "common.h"
|
||||
|
||||
#define THREADS 256
|
||||
|
||||
/* Store 3 RGB float components */
|
||||
__device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos)
|
||||
{
|
||||
d_r[pos] = (r/255.0f) - 0.5f;
|
||||
d_g[pos] = (g/255.0f) - 0.5f;
|
||||
d_b[pos] = (b/255.0f) - 0.5f;
|
||||
}
|
||||
|
||||
/* Store 3 RGB intege components */
|
||||
__device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos)
|
||||
{
|
||||
d_r[pos] = r - 128;
|
||||
d_g[pos] = g - 128;
|
||||
d_b[pos] = b - 128;
|
||||
}
|
||||
|
||||
/* Store float component */
|
||||
__device__ void storeComponent(float *d_c, float c, int pos)
|
||||
{
|
||||
d_c[pos] = (c/255.0f) - 0.5f;
|
||||
}
|
||||
|
||||
/* Store integer component */
|
||||
__device__ void storeComponent(int *d_c, int c, int pos)
|
||||
{
|
||||
d_c[pos] = c - 128;
|
||||
}
|
||||
|
||||
/* Copy img src data into three separated component buffers */
|
||||
template<typename T>
|
||||
__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
|
||||
unsigned char * d_src,
|
||||
int pixels)
|
||||
{
|
||||
int x = threadIdx.x;
|
||||
int gX = blockDim.x*blockIdx.x;
|
||||
|
||||
__shared__ unsigned char sData[THREADS*3];
|
||||
|
||||
/* Copy data to shared mem by 4bytes
|
||||
other checks are not necessary, since
|
||||
d_src buffer is aligned to sharedDataSize */
|
||||
if ( (x*4) < THREADS*3 ) {
|
||||
float *s = (float *)d_src;
|
||||
float *d = (float *)sData;
|
||||
d[x] = s[((gX*3)>>2) + x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
T r, g, b;
|
||||
|
||||
int offset = x*3;
|
||||
r = (T)(sData[offset]);
|
||||
g = (T)(sData[offset+1]);
|
||||
b = (T)(sData[offset+2]);
|
||||
|
||||
int globalOutputPosition = gX + x;
|
||||
if (globalOutputPosition < pixels) {
|
||||
storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy img src data into three separated component buffers */
|
||||
template<typename T>
|
||||
__global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
|
||||
{
|
||||
int x = threadIdx.x;
|
||||
int gX = blockDim.x*blockIdx.x;
|
||||
|
||||
__shared__ unsigned char sData[THREADS];
|
||||
|
||||
/* Copy data to shared mem by 4bytes
|
||||
other checks are not necessary, since
|
||||
d_src buffer is aligned to sharedDataSize */
|
||||
if ( (x*4) < THREADS) {
|
||||
float *s = (float *)d_src;
|
||||
float *d = (float *)sData;
|
||||
d[x] = s[(gX>>2) + x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
T c;
|
||||
|
||||
c = (T)(sData[x]);
|
||||
|
||||
int globalOutputPosition = gX + x;
|
||||
if (globalOutputPosition < pixels) {
|
||||
storeComponent(d_c, c, globalOutputPosition);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Separate compoents of 8bit RGB source image */
|
||||
template<typename T>
|
||||
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height)
|
||||
{
|
||||
unsigned char * d_src;
|
||||
int pixels = width*height;
|
||||
int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
|
||||
|
||||
/* Alloc d_src buffer */
|
||||
cudaMalloc((void **)&d_src, alignedSize);
|
||||
cudaCheckAsyncError("Cuda malloc")
|
||||
cudaMemset(d_src, 0, alignedSize);
|
||||
|
||||
/* Copy data to device */
|
||||
cudaMemcpy(d_src, src, pixels*3, cudaMemcpyHostToDevice);
|
||||
cudaCheckError("Copy data to device")
|
||||
|
||||
/* Kernel */
|
||||
dim3 threads(THREADS);
|
||||
dim3 grid(alignedSize/(THREADS*3));
|
||||
assert(alignedSize%(THREADS*3) == 0);
|
||||
c_CopySrcToComponents<<<grid, threads>>>(d_r, d_g, d_b, d_src, pixels);
|
||||
cudaCheckAsyncError("CopySrcToComponents kernel")
|
||||
|
||||
/* Free Memory */
|
||||
cudaFree(d_src);
|
||||
cudaCheckAsyncError("Free memory")
|
||||
}
|
||||
template void rgbToComponents<float>(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height);
|
||||
template void rgbToComponents<int>(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height);
|
||||
|
||||
|
||||
/* Copy a 8bit source image data into a color compoment of type T */
|
||||
template<typename T>
|
||||
void bwToComponent(T *d_c, unsigned char * src, int width, int height)
|
||||
{
|
||||
unsigned char * d_src;
|
||||
int pixels = width*height;
|
||||
int alignedSize = DIVANDRND(pixels, THREADS) * THREADS; //aligned to thread block size -- THREADS
|
||||
|
||||
/* Alloc d_src buffer */
|
||||
cudaMalloc((void **)&d_src, alignedSize);
|
||||
cudaCheckAsyncError("Cuda malloc")
|
||||
cudaMemset(d_src, 0, alignedSize);
|
||||
|
||||
/* Copy data to device */
|
||||
cudaMemcpy(d_src, src, pixels, cudaMemcpyHostToDevice);
|
||||
cudaCheckError("Copy data to device")
|
||||
|
||||
/* Kernel */
|
||||
dim3 threads(THREADS);
|
||||
dim3 grid(alignedSize/(THREADS));
|
||||
assert(alignedSize%(THREADS) == 0);
|
||||
c_CopySrcToComponent<<<grid, threads>>>(d_c, d_src, pixels);
|
||||
cudaCheckAsyncError("CopySrcToComponent kernel")
|
||||
|
||||
/* Free Memory */
|
||||
cudaFree(d_src);
|
||||
cudaCheckAsyncError("Free memory")
|
||||
}
|
||||
|
||||
template void bwToComponent<float>(float *d_c, unsigned char *src, int width, int height);
|
||||
template void bwToComponent<int>(int *d_c, unsigned char *src, int width, int height);
|
|
@ -0,0 +1,38 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _COMPONENTS_H
|
||||
#define _COMPONENTS_H
|
||||
|
||||
/* Separate compoents of source 8bit RGB image */
|
||||
template<typename T>
|
||||
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height);
|
||||
|
||||
/* Copy a 8bit source image data into a color compoment of type T */
|
||||
template<typename T>
|
||||
void bwToComponent(T *d_c, unsigned char * src, int width, int height);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,385 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include "dwt_cuda/dwt.h"
|
||||
#include "dwt_cuda/common.h"
|
||||
#include "dwt.h"
|
||||
#include "common.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
inline void fdwt(float *in, float *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running fdwt97 Float \n");
|
||||
dwt_cuda::fdwt97(in, out, width, height, levels);
|
||||
}
|
||||
/*
|
||||
inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut)
|
||||
{
|
||||
dwt_cuda::fdwt97(in, out, width, height, levels, diffOut);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void fdwt(int *in, int *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running fdwt53 Int \n");
|
||||
|
||||
dwt_cuda::fdwt53(in, out, width, height, levels);
|
||||
}
|
||||
/*
|
||||
inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut)
|
||||
{
|
||||
dwt_cuda::fdwt53(in, out, width, height, levels, diffOut);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void rdwt(float *in, float *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running rdwt97 Float \n");
|
||||
|
||||
dwt_cuda::rdwt97(in, out, width, height, levels);
|
||||
}
|
||||
|
||||
inline void rdwt(int *in, int *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running rdwt53 Int \n");
|
||||
|
||||
dwt_cuda::rdwt53(in, out, width, height, levels);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
|
||||
{
|
||||
printf("\n*** %d stages of 2D forward DWT:\n", stages);
|
||||
|
||||
/* create backup of input, because each test iteration overwrites it */
|
||||
const int size = pixHeight * pixWidth * sizeof(T);
|
||||
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
|
||||
/* Measure time of individual levels. */
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
|
||||
// Measure overall time of DWT.
|
||||
/* #ifdef GPU_DWT_TESTING_1
|
||||
|
||||
dwt_cuda::CudaDWTTester tester;
|
||||
for(int i = tester.getNumIterations(); i--; ) {
|
||||
// Recover input and measure one overall DWT run.
|
||||
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
tester.beginTestIteration();
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
tester.endTestIteration();
|
||||
}
|
||||
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
cudaCheckAsyncError("DWT Kernel calls");
|
||||
*/ return 0;
|
||||
}
|
||||
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
|
||||
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
template<typename T>
|
||||
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
|
||||
{
|
||||
printf("*** %d stages of 2D forward DWT:\n", stages);
|
||||
|
||||
// create backup of input, because each test iteration overwrites it
|
||||
const int size = pixHeight * pixWidth * sizeof(T);
|
||||
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
|
||||
// Measure time of individual levels.
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
|
||||
// Measure overall time of DWT.
|
||||
#ifdef GPU_DWT_TESTING_1
|
||||
|
||||
dwt_cuda::CudaDWTTester tester;
|
||||
for(int i = tester.getNumIterations(); i--; ) {
|
||||
// Recover input and measure one overall DWT run.
|
||||
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
tester.beginTestIteration();
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
tester.endTestIteration();
|
||||
}
|
||||
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
cudaCheckAsyncError("DWT Kernel calls");
|
||||
return 0;
|
||||
}
|
||||
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool, float*);
|
||||
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool, int*);
|
||||
|
||||
*/
|
||||
|
||||
void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename)
|
||||
{
|
||||
int i;
|
||||
std::ofstream outputFile;
|
||||
char outfile[strlen(filename)+strlen(".txt")];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), ".txt");
|
||||
outputFile.open(outfile);
|
||||
|
||||
|
||||
for(i = 0; i < samplesNum; i++) {
|
||||
float r = (src[i]+0.5f) * 255;
|
||||
if (r > 255) r = 255;
|
||||
if (r < 0) r = 0;
|
||||
dst[i] = (unsigned char)r;
|
||||
outputFile << "index: " << i << " val: "<< r <<" \n";
|
||||
|
||||
|
||||
}
|
||||
outputFile.close();
|
||||
}
|
||||
|
||||
void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename)
|
||||
{
|
||||
int i;
|
||||
std::ofstream outputFile;
|
||||
char outfile[strlen(filename)+strlen(".txt")];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), ".txt");
|
||||
outputFile.open(outfile);
|
||||
for(i = 0; i < samplesNum; i++) {
|
||||
int r = src[i]+128;
|
||||
if (r > 255) r = 255;
|
||||
if (r < 0) r = 0;
|
||||
dst[i] = (unsigned char)r;
|
||||
// added this line to output check
|
||||
outputFile << "index: " << i << " val: "<< r <<" \n";
|
||||
}
|
||||
outputFile.close();
|
||||
}
|
||||
|
||||
///* Write output linear orderd*/
|
||||
template<typename T>
|
||||
int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
|
||||
const char * filename, const char * suffix)
|
||||
{
|
||||
unsigned char * result;
|
||||
T *gpu_output;
|
||||
int i;
|
||||
int size;
|
||||
int samplesNum = pixWidth*pixHeight;
|
||||
|
||||
size = samplesNum*sizeof(T);
|
||||
cudaMallocHost((void **)&gpu_output, size);
|
||||
cudaCheckError("Malloc host");
|
||||
memset(gpu_output, 0, size);
|
||||
result = (unsigned char *)malloc(samplesNum);
|
||||
cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost);
|
||||
cudaCheckError("Memcopy device to host");
|
||||
|
||||
/* T to char */
|
||||
samplesToChar(result, gpu_output, samplesNum, filename);
|
||||
|
||||
/* Write component */
|
||||
char outfile[strlen(filename)+strlen(suffix)];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), suffix);
|
||||
i = open(outfile, O_CREAT|O_WRONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", outfile);
|
||||
return -1;
|
||||
}
|
||||
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
|
||||
ssize_t x ;
|
||||
x = write(i, result, samplesNum);
|
||||
close(i);
|
||||
|
||||
/* Clean up */
|
||||
cudaFreeHost(gpu_output);
|
||||
cudaCheckError("Cuda free host memory");
|
||||
free(result);
|
||||
if(x == 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||
|
||||
/* Write output visual ordered */
|
||||
template<typename T>
|
||||
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
|
||||
int stages, const char * filename, const char * suffix)
|
||||
{
|
||||
struct band {
|
||||
int dimX;
|
||||
int dimY;
|
||||
};
|
||||
struct dimensions {
|
||||
struct band LL;
|
||||
struct band HL;
|
||||
struct band LH;
|
||||
struct band HH;
|
||||
};
|
||||
|
||||
unsigned char * result;
|
||||
T *src, *dst;
|
||||
int i,s;
|
||||
int size;
|
||||
int offset;
|
||||
int yOffset;
|
||||
int samplesNum = pixWidth*pixHeight;
|
||||
struct dimensions * bandDims;
|
||||
|
||||
bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
|
||||
|
||||
bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
|
||||
bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
|
||||
bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
|
||||
bandDims[0].HL.dimY = bandDims[0].LL.dimY;
|
||||
bandDims[0].LH.dimX = bandDims[0].LL.dimX;
|
||||
bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
|
||||
bandDims[0].HH.dimX = bandDims[0].HL.dimX;
|
||||
bandDims[0].HH.dimY = bandDims[0].LH.dimY;
|
||||
|
||||
for (i = 1; i < stages; i++) {
|
||||
bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
|
||||
bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
|
||||
bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
|
||||
bandDims[i].HL.dimY = bandDims[i].LL.dimY;
|
||||
bandDims[i].LH.dimX = bandDims[i].LL.dimX;
|
||||
bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
|
||||
bandDims[i].HH.dimX = bandDims[i].HL.dimX;
|
||||
bandDims[i].HH.dimY = bandDims[i].LH.dimY;
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
|
||||
for (i = 0; i < stages; i++) {
|
||||
printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
|
||||
printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
|
||||
printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
|
||||
printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
|
||||
}
|
||||
#endif
|
||||
|
||||
size = samplesNum*sizeof(T);
|
||||
cudaMallocHost((void **)&src, size);
|
||||
cudaCheckError("Malloc host");
|
||||
dst = (T*)malloc(size);
|
||||
memset(src, 0, size);
|
||||
memset(dst, 0, size);
|
||||
result = (unsigned char *)malloc(samplesNum);
|
||||
cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost);
|
||||
cudaCheckError("Memcopy device to host");
|
||||
|
||||
// LL Band
|
||||
size = bandDims[stages-1].LL.dimX * sizeof(T);
|
||||
for (i = 0; i < bandDims[stages-1].LL.dimY; i++) {
|
||||
memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
|
||||
}
|
||||
|
||||
for (s = stages - 1; s >= 0; s--) {
|
||||
// HL Band
|
||||
size = bandDims[s].HL.dimX * sizeof(T);
|
||||
offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
|
||||
for (i = 0; i < bandDims[s].HL.dimY; i++) {
|
||||
memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
|
||||
src+offset+i*bandDims[s].HL.dimX,
|
||||
size);
|
||||
}
|
||||
|
||||
// LH band
|
||||
size = bandDims[s].LH.dimX * sizeof(T);
|
||||
offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
|
||||
yOffset = bandDims[s].LL.dimY;
|
||||
for (i = 0; i < bandDims[s].HL.dimY; i++) {
|
||||
memcpy(dst+(yOffset+i)*pixWidth,
|
||||
src+offset+i*bandDims[s].LH.dimX,
|
||||
size);
|
||||
}
|
||||
|
||||
//HH band
|
||||
size = bandDims[s].HH.dimX * sizeof(T);
|
||||
offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
|
||||
yOffset = bandDims[s].HL.dimY;
|
||||
for (i = 0; i < bandDims[s].HH.dimY; i++) {
|
||||
memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
|
||||
src+offset+i*bandDims[s].HH.dimX,
|
||||
size);
|
||||
}
|
||||
}
|
||||
|
||||
/* Write component */
|
||||
samplesToChar(result, dst, samplesNum, filename);
|
||||
|
||||
char outfile[strlen(filename)+strlen(suffix)];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), suffix);
|
||||
i = open(outfile, O_CREAT|O_WRONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", outfile);
|
||||
return -1;
|
||||
}
|
||||
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
|
||||
ssize_t x;
|
||||
x = write(i, result, samplesNum);
|
||||
close(i);
|
||||
|
||||
cudaFreeHost(src);
|
||||
cudaCheckError("Cuda free host memory");
|
||||
free(dst);
|
||||
free(result);
|
||||
free(bandDims);
|
||||
if (x == 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
||||
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _DWT_H
|
||||
#define _DWT_H
|
||||
|
||||
template<typename T>
|
||||
int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
|
||||
|
||||
template<typename T>
|
||||
int writeNStage2DDWT(T *component_cuda, int width, int height,
|
||||
int stages, const char * filename, const char * suffix);
|
||||
template<typename T>
|
||||
int writeLinear(T *component_cuda, int width, int height,
|
||||
const char * filename, const char * suffix);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,35 @@
|
|||
///
|
||||
/// @file common.cu
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 14:37
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
#include "common.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
bool CudaDWTTester::testRunning = false;
|
||||
}
|
|
@ -0,0 +1,261 @@
|
|||
///
|
||||
/// @file common.h
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @brief Common stuff for all CUDA dwt functions.
|
||||
/// @date 2011-01-20 14:19
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#ifndef DWT_COMMON_H
|
||||
#define DWT_COMMON_H
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
|
||||
|
||||
// compile time minimum macro
|
||||
#define CTMIN(a,b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
|
||||
|
||||
// performance testing macros
|
||||
#if defined(GPU_DWT_TESTING)
|
||||
#define PERF_BEGIN \
|
||||
{ \
|
||||
dwt_cuda::CudaDWTTester PERF_TESTER; \
|
||||
for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \
|
||||
{ \
|
||||
PERF_TESTER.beginTestIteration();
|
||||
|
||||
#define PERF_END(PERF_NAME, PERF_W, PERF_H) \
|
||||
PERF_TESTER.endTestIteration(); \
|
||||
} \
|
||||
PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
|
||||
}
|
||||
#else // GPU_DWT_TESTING
|
||||
#define PERF_BEGIN
|
||||
#define PERF_END(PERF_NAME, PERF_W, PERF_H)
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Divide and round up.
|
||||
template <typename T>
|
||||
__device__ __host__ inline T divRndUp(const T & n, const T & d) {
|
||||
return (n / d) + ((n % d) ? 1 : 0);
|
||||
}
|
||||
|
||||
|
||||
// 9/7 forward DWT lifting schema coefficients
|
||||
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
|
||||
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
|
||||
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
|
||||
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
|
||||
|
||||
|
||||
// 9/7 reverse DWT lifting schema coefficients
|
||||
const float r97update2 = -f97Update2; ///< undo 9/7 update 2
|
||||
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
|
||||
const float r97update1 = -f97Update1; ///< undo 9/7 update 1
|
||||
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
|
||||
|
||||
// FDWT 9/7 scaling coefficients
|
||||
const float scale97Mul = 1.23017410491400f;
|
||||
const float scale97Div = 1.0 / scale97Mul;
|
||||
|
||||
|
||||
// 5/3 forward DWT lifting schema coefficients
|
||||
const float forward53Predict = -0.5f; /// forward 5/3 predict
|
||||
const float forward53Update = 0.25f; /// forward 5/3 update
|
||||
|
||||
// 5/3 forward DWT lifting schema coefficients
|
||||
const float reverse53Update = -forward53Update; /// undo 5/3 update
|
||||
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
|
||||
|
||||
|
||||
|
||||
/// Functor which adds scaled sum of neighbors to given central pixel.
|
||||
struct AddScaledSum {
|
||||
const float scale; // scale of neighbors
|
||||
__device__ AddScaledSum(const float scale) : scale(scale) {}
|
||||
__device__ void operator()(const float p, float & c, const float n) const {
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
|
||||
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
|
||||
|
||||
// }
|
||||
|
||||
c += scale * (p + n);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Returns index ranging from 0 to num threads, such that first half
|
||||
/// of threads get even indices and others get odd indices. Each thread
|
||||
/// gets different index.
|
||||
/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7
|
||||
/// parityIdx: 0 2 4 6 1 3 5 7
|
||||
/// @tparam THREADS total count of participating threads
|
||||
/// @return parity-separated index of thread
|
||||
template <int THREADS>
|
||||
__device__ inline int parityIdx() {
|
||||
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// size of shared memory
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int SHM_SIZE = 48 * 1024;
|
||||
#else
|
||||
const int SHM_SIZE = 16 * 1024;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/// Perrformance and return code tester.
|
||||
class CudaDWTTester {
|
||||
private:
|
||||
static bool testRunning; ///< true if any test is currently running
|
||||
cudaEvent_t beginEvent; ///< begin CUDA event
|
||||
cudaEvent_t endEvent; ///< end CUDA event
|
||||
std::vector<float> times; ///< collected times
|
||||
const bool disabled; ///< true if this object is disabled
|
||||
public:
|
||||
/// Checks CUDA related error.
|
||||
/// @param status return code to be checked
|
||||
/// @param message message to be shown if there was an error
|
||||
/// @return true if there was no error, false otherwise
|
||||
static bool check(const cudaError_t & status, const char * message) {
|
||||
#if defined(GPU_DWT_TESTING)
|
||||
if((!testRunning) && status != cudaSuccess) {
|
||||
const char * errorString = cudaGetErrorString(status);
|
||||
fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
|
||||
fflush(stderr);
|
||||
return false;
|
||||
}
|
||||
#endif // GPU_DWT_TESTING
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Checks last kernel call for errors.
|
||||
/// @param message description of the kernel call
|
||||
/// @return true if there was no error, false otherwise
|
||||
static bool checkLastKernelCall(const char * message) {
|
||||
#if defined(GPU_DWT_TESTING)
|
||||
return testRunning ? true : check(cudaThreadSynchronize(), message);
|
||||
#else // GPU_DWT_TESTING
|
||||
return true;
|
||||
#endif // GPU_DWT_TESTING
|
||||
}
|
||||
|
||||
/// Initializes DWT tester for time measurement
|
||||
CudaDWTTester() : disabled(testRunning) {}
|
||||
|
||||
/// Gets rpefered number of iterations
|
||||
int getNumIterations() {
|
||||
return disabled ? 1 : 31;
|
||||
}
|
||||
|
||||
/// Starts one test iteration.
|
||||
void beginTestIteration() {
|
||||
if(!disabled) {
|
||||
cudaEventCreate(&beginEvent);
|
||||
cudaEventCreate(&endEvent);
|
||||
cudaEventRecord(beginEvent, 0);
|
||||
testRunning = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Ends on etest iteration.
|
||||
void endTestIteration() {
|
||||
if(!disabled) {
|
||||
float time;
|
||||
testRunning = false;
|
||||
cudaEventRecord(endEvent, 0);
|
||||
cudaEventSynchronize(endEvent);
|
||||
cudaEventElapsedTime(&time, beginEvent, endEvent);
|
||||
cudaEventDestroy(beginEvent);
|
||||
cudaEventDestroy(endEvent);
|
||||
times.push_back(time);
|
||||
}
|
||||
}
|
||||
|
||||
/// Shows brief info about all iterations.
|
||||
/// @param name name of processing method
|
||||
/// @param sizeX width of processed image
|
||||
/// @param sizeY height of processed image
|
||||
void showPerformance(const char * name, const int sizeX, const int sizeY) {
|
||||
if(!disabled) {
|
||||
// compute mean and median
|
||||
std::sort(times.begin(), times.end());
|
||||
double sum = 0;
|
||||
for(int i = times.size(); i--; ) {
|
||||
sum += times[i];
|
||||
}
|
||||
const double median = (times[times.size() / 2]
|
||||
+ times[(times.size() - 1) / 2]) * 0.5f;
|
||||
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
|
||||
"(%d x %d)\n", name, (sum / times.size()), median,
|
||||
times[times.size() - 1], sizeX, sizeY);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Simple cudaMemcpy wrapped in performance tester.
|
||||
/// @param dest destination bufer
|
||||
/// @param src source buffer
|
||||
/// @param sx width of copied image
|
||||
/// @param sy height of copied image
|
||||
template <typename T>
|
||||
inline void memCopy(T * const dest, const T * const src,
|
||||
const size_t sx, const size_t sy) {
|
||||
cudaError_t status;
|
||||
PERF_BEGIN
|
||||
status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice);
|
||||
PERF_END(" memcpy", sx, sy)
|
||||
CudaDWTTester::check(status, "memcpy device > device");
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
||||
|
||||
|
||||
|
||||
#endif // DWT_COMMON_CUDA_H
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
///
|
||||
/// @file dwt.h
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @brief Entry points for CUDA implementaion of 9/7 and 5/3 DWT.
|
||||
/// @date 2011-01-20 11:41
|
||||
///
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
///
|
||||
///
|
||||
/// Following conditions are common for all four DWT functions:
|
||||
/// - Both input and output images are stored in GPU memory with no padding
|
||||
/// of lines or interleaving of pixels.
|
||||
/// - DWT coefficients are stored as follows: Each band is saved as one
|
||||
/// consecutive chunk (no padding/stride/interleaving). Deepest level bands
|
||||
/// (smallest ones) are stored first (at the beginning of the input/output
|
||||
/// buffers), less deep bands follow. There is no padding between stored
|
||||
/// bands in the buffer. Order of bands of the same level in the buffer is
|
||||
/// following: Low-low band (or deeper level subbands) is stored first.
|
||||
/// Vertical-low/horizontal-high band follows. Vertical-high/horizonal-low
|
||||
/// band is saved next and finally, the high-high band is saved. Out of all
|
||||
/// low-low bands, only th edeepest one is saved (right at the beginning of
|
||||
/// the buffer), others are replaced with deeper level subbands.
|
||||
/// - Input images of all functions won't be preserved (will be overwritten).
|
||||
/// - Input and output buffers can't overlap.
|
||||
/// - Size of output buffer must be greater or equal to size of input buffer.
|
||||
///
|
||||
/// There are no common compile time settings (buffer size, etc...) for
|
||||
/// all DWTs, because each DTW type needs different amount of GPU resources.
|
||||
/// Instead, each DWT type has its own compile time settings, which can be
|
||||
/// found in *.cu file, where it is implemented.
|
||||
///
|
||||
|
||||
#ifndef DWT_CUDA_H
|
||||
#define DWT_CUDA_H
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Forward 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Expected to be normalized into range [-128, 127].
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-128, 127].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
/// Forward 9/7 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Should be normalized (in range
|
||||
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - format specified in common rules
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
/// Reverse 9/7 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-0.5, 0.5].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
|
||||
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
|
||||
|
||||
#endif // DWT_CUDA_H
|
||||
|
|
@ -0,0 +1,400 @@
|
|||
/// @file fdwt53.cu
|
||||
/// @brief CUDA implementation of forward 5/3 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-04 13:23
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Wraps buffer and methods needed for computing one level of 5/3 FDWT
|
||||
/// using sliding window approach.
|
||||
/// @tparam WIN_SIZE_X width of sliding window
|
||||
/// @tparam WIN_SIZE_Y height of sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class FDWT53 {
|
||||
private:
|
||||
|
||||
/// Info needed for processing of one input column.
|
||||
/// @tparam CHECKED_LOADER true if column's loader should check boundaries
|
||||
/// false if there are no near boudnaries to check
|
||||
template <bool CHECKED_LOADER>
|
||||
struct FDWT53Column {
|
||||
/// loader for the column
|
||||
VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;
|
||||
|
||||
/// offset of the column in shared buffer
|
||||
int offset;
|
||||
|
||||
// backup of first 3 loaded pixels (not transformed)
|
||||
int pixel0, pixel1, pixel2;
|
||||
|
||||
/// Sets all fields to anything to prevent 'uninitialized' warnings.
|
||||
__device__ void clear() {
|
||||
offset = pixel0 = pixel1 = pixel2 = 0;
|
||||
loader.clear();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Type of shared memory buffer for 5/3 FDWT transforms.
|
||||
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> FDWT53Buffer;
|
||||
|
||||
/// Actual shared buffer used for forward 5/3 DWT.
|
||||
FDWT53Buffer buffer;
|
||||
|
||||
/// Difference between indices of two vertical neighbors in buffer.
|
||||
enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// Forward 5/3 DWT predict operation.
|
||||
struct Forward53Predict {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
// c = n;
|
||||
c -= (p + n) / 2; // F.8, page 126, ITU-T Rec. T.800 final draft the real one
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Forward 5/3 DWT update operation.
|
||||
struct Forward53Update {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
c += (p + n + 2) / 4; // F.9, page 126, ITU-T Rec. T.800 final draft
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Initializes one column: computes offset of the column in shared memory
|
||||
/// buffer, initializes loader and finally uses it to load first 3 pixels.
|
||||
/// @tparam CHECKED true if loader of the column checks boundaries
|
||||
/// @param column (uninitialized) column info to be initialized
|
||||
/// @param input input image
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param colIndex x-axis coordinate of the column (relative to the left
|
||||
/// side of this threadblock's block of input pixels)
|
||||
/// @param firstY y-axis coordinate of first image row to be transformed
|
||||
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(FDWT53Column<CHECKED> & column,
|
||||
const int * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
const int colIndex, const int firstY) {
|
||||
// get offset of the column with index 'cId'
|
||||
column.offset = buffer.getColumnOffset(colIndex);
|
||||
|
||||
// coordinates of the first pixel to be loaded
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 3 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
column.pixel2 = column.loader.loadFrom(input); // loaded pixel #0
|
||||
column.pixel1 = column.loader.loadFrom(input); // loaded pixel #1
|
||||
column.pixel0 = column.loader.loadFrom(input); // loaded pixel #2
|
||||
|
||||
// reinitialize loader to start with pixel #1 again
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY + 1);
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 2);
|
||||
|
||||
// load 3 rows into the column
|
||||
column.pixel0 = column.loader.loadFrom(input);
|
||||
column.pixel1 = column.loader.loadFrom(input);
|
||||
column.pixel2 = column.loader.loadFrom(input);
|
||||
// Now, the next pixel, which will be loaded by loader, is pixel #1.
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Loads and vertically transforms given column. Assumes that first 3
|
||||
/// pixels are already loaded in column fields pixel0 ... pixel2.
|
||||
/// @tparam CHECKED true if loader of the column checks boundaries
|
||||
/// @param column column to be loaded and vertically transformed
|
||||
/// @param input pointer to input image data
|
||||
template <bool CHECKED>
|
||||
__device__ void loadAndVerticallyTransform(FDWT53Column<CHECKED> & column,
|
||||
const int * const input) {
|
||||
// take 3 loaded pixels and put them into shared memory transform buffer
|
||||
buffer[column.offset + 0 * STRIDE] = column.pixel0;
|
||||
buffer[column.offset + 1 * STRIDE] = column.pixel1;
|
||||
buffer[column.offset + 2 * STRIDE] = column.pixel2;
|
||||
|
||||
// load remaining pixels to be able to vertically transform the window
|
||||
|
||||
for(int i = 3; i < (3 + WIN_SIZE_Y); i++)
|
||||
{
|
||||
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
|
||||
}
|
||||
|
||||
// remember last 3 pixels for use in next iteration
|
||||
column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE];
|
||||
column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE];
|
||||
column.pixel2 = buffer[column.offset + (WIN_SIZE_Y + 2) * STRIDE];
|
||||
|
||||
// vertically transform the column in transform buffer
|
||||
buffer.forEachVerticalOdd(column.offset, Forward53Predict());
|
||||
buffer.forEachVerticalEven(column.offset, Forward53Update());
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Actual implementation of 5/3 FDWT.
|
||||
/// @tparam CHECK_LOADS true if input loader must check boundaries
|
||||
/// @tparam CHECK_WRITES true if output writer must check boundaries
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template <bool CHECK_LOADS, bool CHECK_WRITES>
|
||||
__device__ void transform(const int * const in, int * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about one main and one boundary columns processed by this thread
|
||||
FDWT53Column<CHECK_LOADS> column;
|
||||
FDWT53Column<CHECK_LOADS> boundaryColumn; // only few threads use this
|
||||
|
||||
// Initialize all column info: initialize loaders, compute offset of
|
||||
// column in shared buffer and initialize loader of column.
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); //has been checked Mar 9th
|
||||
|
||||
|
||||
// first 3 threads initialize boundary columns, others do not use them
|
||||
boundaryColumn.clear();
|
||||
if(threadIdx.x < 3) {
|
||||
// index of boundary column (relative x-axis coordinate of the column)
|
||||
const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3);
|
||||
|
||||
// initialize the column
|
||||
initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// index of column which will be written into output by this thread
|
||||
const int outColumnIndex = parityIdx<WIN_SIZE_X>();
|
||||
|
||||
// offset of column which will be written by this thread into output
|
||||
const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
|
||||
|
||||
// initialize output writer for this thread
|
||||
const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
|
||||
VerticalDWTBandWriter<int, CHECK_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, outputFirstX, firstY);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
// Sliding window iterations:
|
||||
// Each iteration assumes that first 3 pixels of each column are loaded.
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
|
||||
// For each column (including boundary columns): load and vertically
|
||||
// transform another WIN_SIZE_Y lines.
|
||||
loadAndVerticallyTransform(column, in);
|
||||
if(threadIdx.x < 3) {
|
||||
loadAndVerticallyTransform(boundaryColumn, in);
|
||||
}
|
||||
|
||||
// wait for all columns to be vertically transformed and transform all
|
||||
// output rows horizontally
|
||||
__syncthreads();
|
||||
|
||||
|
||||
buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
|
||||
__syncthreads();
|
||||
|
||||
buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());
|
||||
|
||||
// wait for all output rows to be transformed horizontally and write
|
||||
// them into output buffer
|
||||
__syncthreads();
|
||||
|
||||
|
||||
for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
|
||||
// Write low coefficients from output column into low band ...
|
||||
writer.writeLowInto(out, buffer[outColumnOffset + r * STRIDE]);
|
||||
// ... and high coeficients into the high band.
|
||||
writer.writeHighInto(out, buffer[outColumnOffset + (r+1) * STRIDE]);
|
||||
}
|
||||
|
||||
// before proceeding to next iteration, wait for all output columns
|
||||
// to be written into the output
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Determines, whether this block's pixels touch boundary and selects
|
||||
/// right version of algorithm according to it - for many threadblocks, it
|
||||
/// selects version which does not deal with boundary mirroring and thus is
|
||||
/// slightly faster.
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
/// @param steps number of sliding window steps
|
||||
__device__ static void run(const int * const in, int * const out,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// if(blockIdx.x==0 && blockIdx.y ==11 && threadIdx.x >=0&&threadIdx.x <64){
|
||||
// object with transform buffer in shared memory
|
||||
__shared__ FDWT53<WIN_SIZE_X, WIN_SIZE_Y> fdwt53;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (1 in next expressions is for radius of impulse response of 9/7 FDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
|
||||
const bool atRightBoudary = maxX >= sx;
|
||||
const bool atBottomBoudary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("fdwt53 run");
|
||||
// }
|
||||
if(atBottomBoudary)
|
||||
{
|
||||
// near bottom boundary => check both writing and reading
|
||||
fdwt53.transform<true, true>(in, out, sx, sy, steps);
|
||||
} else if(atRightBoudary)
|
||||
{
|
||||
// near right boundary only => check writing only
|
||||
fdwt53.transform<false, true>(in, out, sx, sy, steps);
|
||||
} else
|
||||
{
|
||||
// no nearby boundary => check nothing
|
||||
fdwt53.transform<false, false>(in, out, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
// }
|
||||
|
||||
}; // end of class FDWT53
|
||||
|
||||
|
||||
|
||||
/// Main GPU 5/3 FDWT entry point.
|
||||
/// @tparam WIN_SX width of sliding window to be used
|
||||
/// @tparam WIN_SY height of sliding window to be used
|
||||
/// @param input input image
|
||||
/// @param output output buffer
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void fdwt53Kernel(const int * const input, int * const output,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
FDWT53<WIN_SX, WIN_SY>::run(input, output, sizeX, sizeY, winSteps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then lanches the 5/3 FDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
|
||||
|
||||
int gx = divRndUp(sx, WIN_SX);
|
||||
int gy = divRndUp(sy, WIN_SY * steps);
|
||||
|
||||
printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
// printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
|
||||
|
||||
// run kernel, possibly measure time and finally check the call
|
||||
// PERF_BEGIN
|
||||
fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
// PERF_END(" FDWT53", sx, sy)
|
||||
// CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel");
|
||||
printf("fdwt53Kernel in launchFDWT53Kernel has finished");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Forward 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Expected to be normalized into range [-128, 127].
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
|
||||
// select right width of kernel for the size of the image
|
||||
|
||||
if(sizeX >= 960) {
|
||||
launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
|
||||
}
|
||||
|
||||
// if this was not the last level, continue recursively with other levels
|
||||
if(levels > 1) {
|
||||
// copy output's LL band back into input buffer
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
// printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY);
|
||||
memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238
|
||||
|
||||
// run remaining levels of FDWT
|
||||
fdwt53(in, out, llSizeX, llSizeY, levels - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,383 @@
|
|||
///
|
||||
/// @file fdwt97.cu
|
||||
/// @brief CUDA implementation of forward 9/7 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 13:18
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
|
||||
/// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
|
||||
/// of specified size. Template arguments specify this size.
|
||||
/// @tparam WIN_SIZE_X width of sliding window
|
||||
/// @tparam WIN_SIZE_Y height of sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class FDWT97 {
|
||||
private:
|
||||
/// Type of shared memory buffer used for 9/7 DWT.
|
||||
typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> FDWT97Buffer;
|
||||
|
||||
/// Actual shared buffer used for forward 9/7 DWT.
|
||||
FDWT97Buffer buffer;
|
||||
|
||||
/// Difference of indices of two vertically neighboring items in buffer.
|
||||
enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// One thread's info about loading input image
|
||||
/// @tparam CHECKED true if loader should check for image boundaries
|
||||
template <bool CHECKED>
|
||||
struct FDWT97ColumnLoadingInfo {
|
||||
/// Loader of pixels from some input image.
|
||||
VerticalDWTPixelLoader<float, CHECKED> loader;
|
||||
|
||||
/// Offset of column loaded by loader. (Offset in shared buffer.)
|
||||
int offset;
|
||||
};
|
||||
|
||||
|
||||
/// Horizontal 9/7 FDWT on specified lines of transform buffer.
|
||||
/// @param lines number of lines to be transformed
|
||||
/// @param firstLine index of the first line to be transformed
|
||||
__device__ void horizontalFDWT97(const int lines, const int firstLine) {
|
||||
__syncthreads();
|
||||
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2));
|
||||
__syncthreads();
|
||||
|
||||
buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Initializes one column of shared transform buffer with 7 input pixels.
|
||||
/// Those 7 pixels will not be transformed. Also initializes given loader.
|
||||
/// @tparam CHECKED true if loader should check for image boundaries
|
||||
/// @param column (uninitialized) object for loading input pixels
|
||||
/// @param columnIndex index (not offset!) of the column to be loaded
|
||||
/// (relative to threadblock's first column)
|
||||
/// @param input pointer to input image in GPU memory
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param firstY index of first row to be loaded from image
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
|
||||
const int columnIndex, const float * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
const int firstY) {
|
||||
// get offset of the column with index 'columnIndex'
|
||||
column.offset = buffer.getColumnOffset(columnIndex);
|
||||
|
||||
// printf(" offset: %d , threadIdx: %d, blockIdx.y %d\n ", column.offset, threadIdx.x, blockIdx.y);
|
||||
|
||||
// x-coordinate of the first pixel to be loaded by given loader
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 7 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
buffer[column.offset + 4 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 3 * STRIDE] =
|
||||
buffer[column.offset + 5 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 2 * STRIDE] =
|
||||
buffer[column.offset + 6 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadFrom(input);
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadFrom(input);
|
||||
|
||||
// reinitialize loader to start with pixel #3 again
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY + 3);
|
||||
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 4);
|
||||
|
||||
// load 7 rows into the transform buffer
|
||||
for(int i = 0; i < 7; i++) {
|
||||
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
|
||||
|
||||
}
|
||||
}
|
||||
// Now, the next pixel, which will be loaded by loader, is pixel #3.
|
||||
}
|
||||
|
||||
|
||||
/// Loads another WIN_SIZE_Y pixels into given column using given loader.
|
||||
/// @tparam CHECKED true if loader should check for image boundaries
|
||||
/// @param input input image to load from
|
||||
/// @param column loader and offset of loaded column in shared buffer
|
||||
template <bool CHECKED>
|
||||
inline __device__ void loadWindowIntoColumn(const float * const input,
|
||||
FDWT97ColumnLoadingInfo<CHECKED> & column) {
|
||||
for(int i = 7; i < (7 + WIN_SIZE_Y); i++) {
|
||||
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Main GPU 9/7 FDWT entry point.
|
||||
/// @tparam CHECK_LOADS true if boundaries should be checked when loading
|
||||
/// @tparam CHECK_WRITES true if boundaries should be checked when writing
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param winSteps number of steps of sliding window
|
||||
template <bool CHECK_LOADS, bool CHECK_WRITES>
|
||||
__device__ void transform(const float * const in, float * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about columns loaded by this thread: one main column and possibly
|
||||
// one boundary column. (Only some threads load some boundary column.)
|
||||
FDWT97ColumnLoadingInfo<CHECK_LOADS> loadedColumn;
|
||||
FDWT97ColumnLoadingInfo<CHECK_LOADS> boundaryColumn;
|
||||
|
||||
// Initialize first 7 lines of transform buffer.
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY);
|
||||
|
||||
// Some threads initialize boundary columns.
|
||||
boundaryColumn.offset = 0;
|
||||
boundaryColumn.loader.clear();
|
||||
if(threadIdx.x < 7) {
|
||||
// each thread among first 7 ones gets index of one of boundary columns
|
||||
const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7);
|
||||
|
||||
// Thread initializes offset of the boundary column (in shared buffer),
|
||||
// first 7 pixels of the column and a loader for this column.
|
||||
initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY);
|
||||
}
|
||||
|
||||
// horizontally transform first 7 rows in all columns
|
||||
horizontalFDWT97(7, 0);
|
||||
|
||||
// Index of column handled by this thread. (First half of threads handle
|
||||
// even columns and others handle odd columns.)
|
||||
const int outColumnIndex = parityIdx<WIN_SIZE_X>();
|
||||
|
||||
// writer of output linear bands - initialize it
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
|
||||
VerticalDWTBandWriter<float, CHECK_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// transform buffer offset of column transformed and saved by this thread
|
||||
const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
|
||||
|
||||
// (Each iteration of this loop assumes that first 7 rows of transform
|
||||
// buffer are already loaded with horizontally transformed coefficients.)
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
// Load another WIN_SIZE_Y lines of thread's column into the buffer.
|
||||
loadWindowIntoColumn(in, loadedColumn);
|
||||
|
||||
// some threads also load boundary columns
|
||||
if(threadIdx.x < 7) {
|
||||
loadWindowIntoColumn(in, boundaryColumn);
|
||||
}
|
||||
|
||||
// horizontally transform all newly loaded lines
|
||||
horizontalFDWT97(WIN_SIZE_Y, 7);
|
||||
|
||||
// Using 7 registers, remember current values of last 7 rows of
|
||||
// transform buffer. These rows are transformed horizontally only
|
||||
// and will be used in next iteration.
|
||||
float last7Lines[7];
|
||||
for(int i = 0; i < 7; i++) {
|
||||
last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
|
||||
}
|
||||
|
||||
// vertically transform all central columns (do not scale yet)
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1));
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2));
|
||||
|
||||
// Save all results of current window. Results are in transform buffer
|
||||
// at rows from #4 to #(4 + WIN_SIZE_Y). Other rows are invalid now.
|
||||
// (They only served as a boundary for vertical FDWT.)
|
||||
|
||||
for(int i = 4; i < (4 + WIN_SIZE_Y); i += 2) {
|
||||
const int index = outColumnOffset + i * STRIDE;
|
||||
// Write low coefficients from column into low band ...
|
||||
writer.writeLowInto(out, buffer[index] * scale97Div);
|
||||
// ... and high coeficients into the high band.
|
||||
writer.writeHighInto(out, buffer[index + STRIDE] * scale97Mul);
|
||||
}
|
||||
|
||||
// Use last 7 remembered lines as first 7 lines for next iteration.
|
||||
// As expected, these lines are already horizontally transformed.
|
||||
for(int i = 0; i < 7; i++) {
|
||||
buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
|
||||
|
||||
}
|
||||
|
||||
// Wait for all writing threads before proceeding to loading new
|
||||
// pixels in next iteration. (Not to overwrite those which
|
||||
// are not written yet.)
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Runs one of specialized variants of 9/7 FDWT according to distance of
|
||||
/// processed pixels to image boudnary. Some variants do not check for
|
||||
/// boudnary and thus are slightly faster.
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
/// @param steps number of steps of sliding window
|
||||
__device__ static void run(const float * const input, float * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// object with transform buffer in shared memory
|
||||
__shared__ FDWT97<WIN_SIZE_X, WIN_SIZE_Y> fdwt97;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (3 in next expressions is for radius of impulse response of 9/7 FDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
|
||||
const bool atRightBoudary = maxX >= sx;
|
||||
const bool atBottomBoudary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
if(atBottomBoudary) {
|
||||
// near bottom boundary => check both writing and reading
|
||||
// printf("\n atBottomBoudary \n ");
|
||||
fdwt97.transform<true, true>(input, output, sx, sy, steps);
|
||||
} else if(atRightBoudary) {
|
||||
|
||||
// near right boundary only => check writing only
|
||||
fdwt97.transform<false, true>(input, output, sx, sy, steps);
|
||||
} else {
|
||||
|
||||
// no nearby boundary => check nothing
|
||||
fdwt97.transform<false, false>(input, output, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
|
||||
}; // end of class FDWT97
|
||||
|
||||
|
||||
|
||||
/// Main GPU 9/7 FDWT entry point.
|
||||
/// @param input input image
|
||||
/// @parma output output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
/// @param steps number of steps of sliding window
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void fdwt97Kernel(const float * const input, float * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// Excuse me, dear reader of this code - this call have to be here. If you
|
||||
// try to simply put contents of following method right here, CUDA compiler
|
||||
// (version 3.2) will spit tons of nonsense messy errors ...
|
||||
// Hope they will not break it even more in future releases.
|
||||
FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then lanches the 9/7 FDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
|
||||
|
||||
// run kernel, possibly measure time and finally check the call
|
||||
PERF_BEGIN
|
||||
fdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
PERF_END(" FDWT97", sx, sy)
|
||||
CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
|
||||
/// @param in Input DWT coefficients. Should be normalized (in range
|
||||
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - format specified in common rules
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
|
||||
// select right width of kernel for the size of the image
|
||||
if(sizeX >= 960) {
|
||||
launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
|
||||
}
|
||||
|
||||
// if this was not the last level, continue recursively with other levels
|
||||
if(levels > 1) {
|
||||
// copy output's LL band back into input buffer
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
memCopy(in, out, llSizeX, llSizeY);
|
||||
|
||||
// run remaining levels of FDWT
|
||||
fdwt97(in, out, llSizeX, llSizeY, levels - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,483 @@
|
|||
///
|
||||
/// @file: io.h
|
||||
/// @brief Manages loading and saving lineary stored bands and input images.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 22:38
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#ifndef IO_H
|
||||
#define IO_H
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Base for all IO classes - manages mirroring.
|
||||
class DWTIO {
|
||||
protected:
|
||||
/// Handles mirroring of image at edges in a DWT correct way.
|
||||
/// @param d a position in the image (will be replaced by mirrored d)
|
||||
/// @param sizeD size of the image along the dimension of 'd'
|
||||
__device__ static void mirror(int & d, const int & sizeD) {
|
||||
// TODO: enable multiple mirroring:
|
||||
// if(sizeD > 1) {
|
||||
// if(d < 0) {
|
||||
// const int underflow = -1 - d;
|
||||
// const int phase = (underflow / (sizeD - 1)) & 1;
|
||||
// const int remainder = underflow % (sizeD - 1);
|
||||
// if(phase == 0) {
|
||||
// d = remainder + 1;
|
||||
// } else {
|
||||
// d = sizeD - 2 - remainder;
|
||||
// }
|
||||
// } else if(d >= sizeD) {
|
||||
// const int overflow = d - sizeD;
|
||||
// const int phase = (overflow / (sizeD - 1)) & 1;
|
||||
// const int remainder = overflow % (sizeD - 1);
|
||||
// if(phase == 0) {
|
||||
// d = sizeD - 2 - remainder;
|
||||
// } else {
|
||||
// d = remainder + 1;
|
||||
// }
|
||||
// }
|
||||
// } else {
|
||||
// d = 0;
|
||||
// }
|
||||
//for test the mirror's use Feb 17
|
||||
if(d >= sizeD) {
|
||||
d = 2 * sizeD - 2 - d;
|
||||
} else if(d < 0) {
|
||||
d = -d;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Base class for pixel loader and writer - manages computing start index,
|
||||
/// stride and end of image for loading column of pixels.
|
||||
/// @tparam T type of image pixels
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTPixelIO : protected DWTIO {
|
||||
protected:
|
||||
int end; ///< index of bottom neightbor of last pixel of column
|
||||
int stride; ///< increment of pointer to get to next pixel
|
||||
|
||||
/// Initializes pixel IO - sets end index and a position of first pixel.
|
||||
/// @param sizeX width of the image
|
||||
/// @param sizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to use
|
||||
/// @param firstY y-coordinate of first pixel to use
|
||||
/// @return index of pixel at position [x, y] in the image
|
||||
__device__ int initialize(const int sizeX, const int sizeY,
|
||||
int firstX, int firstY) {
|
||||
// initialize all pointers and stride
|
||||
end = CHECKED ? (sizeY * sizeX + firstX) : 0;
|
||||
stride = sizeX;
|
||||
return firstX + sizeX * firstY;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Writes reverse transformed pixels directly into output image.
|
||||
/// @tparam T type of output pixels
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
|
||||
private:
|
||||
int next; // index of the next pixel to be loaded
|
||||
|
||||
public:
|
||||
/// Initializes writer - sets output buffer and a position of first pixel.
|
||||
/// @param sizeX width of the image
|
||||
/// @param sizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to write into
|
||||
/// @param firstY y-coordinate of first pixel to write into
|
||||
__device__ void init(const int sizeX, const int sizeY,
|
||||
int firstX, int firstY) {
|
||||
if(firstX < sizeX) {
|
||||
next = this->initialize(sizeX, sizeY, firstX, firstY);
|
||||
} else {
|
||||
this->end = 0;
|
||||
this->stride = 0;
|
||||
next = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes given value at next position and advances internal pointer while
|
||||
/// correctly handling mirroring.
|
||||
/// @param output output image to write pixel into
|
||||
/// @param value value of the pixel to be written
|
||||
__device__ void writeInto(T * const output, const T & value) {
|
||||
if((!CHECKED) || (next != this->end)) {
|
||||
output[next] = value;
|
||||
next += this->stride;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Loads pixels from input image.
|
||||
/// @tparam T type of image input pixels
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTPixelLoader
|
||||
: protected VerticalDWTPixelIO<const T, CHECKED> {
|
||||
private:
|
||||
int last; ///< index of last loaded pixel
|
||||
public:
|
||||
|
||||
|
||||
//******************* FOR TEST **********************
|
||||
__device__ int getlast(){
|
||||
return last;
|
||||
}
|
||||
__device__ int getend(){
|
||||
return this->end;
|
||||
}
|
||||
__device__ int getstride(){
|
||||
return this->stride;
|
||||
}
|
||||
__device__ void setend(int a){
|
||||
this->end=a;
|
||||
}
|
||||
//******************* FOR TEST **********************
|
||||
|
||||
|
||||
|
||||
/// Initializes loader - sets input size and a position of first pixel.
|
||||
/// @param sizeX width of the image
|
||||
/// @param sizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to load
|
||||
/// @param firstY y-coordinate of first pixel to load
|
||||
__device__ void init(const int sizeX, const int sizeY,
|
||||
int firstX, int firstY) {
|
||||
// correctly mirror x coordinate
|
||||
this->mirror(firstX, sizeX);
|
||||
|
||||
// 'last' always points to already loaded pixel (subtract sizeX = stride)
|
||||
last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
|
||||
//last = (FirstX + sizeX * FirstY) - sizeX
|
||||
}
|
||||
|
||||
/// Sets all fields to zeros, for compiler not to complain about
|
||||
/// uninitialized stuff.
|
||||
__device__ void clear() {
|
||||
this->end = 0;
|
||||
this->stride = 0;
|
||||
this->last = 0;
|
||||
}
|
||||
|
||||
/// Gets another pixel and advancees internal pointer to following one.
|
||||
/// @param input input image to load next pixel from
|
||||
/// @return next pixel from given image
|
||||
__device__ T loadFrom(const T * const input) {
|
||||
last += this->stride;
|
||||
if(CHECKED && (last == this->end)) {
|
||||
last -= 2 * this->stride;
|
||||
this->stride = -this->stride; // reverse loader's direction
|
||||
}
|
||||
// avoid reading from negative indices if loader is checked
|
||||
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
|
||||
if(last < 0 ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return input[last];
|
||||
// return this->end;
|
||||
// return last;
|
||||
// return this->stride;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Base for band write and loader. Manages computing strides and pointers
|
||||
/// to first and last pixels in a linearly-stored-bands correct way.
|
||||
/// @tparam T type of band coefficients
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTBandIO : protected DWTIO {
|
||||
protected:
|
||||
/// index of bottom neighbor of last pixel of loaded column
|
||||
int end;
|
||||
|
||||
/// increment of index to get from highpass band to the lowpass one
|
||||
int strideHighToLow;
|
||||
|
||||
/// increment of index to get from the lowpass band to the highpass one
|
||||
int strideLowToHigh;
|
||||
|
||||
/// Initializes IO - sets size of image and a position of first pixel.
|
||||
/// @param imageSizeX width of the image
|
||||
/// @param imageSizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to use
|
||||
/// (Parity determines vertically low or high band.)
|
||||
/// @param firstY y-coordinate of first pixel to use
|
||||
/// (Parity determines horizontally low or high band.)
|
||||
/// @return index of first item specified by firstX and firstY
|
||||
__device__ int initialize(const int imageSizeX, const int imageSizeY,
|
||||
int firstX, int firstY) {
|
||||
// index of first pixel (topmost one) of the column with index firstX
|
||||
int columnOffset = firstX / 2;
|
||||
|
||||
// difference between indices of two vertically neighboring pixels
|
||||
// in the same band
|
||||
int verticalStride;
|
||||
|
||||
// resolve index of first pixel according to horizontal parity
|
||||
if(firstX & 1) {
|
||||
// first pixel in one of right bands
|
||||
verticalStride = imageSizeX / 2;
|
||||
columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
|
||||
strideLowToHigh = (imageSizeX * imageSizeY) / 2;
|
||||
} else {
|
||||
// first pixel in one of left bands
|
||||
verticalStride = imageSizeX / 2 + (imageSizeX & 1);
|
||||
strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
|
||||
}
|
||||
|
||||
// set the other stride
|
||||
strideHighToLow = verticalStride - strideLowToHigh;
|
||||
|
||||
// compute index of coefficient which indicates end of image
|
||||
if(CHECKED) {
|
||||
end = columnOffset // right column
|
||||
+ (imageSizeY / 2) * verticalStride // right row
|
||||
+ (imageSizeY & 1) * strideLowToHigh; // possibly in high band
|
||||
} else {
|
||||
end = 0;
|
||||
}
|
||||
|
||||
|
||||
//***********for test**************
|
||||
// end = CHECKED;
|
||||
//***********for test**************
|
||||
|
||||
|
||||
// finally, return index of the first item
|
||||
return columnOffset // right column
|
||||
+ (firstY / 2) * verticalStride // right row
|
||||
+ (firstY & 1) * strideLowToHigh; // possibly in high band
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/// Directly loads coefficients from four consecutively stored transformed
|
||||
/// bands.
|
||||
/// @tparam T type of input band coefficients
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
|
||||
private:
|
||||
int last; ///< index of last loaded pixel
|
||||
|
||||
/// Checks internal index and possibly reverses direction of loader.
|
||||
/// (Handles mirroring at the bottom of the image.)
|
||||
/// @param input input image to load next coefficient from
|
||||
/// @param stride stride to use now (one of two loader's strides)
|
||||
/// @return loaded coefficient
|
||||
__device__ T updateAndLoad(const T * const input, const int & stride) {
|
||||
last += stride;
|
||||
if(CHECKED && (last == this->end)) {
|
||||
// undo last two updates of index (to get to previous mirrored item)
|
||||
last -= (this->strideLowToHigh + this->strideHighToLow);
|
||||
|
||||
// swap and reverse strides (to move up in the loaded column now)
|
||||
const int temp = this->strideLowToHigh;
|
||||
this->strideLowToHigh = -this->strideHighToLow;
|
||||
this->strideHighToLow = -temp;
|
||||
}
|
||||
if(last < 0 ) {
|
||||
return 0;
|
||||
}
|
||||
// avoid reading from negative indices if loader is checked
|
||||
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
|
||||
return input[last];
|
||||
}
|
||||
public:
|
||||
|
||||
/// Initializes loader - sets input size and a position of first pixel.
|
||||
/// @param imageSizeX width of the image
|
||||
/// @param imageSizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to load
|
||||
/// (Parity determines vertically low or high band.)
|
||||
/// @param firstY y-coordinate of first pixel to load
|
||||
/// (Parity determines horizontally low or high band.)
|
||||
__device__ void init(const int imageSizeX, const int imageSizeY,
|
||||
int firstX, const int firstY) {
|
||||
this->mirror(firstX, imageSizeX);
|
||||
last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
|
||||
|
||||
// adjust to point to previous item
|
||||
last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
|
||||
}
|
||||
|
||||
/// Sets all fields to zeros, for compiler not to complain about
|
||||
/// uninitialized stuff.
|
||||
__device__ void clear() {
|
||||
this->end = 0;
|
||||
this->strideHighToLow = 0;
|
||||
this->strideLowToHigh = 0;
|
||||
this->last = 0;
|
||||
}
|
||||
|
||||
/// Gets another coefficient from lowpass band and advances internal index.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in high band.
|
||||
/// @param input input image to load next coefficient from
|
||||
/// @return next coefficient from the lowpass band of the given image
|
||||
__device__ T loadLowFrom(const T * const input) {
|
||||
return updateAndLoad(input, this->strideHighToLow);
|
||||
}
|
||||
|
||||
/// Gets another coefficient from the highpass band and advances index.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in high band.
|
||||
/// @param input input image to load next coefficient from
|
||||
/// @return next coefficient from the highbass band of the given image
|
||||
__device__ T loadHighFrom(const T * const input) {
|
||||
return updateAndLoad(input, this->strideLowToHigh);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/// Directly saves coefficients into four transformed bands.
|
||||
/// @tparam T type of output band coefficients
|
||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||
template <typename T, bool CHECKED>
|
||||
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
|
||||
private:
|
||||
int next; ///< index of last loaded pixel
|
||||
|
||||
/// Checks internal index and possibly stops the writer.
|
||||
/// (Handles mirroring at edges of the image.)
|
||||
/// @param output output buffer
|
||||
/// @param item item to put into the output
|
||||
/// @param stride increment of the pointer to get to next output index
|
||||
__device__ int saveAndUpdate(T * const output, const T & item,
|
||||
const int & stride) {
|
||||
// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ //test, Mar 20
|
||||
if((!CHECKED) || (next != this->end)) {
|
||||
// if(next == 4) {
|
||||
// printf(" next: %d stride: %d val: %f \n", next, stride, item );
|
||||
// }
|
||||
output[next] = item;
|
||||
next += stride;
|
||||
}
|
||||
// }
|
||||
// if((!CHECKED) || (next != this->end)) { //the real one
|
||||
// output[next] = item;
|
||||
// next += stride; //stride has been test
|
||||
// }
|
||||
return next;
|
||||
}
|
||||
public:
|
||||
|
||||
/// Initializes writer - sets output size and a position of first pixel.
|
||||
/// @param output output image
|
||||
/// @param imageSizeX width of the image
|
||||
/// @param imageSizeY height of the image
|
||||
/// @param firstX x-coordinate of first pixel to write
|
||||
/// (Parity determines vertically low or high band.)
|
||||
/// @param firstY y-coordinate of first pixel to write
|
||||
/// (Parity determines horizontally low or high band.)
|
||||
__device__ void init(const int imageSizeX, const int imageSizeY,
|
||||
const int firstX, const int firstY) {
|
||||
if (firstX < imageSizeX) {
|
||||
next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
|
||||
} else {
|
||||
clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets all fields to zeros, for compiler not to complain about
|
||||
/// uninitialized stuff.
|
||||
__device__ void clear() {
|
||||
this->end = 0;
|
||||
this->strideHighToLow = 0;
|
||||
this->strideLowToHigh = 0;
|
||||
this->next = 0;
|
||||
}
|
||||
|
||||
/// Writes another coefficient into the band which was specified using
|
||||
/// init's firstX and firstY parameters and advances internal pointer.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in lowpass band.
|
||||
/// @param output output image
|
||||
/// @param low lowpass coefficient to save into the lowpass band
|
||||
__device__ int writeLowInto(T * const output, const T & primary) {
|
||||
return saveAndUpdate(output, primary, this->strideLowToHigh);
|
||||
}
|
||||
|
||||
/// Writes another coefficient from the other band and advances pointer.
|
||||
/// Call this method first if position of first pixel passed to init
|
||||
/// was in highpass band.
|
||||
/// @param output output image
|
||||
/// @param high highpass coefficient to save into the highpass band
|
||||
__device__ int writeHighInto(T * const output, const T & other) {
|
||||
return saveAndUpdate(output, other, this->strideHighToLow);
|
||||
}
|
||||
|
||||
//*******Add three functions to get private values*******
|
||||
__device__ int getnext(){
|
||||
return next;
|
||||
}
|
||||
|
||||
__device__ int getend(){
|
||||
return this->end;
|
||||
}
|
||||
|
||||
__device__ int getstrideHighToLow(){
|
||||
return this->strideHighToLow;
|
||||
}
|
||||
|
||||
__device__ int getstrideLowToHigh(){
|
||||
return this->strideLowToHigh;
|
||||
}
|
||||
|
||||
//*******Add three functions to get private values*******
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
|
||||
#endif // IO_H
|
||||
|
|
@ -0,0 +1,360 @@
|
|||
///
|
||||
/// @file rdwt53.cu
|
||||
/// @brief CUDA implementation of reverse 5/3 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-04 14:19
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
|
||||
/// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT
|
||||
/// using sliding window and lifting schema.
|
||||
/// @tparam WIN_SIZE_X width of sliding window
|
||||
/// @tparam WIN_SIZE_Y height of sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class RDWT53 {
|
||||
private:
|
||||
|
||||
/// Shared memory buffer used for 5/3 DWT transforms.
|
||||
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
|
||||
|
||||
/// Shared buffer used for reverse 5/3 DWT.
|
||||
RDWT53Buffer buffer;
|
||||
|
||||
/// Difference between indices of two vertically neighboring items in buffer.
|
||||
enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// Info needed for loading of one input column from input image.
|
||||
/// @tparam CHECKED true if loader should check boundaries
|
||||
template <bool CHECKED>
|
||||
struct RDWT53Column {
|
||||
/// loader of pixels from column in input image
|
||||
VerticalDWTBandLoader<int, CHECKED> loader;
|
||||
|
||||
/// Offset of corresponding column in shared buffer.
|
||||
int offset;
|
||||
|
||||
/// Sets all fields to some values to avoid 'uninitialized' warnings.
|
||||
__device__ void clear() {
|
||||
offset = 0;
|
||||
loader.clear();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// 5/3 DWT reverse update operation.
|
||||
struct Reverse53Update {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
c -= (p + n + 2) / 4; // F.3, page 118, ITU-T Rec. T.800 final draft
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// 5/3 DWT reverse predict operation.
|
||||
struct Reverse53Predict {
|
||||
__device__ void operator() (const int p, int & c, const int n) const {
|
||||
c += (p + n) / 2; // F.4, page 118, ITU-T Rec. T.800 final draft
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Horizontal 5/3 RDWT on specified lines of transform buffer.
|
||||
/// @param lines number of lines to be transformed
|
||||
/// @param firstLine index of the first line to be transformed
|
||||
__device__ void horizontalTransform(const int lines, const int firstLine) {
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update());
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict());
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
/// Using given loader, it loads another WIN_SIZE_Y coefficients
|
||||
/// into specified column.
|
||||
/// @tparam CHECKED true if loader should check image boundaries
|
||||
/// @param input input coefficients to load from
|
||||
/// @param col info about loaded column
|
||||
template <bool CHECKED>
|
||||
inline __device__ void loadWindowIntoColumn(const int * const input,
|
||||
RDWT53Column<CHECKED> & col) {
|
||||
for(int i = 3; i < (3 + WIN_SIZE_Y); i += 2) {
|
||||
buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
|
||||
buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Initializes one column of shared transform buffer with 7 input pixels.
|
||||
/// Those 7 pixels will not be transformed. Also initializes given loader.
|
||||
/// @tparam CHECKED true if loader should check image boundaries
|
||||
/// @param columnX x coordinate of column in shared transform buffer
|
||||
/// @param input input image
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param loader (uninitialized) info about loaded column
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(const int columnX, const int * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
RDWT53Column<CHECKED> & column,
|
||||
const int firstY) {
|
||||
// coordinates of the first coefficient to be loaded
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + columnX;
|
||||
|
||||
// offset of the column with index 'colIndex' in the transform buffer
|
||||
column.offset = buffer.getColumnOffset(columnX);
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 3 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 0 * STRIDE] =
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 1);
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
}
|
||||
// Now, the next coefficient, which will be loaded by loader, is #2.
|
||||
}
|
||||
|
||||
|
||||
/// Actual GPU 5/3 RDWT implementation.
|
||||
/// @tparam CHECKED_LOADS true if boundaries must be checked when reading
|
||||
/// @tparam CHECKED_WRITES true if boundaries must be checked when writing
|
||||
/// @param in input image (5/3 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template<bool CHECKED_LOADS, bool CHECKED_WRITES>
|
||||
__device__ void transform(const int * const in, int * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about one main and one boundary column
|
||||
RDWT53Column<CHECKED_LOADS> column, boundaryColumn;
|
||||
|
||||
// index of first row to be transformed
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
|
||||
// some threads initialize boundary columns
|
||||
boundaryColumn.clear();
|
||||
if(threadIdx.x < 3) {
|
||||
// First 3 threads also handle boundary columns. Thread #0 gets right
|
||||
// column #0, thread #1 get right column #1 and thread #2 left column.
|
||||
const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
|
||||
|
||||
// Thread initializes offset of the boundary column (in shared
|
||||
// buffer), first 3 pixels of the column and a loader for this column.
|
||||
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
|
||||
}
|
||||
|
||||
// All threads initialize central columns.
|
||||
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
|
||||
|
||||
// horizontally transform first 3 rows
|
||||
horizontalTransform(3, 0);
|
||||
|
||||
// writer of output pixels - initialize it
|
||||
const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
|
||||
VerticalDWTPixelWriter<int, CHECKED_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, outX, firstY);
|
||||
|
||||
// offset of column (in transform buffer) saved by this thread
|
||||
const int outputColumnOffset = buffer.getColumnOffset(threadIdx.x);
|
||||
|
||||
// (Each iteration assumes that first 3 rows of transform buffer are
|
||||
// already loaded with horizontally transformed pixels.)
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
// Load another WIN_SIZE_Y lines of this thread's column
|
||||
// into the transform buffer.
|
||||
loadWindowIntoColumn(in, column);
|
||||
|
||||
// possibly load boundary columns
|
||||
if(threadIdx.x < 3) {
|
||||
loadWindowIntoColumn(in, boundaryColumn);
|
||||
}
|
||||
|
||||
// horizontally transform all newly loaded lines
|
||||
horizontalTransform(WIN_SIZE_Y, 3);
|
||||
|
||||
// Using 3 registers, remember current values of last 3 rows
|
||||
// of transform buffer. These rows are transformed horizontally
|
||||
// only and will be used in next iteration.
|
||||
int last3Lines[3];
|
||||
last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
|
||||
last3Lines[1] = buffer[outputColumnOffset + (WIN_SIZE_Y + 1) * STRIDE];
|
||||
last3Lines[2] = buffer[outputColumnOffset + (WIN_SIZE_Y + 2) * STRIDE];
|
||||
|
||||
// vertically transform all central columns
|
||||
buffer.forEachVerticalOdd(outputColumnOffset, Reverse53Update());
|
||||
buffer.forEachVerticalEven(outputColumnOffset, Reverse53Predict());
|
||||
|
||||
// Save all results of current window. Results are in transform buffer
|
||||
// at rows from #1 to #(1 + WIN_SIZE_Y). Other rows are invalid now.
|
||||
// (They only served as a boundary for vertical RDWT.)
|
||||
for(int i = 1; i < (1 + WIN_SIZE_Y); i++) {
|
||||
writer.writeInto(out, buffer[outputColumnOffset + i * STRIDE]);
|
||||
}
|
||||
|
||||
// Use last 3 remembered lines as first 3 lines for next iteration.
|
||||
// As expected, these lines are already horizontally transformed.
|
||||
buffer[outputColumnOffset + 0 * STRIDE] = last3Lines[0];
|
||||
buffer[outputColumnOffset + 1 * STRIDE] = last3Lines[1];
|
||||
buffer[outputColumnOffset + 2 * STRIDE] = last3Lines[2];
|
||||
|
||||
// Wait for all writing threads before proceeding to loading new
|
||||
// coeficients in next iteration. (Not to overwrite those which
|
||||
// are not written yet.)
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Main GPU 5/3 RDWT entry point.
|
||||
/// @param in input image (5/3 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of sliding window steps
|
||||
__device__ static void run(const int * const input, int * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// prepare instance with buffer in shared memory
|
||||
__shared__ RDWT53<WIN_SIZE_X, WIN_SIZE_Y> rdwt53;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (1 in next expressions is for radius of impulse response of 5/3 RDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
|
||||
const bool atRightBoudary = maxX >= sx;
|
||||
const bool atBottomBoudary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
if(atBottomBoudary) {
|
||||
// near bottom boundary => check both writing and reading
|
||||
rdwt53.transform<true, true>(input, output, sx, sy, steps);
|
||||
} else if(atRightBoudary) {
|
||||
// near right boundary only => check writing only
|
||||
rdwt53.transform<false, true>(input, output, sx, sy, steps);
|
||||
} else {
|
||||
// no nearby boundary => check nothing
|
||||
rdwt53.transform<false, false>(input, output, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
|
||||
}; // end of class RDWT53
|
||||
|
||||
|
||||
|
||||
/// Main GPU 5/3 RDWT entry point.
|
||||
/// @param in input image (5/3 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of sliding window steps
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void rdwt53Kernel(const int * const in, int * const out,
|
||||
const int sx, const int sy, const int steps) {
|
||||
RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then lanches the 5/3 RDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
|
||||
// finally transform this level
|
||||
PERF_BEGIN
|
||||
rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
PERF_END(" RDWT53", sx, sy)
|
||||
CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-128, 127].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
|
||||
if(levels > 1) {
|
||||
// let this function recursively reverse transform deeper levels first
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
rdwt53(in, out, llSizeX, llSizeY, levels - 1);
|
||||
|
||||
// copy reverse transformed LL band from output back into the input
|
||||
memCopy(in, out, llSizeX, llSizeY);
|
||||
}
|
||||
|
||||
// select right width of kernel for the size of the image
|
||||
if(sizeX >= 960) {
|
||||
launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,363 @@
|
|||
///
|
||||
/// @file rdwt97.cu
|
||||
/// @brief CUDA implementation of reverse 9/7 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-03 21:59
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Wraps shared memory buffer and methods for computing 9/7 RDWT using
|
||||
/// lifting schema and sliding window.
|
||||
/// @tparam WIN_SIZE_X width of the sliding window
|
||||
/// @tparam WIN_SIZE_Y height of the sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class RDWT97 {
|
||||
private:
|
||||
|
||||
/// Info related to loading of one input column.
|
||||
/// @tparam CHECKED true if boundary chould be checked,
|
||||
/// false if there is no near boudnary
|
||||
template <bool CHECKED>
|
||||
struct RDWT97Column {
|
||||
/// laoder of input pxels for given column.
|
||||
VerticalDWTBandLoader<float, CHECKED> loader;
|
||||
|
||||
/// Offset of loaded column in shared memory buffer.
|
||||
int offset;
|
||||
|
||||
/// Sets all fields to some values to avoid 'uninitialized' warnings.
|
||||
__device__ void clear() {
|
||||
loader.clear();
|
||||
offset = 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Shared memory buffer used for 9/7 DWT transforms.
|
||||
typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> RDWT97Buffer;
|
||||
|
||||
/// Shared buffer used for reverse 9/7 DWT.
|
||||
RDWT97Buffer buffer;
|
||||
|
||||
/// Difference between indices of two vertical neighbors in buffer.
|
||||
enum { STRIDE = RDWT97Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// Horizontal 9/7 RDWT on specified lines of transform buffer.
|
||||
/// @param lines number of lines to be transformed
|
||||
/// @param firstLine index of the first line to be transformed
|
||||
__device__ void horizontalRDWT97(int lines, int firstLine) {
|
||||
__syncthreads();
|
||||
buffer.scaleHorizontal(scale97Mul, scale97Div, firstLine, lines);
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update2));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97predict2));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update1));
|
||||
__syncthreads();
|
||||
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97Predict1));
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
/// Initializes one column of shared transform buffer with 7 input pixels.
|
||||
/// Those 7 pixels will not be transformed. Also initializes given loader.
|
||||
/// @tparam CHECKED true if there are near image boundaries
|
||||
/// @param colIndex index of column in shared transform buffer
|
||||
/// @param input input image
|
||||
/// @param sizeX width of the input image
|
||||
/// @param sizeY height of the input image
|
||||
/// @param column (uninitialized) info about loading one column
|
||||
/// @param firstY index of first image row to be transformed
|
||||
template <bool CHECKED>
|
||||
__device__ void initColumn(const int colIndex, const float * const input,
|
||||
const int sizeX, const int sizeY,
|
||||
RDWT97Column<CHECKED> & column,
|
||||
const int firstY) {
|
||||
// coordinates of the first coefficient to be loaded
|
||||
const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
|
||||
|
||||
// offset of the column with index 'colIndex' in the transform buffer
|
||||
column.offset = buffer.getColumnOffset(colIndex);
|
||||
|
||||
if(blockIdx.y == 0) {
|
||||
// topmost block - apply mirroring rules when loading first 7 rows
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY);
|
||||
|
||||
// load pixels in mirrored way
|
||||
buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 4 * STRIDE] =
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 5 * STRIDE] =
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 6 * STRIDE] =
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
} else {
|
||||
// non-topmost row - regular loading:
|
||||
column.loader.init(sizeX, sizeY, firstX, firstY - 3);
|
||||
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 4 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
buffer[column.offset + 5 * STRIDE] = column.loader.loadLowFrom(input);
|
||||
buffer[column.offset + 6 * STRIDE] = column.loader.loadHighFrom(input);
|
||||
}
|
||||
// Now, the next coefficient, which will be loaded by loader, is #4.
|
||||
}
|
||||
|
||||
|
||||
/// Using given loader, it loads another WIN_SIZE_Y coefficients
|
||||
/// into specified column.
|
||||
/// @tparam CHECKED true if there are near image boundaries
|
||||
/// @param col info about loaded column
|
||||
/// @param input buffer with input coefficients
|
||||
template <bool CHECKED>
|
||||
inline __device__ void loadWindowIntoColumn(RDWT97Column<CHECKED> & col,
|
||||
const float * const input) {
|
||||
for(int i = 7; i < (7 + WIN_SIZE_Y); i += 2) {
|
||||
buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
|
||||
buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Actual GPU 9/7 RDWT sliding window lifting schema implementation.
|
||||
/// @tparam CHECKED_LOADS true if loader should check boundaries
|
||||
/// @tparam CHECKED_WRITES true if boundaries should be taken into account
|
||||
/// when writing into output buffer
|
||||
/// @param in input image (9/7 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of steps of sliding window
|
||||
template <bool CHECKED_LOADS, bool CHECKED_WRITES>
|
||||
__device__ void transform(const float * const in, float * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about one main column and one boundary column
|
||||
RDWT97Column<CHECKED_LOADS> column;
|
||||
RDWT97Column<CHECKED_LOADS> boundaryColumn;
|
||||
|
||||
// index of first image row to be transformed
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
|
||||
// initialize boundary columns
|
||||
boundaryColumn.clear();
|
||||
if(threadIdx.x < 7) {
|
||||
// each thread among first 7 ones gets index of one of boundary columns
|
||||
const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
|
||||
|
||||
// Thread initializes offset of the boundary column (in shared
|
||||
// buffer), first 7 pixels of the column and a loader for this column.
|
||||
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
|
||||
}
|
||||
|
||||
// All threads initialize central columns.
|
||||
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
|
||||
|
||||
// horizontally transform first 7 rows
|
||||
horizontalRDWT97(7, 0);
|
||||
|
||||
// writer of output pixels - initialize it
|
||||
const int outputX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
|
||||
VerticalDWTPixelWriter<float, CHECKED_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, outputX, firstY);
|
||||
|
||||
// offset of column (in transform buffer) saved by this thread
|
||||
const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
|
||||
|
||||
// (Each iteration assumes that first 7 rows of transform buffer are
|
||||
// already loaded with horizontally transformed pixels.)
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
// Load another WIN_SIZE_Y lines of this thread's column
|
||||
// into the transform buffer.
|
||||
loadWindowIntoColumn(column, in);
|
||||
|
||||
// possibly load boundary columns
|
||||
if(threadIdx.x < 7) {
|
||||
loadWindowIntoColumn(boundaryColumn, in);
|
||||
}
|
||||
|
||||
// horizontally transform all newly loaded lines
|
||||
horizontalRDWT97(WIN_SIZE_Y, 7);
|
||||
|
||||
// Using 7 registers, remember current values of last 7 rows
|
||||
// of transform buffer. These rows are transformed horizontally
|
||||
// only and will be used in next iteration.
|
||||
float last7Lines[7];
|
||||
for(int i = 0; i < 7; i++) {
|
||||
last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
|
||||
}
|
||||
|
||||
// vertically transform all central columns
|
||||
buffer.scaleVertical(scale97Div, scale97Mul, outColumnOffset,
|
||||
WIN_SIZE_Y + 7, 0);
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update2));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97predict2));
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update1));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97Predict1));
|
||||
|
||||
// Save all results of current window. Results are in transform buffer
|
||||
// at rows from #3 to #(3 + WIN_SIZE_Y). Other rows are invalid now.
|
||||
// (They only served as a boundary for vertical RDWT.)
|
||||
for(int i = 3; i < (3 + WIN_SIZE_Y); i++) {
|
||||
writer.writeInto(out, buffer[outColumnOffset + i * STRIDE]);
|
||||
}
|
||||
|
||||
// Use last 7 remembered lines as first 7 lines for next iteration.
|
||||
// As expected, these lines are already horizontally transformed.
|
||||
for(int i = 0; i < 7; i++) {
|
||||
buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
|
||||
}
|
||||
|
||||
// Wait for all writing threads before proceeding to loading new
|
||||
// coeficients in next iteration. (Not to overwrite those which
|
||||
// are not written yet.)
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Main GPU 9/7 RDWT entry point.
|
||||
/// @param in input image (9/7 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
__device__ static void run(const float * const input, float * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// prepare instance with buffer in shared memory
|
||||
__shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;
|
||||
|
||||
// Compute limits of this threadblock's block of pixels and use them to
|
||||
// determine, whether this threadblock will have to deal with boundary.
|
||||
// (3 in next expressions is for radius of impulse response of 9/7 RDWT.)
|
||||
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
|
||||
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
|
||||
const bool atRightBoudary = maxX >= sx;
|
||||
const bool atBottomBoudary = maxY >= sy;
|
||||
|
||||
// Select specialized version of code according to distance of this
|
||||
// threadblock's pixels from image boundary.
|
||||
if(atBottomBoudary) {
|
||||
// near bottom boundary => check both writing and reading
|
||||
rdwt97.transform<true, true>(input, output, sx, sy, steps);
|
||||
} else if(atRightBoudary) {
|
||||
// near right boundary only => check writing only
|
||||
rdwt97.transform<false, true>(input, output, sx, sy, steps);
|
||||
} else {
|
||||
// no nearby boundary => check nothing
|
||||
rdwt97.transform<false, false>(input, output, sx, sy, steps);
|
||||
}
|
||||
}
|
||||
|
||||
}; // end of class RDWT97
|
||||
|
||||
|
||||
|
||||
/// Main GPU 9/7 RDWT entry point.
|
||||
/// @param in input image (9/7 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void rdwt97Kernel(const float * const in, float * const out,
|
||||
const int sx, const int sy, const int steps) {
|
||||
RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Only computes optimal number of sliding window steps,
|
||||
/// number of threadblocks and then lanches the 9/7 RDWT kernel.
|
||||
/// @tparam WIN_SX width of sliding window
|
||||
/// @tparam WIN_SY height of sliding window
|
||||
/// @param in input image
|
||||
/// @param out output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
|
||||
// compute optimal number of steps of each sliding window
|
||||
const int steps = divRndUp(sy, 15 * WIN_SY);
|
||||
|
||||
// prepare grid size
|
||||
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
||||
|
||||
// finally launch kernel
|
||||
PERF_BEGIN
|
||||
rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
||||
PERF_END(" RDWT97", sx, sy)
|
||||
CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-0.5, 0.5].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
|
||||
if(levels > 1) {
|
||||
// let this function recursively reverse transform deeper levels first
|
||||
const int llSizeX = divRndUp(sizeX, 2);
|
||||
const int llSizeY = divRndUp(sizeY, 2);
|
||||
rdwt97(in, out, llSizeX, llSizeY, levels - 1);
|
||||
|
||||
// copy reverse transformed LL band from output back into the input
|
||||
memCopy(in, out, llSizeX, llSizeY);
|
||||
}
|
||||
|
||||
// select right width of kernel for the size of the image
|
||||
if(sizeX >= 960) {
|
||||
launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
|
||||
} else if (sizeX >= 480) {
|
||||
launchRDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
|
||||
} else {
|
||||
launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -0,0 +1,373 @@
|
|||
/// line 248 the index
|
||||
/// @file transform_buffer.h
|
||||
/// @brief Buffer with separated even and odd columns and related algorithms.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 18:33
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#ifndef TRANSFORM_BUFFER_H
|
||||
#define TRANSFORM_BUFFER_H
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Buffer (in shared memory of GPU) where block of input image is stored,
|
||||
/// but odd and even lines are separated. (Generates less bank conflicts when
|
||||
/// using lifting schema.) All operations expect SIZE_X threads.
|
||||
/// Also implements basic building blocks of lifting schema.
|
||||
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
|
||||
/// a number of threads participating on all operations.)
|
||||
/// Must be divisible by 4.
|
||||
/// @tparam SIZE_Y height of buffer (total number of lines)
|
||||
/// @tparam BOUNDARY_X number of extra pixels at the left and right side
|
||||
/// boundary is expected to be smaller than half SIZE_X
|
||||
/// Must be divisible by 2.
|
||||
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
|
||||
class TransformBuffer {
|
||||
public:
|
||||
enum {
|
||||
/// difference between pointers to two vertical neigbors
|
||||
VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
|
||||
};
|
||||
|
||||
private:
|
||||
enum {
|
||||
/// number of shared memory banks - needed for correct padding
|
||||
#ifdef __CUDA_ARCH__
|
||||
SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
|
||||
#else
|
||||
SHM_BANKS = 16, // for host code only - can be anything, won't be used
|
||||
#endif
|
||||
|
||||
/// size of one of two buffers (odd or even)
|
||||
BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
|
||||
|
||||
/// unused space between two buffers
|
||||
PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
|
||||
|
||||
/// offset of the odd columns buffer from the beginning of data buffer
|
||||
ODD_OFFSET = BUFFER_SIZE + PADDING,
|
||||
};
|
||||
|
||||
/// buffer for both even and odd columns
|
||||
T data[2 * BUFFER_SIZE + PADDING];
|
||||
|
||||
|
||||
|
||||
/// Applies specified function to all central elements while also passing
|
||||
/// previous and next elements as parameters.
|
||||
/// @param count count of central elements to apply function to
|
||||
/// @param prevOffset offset of first central element
|
||||
/// @param midOffset offset of first central element's predecessor
|
||||
/// @param nextOffset offset of first central element's successor
|
||||
/// @param function the function itself
|
||||
template <typename FUNC>
|
||||
__device__ void horizontalStep(const int count, const int prevOffset,
|
||||
const int midOffset, const int nextOffset,
|
||||
const FUNC & function) {
|
||||
// number of unchecked iterations
|
||||
const int STEPS = count / SIZE_X;
|
||||
|
||||
// items remaining after last unchecked iteration
|
||||
const int finalCount = count % SIZE_X;
|
||||
|
||||
// offset of items processed in last (checked) iteration
|
||||
const int finalOffset = count - finalCount;
|
||||
|
||||
// all threads perform fixed number of iterations ...
|
||||
for(int i = 0; i < STEPS; i++) {
|
||||
// for(int i = 0; i < 3; i++) {
|
||||
const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
|
||||
const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
|
||||
T & center = data[midOffset + i * SIZE_X + threadIdx.x];
|
||||
// function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
|
||||
function(previous, center, next);// the real one
|
||||
}
|
||||
|
||||
// ... but not all threads participate on final iteration
|
||||
if(threadIdx.x < finalCount) {
|
||||
const T previous = data[prevOffset + finalOffset + threadIdx.x];
|
||||
const T next = data[nextOffset + finalOffset + threadIdx.x];
|
||||
T & center = data[midOffset + finalOffset + threadIdx.x];
|
||||
// function(previous, center, (nextOffset+finalOffset+threadIdx.x));
|
||||
// kaixi
|
||||
function(previous, center, next);//the real one
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
__device__ void getPrintData() {
|
||||
//
|
||||
for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {
|
||||
printf(" index: %d data: %f \n ", i ,data[i]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Gets offset of the column with given index. Central columns have
|
||||
/// indices from 0 to NUM_LINES - 1, left boundary columns have negative
|
||||
/// indices and right boundary columns indices start with NUM_LINES.
|
||||
/// @param columnIndex index of column to get pointer to
|
||||
/// @return offset of the first item of column with specified index
|
||||
__device__ int getColumnOffset(int columnIndex) {
|
||||
columnIndex += BOUNDARY_X; // skip boundary
|
||||
return columnIndex / 2 // select right column
|
||||
+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
|
||||
}
|
||||
|
||||
|
||||
/// Provides access to data of the transform buffer.
|
||||
/// @param index index of the item to work with
|
||||
/// @return reference to item at given index
|
||||
__device__ T & operator[] (const int index) {
|
||||
return data[index];
|
||||
}
|
||||
|
||||
|
||||
/// Applies specified function to all horizontally even elements in
|
||||
/// specified lines. (Including even elements in boundaries except
|
||||
/// first even element in first left boundary.) SIZE_X threads participate
|
||||
/// and synchronization is needed before result can be used.
|
||||
/// @param firstLine index of first line
|
||||
/// @param numLines count of lines
|
||||
/// @param func function to be applied on all even elements
|
||||
/// parameters: previous (odd) element, the even
|
||||
/// element itself and finally next (odd) element
|
||||
template <typename FUNC>
|
||||
__device__ void forEachHorizontalEven(const int firstLine,
|
||||
const int numLines,
|
||||
const FUNC & func) {
|
||||
// number of even elemens to apply function to
|
||||
const int count = numLines * VERTICAL_STRIDE - 1;
|
||||
// offset of first even element
|
||||
const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
|
||||
// offset of odd predecessor of first even element
|
||||
const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
|
||||
// offset of odd successor of first even element
|
||||
const int nextOffset = prevOffset + 1;
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
|
||||
// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
||||
// }
|
||||
|
||||
// call generic horizontal step function
|
||||
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
||||
}
|
||||
|
||||
|
||||
/// Applies given function to all horizontally odd elements in specified
|
||||
/// lines. (Including odd elements in boundaries except last odd element
|
||||
/// in last right boundary.) SIZE_X threads participate and synchronization
|
||||
/// is needed before result can be used.
|
||||
/// @param firstLine index of first line
|
||||
/// @param numLines count of lines
|
||||
/// @param func function to be applied on all odd elements
|
||||
/// parameters: previous (even) element, the odd
|
||||
/// element itself and finally next (even) element
|
||||
template <typename FUNC>
|
||||
__device__ void forEachHorizontalOdd(const int firstLine,
|
||||
const int numLines,
|
||||
const FUNC & func) {
|
||||
// numbet of odd elements to apply function to
|
||||
const int count = numLines * VERTICAL_STRIDE - 1;
|
||||
// offset of even predecessor of first odd element
|
||||
const int prevOffset = firstLine * VERTICAL_STRIDE;
|
||||
// offset of first odd element
|
||||
const int centerOffset = prevOffset + ODD_OFFSET;
|
||||
// offset of even successor of first odd element
|
||||
const int nextOffset = prevOffset + 1;
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
||||
// }
|
||||
|
||||
|
||||
// call generic horizontal step function
|
||||
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
||||
}
|
||||
|
||||
|
||||
/// Applies specified function to all even elements (except element #0)
|
||||
/// of given column. Each thread takes care of one column, so there's
|
||||
/// no need for synchronization.
|
||||
/// @param columnOffset offset of thread's column
|
||||
/// @param f function to be applied on all even elements
|
||||
/// parameters: previous (odd) element, the even
|
||||
/// element itself and finally next (odd) element
|
||||
template <typename F>
|
||||
__device__ void forEachVerticalEven(const int columnOffset, const F & f) {
|
||||
if(SIZE_Y > 3) { // makes no sense otherwise
|
||||
const int steps = SIZE_Y / 2 - 1;
|
||||
for(int i = 0; i < steps; i++) {
|
||||
const int row = 2 + i * 2;
|
||||
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
||||
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
||||
f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next);
|
||||
|
||||
//--------------- FOR TEST -----------------
|
||||
/* __syncthreads();
|
||||
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
||||
diffOut[2500]++;
|
||||
diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
|
||||
}
|
||||
__syncthreads();
|
||||
*/ //--------------- FOR TEST -----------------
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Applies specified function to all odd elements of given column.
|
||||
/// Each thread takes care of one column, so there's no need for
|
||||
/// synchronization.
|
||||
/// @param columnOffset offset of thread's column
|
||||
/// @param f function to be applied on all odd elements
|
||||
/// parameters: previous (even) element, the odd
|
||||
/// element itself and finally next (even) element
|
||||
template <typename F>
|
||||
__device__ void forEachVerticalOdd(const int columnOffset, const F & f) {
|
||||
const int steps = (SIZE_Y - 1) / 2;
|
||||
for(int i = 0; i < steps; i++) {
|
||||
const int row = i * 2 + 1;
|
||||
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
||||
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
||||
|
||||
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
|
||||
|
||||
|
||||
//--------------- FOR TEST -----------------
|
||||
/* __syncthreads();
|
||||
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
||||
diffOut[2500]++;
|
||||
diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
*/ //--------------- FOR TEST -----------------
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Scales elements at specified lines.
|
||||
/// @param evenScale scaling factor for horizontally even elements
|
||||
/// @param oddScale scaling factor for horizontally odd elements
|
||||
/// @param numLines number of lines, whose elements should be scaled
|
||||
/// @param firstLine index of first line to scale elements in
|
||||
__device__ void scaleHorizontal(const T evenScale, const T oddScale,
|
||||
const int firstLine, const int numLines) {
|
||||
const int offset = firstLine * VERTICAL_STRIDE;
|
||||
const int count = numLines * VERTICAL_STRIDE;
|
||||
const int steps = count / SIZE_X;
|
||||
const int finalCount = count % SIZE_X;
|
||||
const int finalOffset = count - finalCount;
|
||||
|
||||
// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
|
||||
|
||||
// run iterations, whete all threads participate
|
||||
for(int i = 0; i < steps; i++) {
|
||||
data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
|
||||
// if(threadIdx.x + i * SIZE_X + offset == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
// if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
|
||||
}
|
||||
|
||||
// some threads also finish remaining unscaled items
|
||||
if(threadIdx.x < finalCount) {
|
||||
data[threadIdx.x + finalOffset + offset] *= evenScale;
|
||||
// if(threadIdx.x + finalOffset + offset == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
// if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
|
||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||
// }
|
||||
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Scales elements in specified column.
|
||||
/// @param evenScale scaling factor for vertically even elements
|
||||
/// @param oddScale scaling factor for vertically odd elements
|
||||
/// @param columnOffset offset of the column to work with
|
||||
/// @param numLines number of lines, whose elements should be scaled
|
||||
/// @param firstLine index of first line to scale elements in
|
||||
__device__ void scaleVertical(const T evenScale, const T oddScale,
|
||||
const int columnOffset, const int numLines,
|
||||
const int firstLine) {
|
||||
for(int i = firstLine; i < (numLines + firstLine); i++) {
|
||||
if(i & 1) {
|
||||
data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
|
||||
} else {
|
||||
data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//****************For Test(Feb23), test inter parameters*************
|
||||
__device__ int getVERTICAL_STRIDE(){
|
||||
return VERTICAL_STRIDE;
|
||||
}
|
||||
__device__ int getSHM_BANKS(){
|
||||
return SHM_BANKS;
|
||||
}
|
||||
__device__ int getBuffersize(){
|
||||
return BUFFER_SIZE;
|
||||
}
|
||||
__device__ int getPADDING(){
|
||||
return PADDING;
|
||||
}
|
||||
__device__ int getODD_OFFSET(){
|
||||
return ODD_OFFSET;
|
||||
}
|
||||
|
||||
|
||||
//****************For Test(Feb23), test inter parameters*************
|
||||
|
||||
|
||||
}; // end of class TransformBuffer
|
||||
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
|
||||
#endif // TRANSFORM_BUFFER_H
|
||||
|
|
@ -0,0 +1,401 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <sys/time.h>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "components.h"
|
||||
#include "dwt.h"
|
||||
|
||||
struct dwt {
|
||||
char * srcFilename;
|
||||
char * outFilename;
|
||||
unsigned char *srcImg;
|
||||
int pixWidth;
|
||||
int pixHeight;
|
||||
int components;
|
||||
int dwtLvls;
|
||||
};
|
||||
|
||||
int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
|
||||
{
|
||||
// printf("Loading ipnput: %s\n", srcFilename);
|
||||
char *path = "../../data/dwt2d/";
|
||||
char *newSrc = NULL;
|
||||
|
||||
if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
|
||||
{
|
||||
newSrc[0] = '\0';
|
||||
strcat(newSrc, path);
|
||||
strcat(newSrc, srcFilename);
|
||||
srcFilename= newSrc;
|
||||
}
|
||||
printf("Loading ipnput: %s\n", srcFilename);
|
||||
|
||||
//srcFilename = strcat("../../data/dwt2d/",srcFilename);
|
||||
//read image
|
||||
int i = open(srcFilename, O_RDONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", srcFilename);
|
||||
return -1;
|
||||
}
|
||||
int ret = read(i, srcImg, inputSize);
|
||||
printf("precteno %d, inputsize %d\n", ret, inputSize);
|
||||
close(i);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void usage() {
|
||||
printf("dwt [otpions] src_img.rgb <out_img.dwt>\n\
|
||||
-d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n\
|
||||
-c, --components\t\tnumber of color components, default 3\n\
|
||||
-b, --depth\t\t\tbit depth, default 8\n\
|
||||
-l, --level\t\t\tDWT level, default 3\n\
|
||||
-D, --device\t\t\tcuda device\n\
|
||||
-f, --forward\t\t\tforward transform\n\
|
||||
-r, --reverse\t\t\treverse transform\n\
|
||||
-9, --97\t\t\t9/7 transform\n\
|
||||
-5, --53\t\t\t5/3 transform\n\
|
||||
-w --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void processDWT(struct dwt *d, int forward, int writeVisual)
|
||||
{
|
||||
int componentSize = d->pixWidth*d->pixHeight*sizeof(T);
|
||||
|
||||
T *c_r_out, *backup ;
|
||||
cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_r_out, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&backup, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(backup, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
if (d->components == 3) {
|
||||
/* Alloc two more buffers for G and B */
|
||||
T *c_g_out, *c_b_out;
|
||||
cudaMalloc((void**)&c_g_out, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_g_out, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_b_out, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
/* Load components */
|
||||
T *c_r, *c_g, *c_b;
|
||||
cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_r, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&c_g, componentSize); //< G, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_g, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
cudaMalloc((void**)&c_b, componentSize); //< B, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_b, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);
|
||||
|
||||
|
||||
/* Compute DWT and always store into file */
|
||||
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
|
||||
// -------test----------
|
||||
// T *h_r_out=(T*)malloc(componentSize);
|
||||
// cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost);
|
||||
// int ii;
|
||||
// for(ii=0;ii<componentSize/sizeof(T);ii++) {
|
||||
// fprintf(stderr, "%d ", h_r_out[ii]);
|
||||
// if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
|
||||
// }
|
||||
// -------test----------
|
||||
|
||||
|
||||
/* Store DWT to file */
|
||||
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
|
||||
// writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
|
||||
// writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
|
||||
#ifdef OUTPUT
|
||||
if (writeVisual) {
|
||||
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
|
||||
writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
|
||||
writeNStage2DDWT(c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
|
||||
} else {
|
||||
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
|
||||
writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
|
||||
writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
cudaFree(c_r);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_g);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_b);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_g_out);
|
||||
cudaCheckError("Cuda free");
|
||||
cudaFree(c_b_out);
|
||||
cudaCheckError("Cuda free");
|
||||
|
||||
}
|
||||
else if (d->components == 1) {
|
||||
//Load component
|
||||
T *c_r;
|
||||
cudaMalloc((void**)&(c_r), componentSize); //< R, aligned component size
|
||||
cudaCheckError("Alloc device memory");
|
||||
cudaMemset(c_r, 0, componentSize);
|
||||
cudaCheckError("Memset device memory");
|
||||
|
||||
bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);
|
||||
|
||||
// Compute DWT
|
||||
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
|
||||
|
||||
// Store DWT to file
|
||||
// #ifdef OUTPUT
|
||||
if (writeVisual) {
|
||||
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
|
||||
} else {
|
||||
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".lin.out");
|
||||
}
|
||||
// #endif
|
||||
cudaFree(c_r);
|
||||
cudaCheckError("Cuda free");
|
||||
}
|
||||
|
||||
cudaFree(c_r_out);
|
||||
cudaCheckError("Cuda free device");
|
||||
cudaFree(backup);
|
||||
cudaCheckError("Cuda free device");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int optindex = 0;
|
||||
char ch;
|
||||
struct option longopts[] = {
|
||||
{"dimension", required_argument, 0, 'd'}, //dimensions of src img
|
||||
{"components", required_argument, 0, 'c'}, //numger of components of src img
|
||||
{"depth", required_argument, 0, 'b'}, //bit depth of src img
|
||||
{"level", required_argument, 0, 'l'}, //level of dwt
|
||||
{"device", required_argument, 0, 'D'}, //cuda device
|
||||
{"forward", no_argument, 0, 'f'}, //forward transform
|
||||
{"reverse", no_argument, 0, 'r'}, //reverse transform
|
||||
{"97", no_argument, 0, '9'}, //9/7 transform
|
||||
{"53", no_argument, 0, '5' }, //5/3transform
|
||||
{"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
|
||||
{"help", no_argument, 0, 'h'}
|
||||
};
|
||||
|
||||
int pixWidth = 0; //<real pixWidth
|
||||
int pixHeight = 0; //<real pixHeight
|
||||
int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
|
||||
int bitDepth = 8;
|
||||
int dwtLvls = 3; //default numuber of DWT levels
|
||||
int device = 0;
|
||||
int forward = 1; //forward transform
|
||||
int dwt97 = 1; //1=dwt9/7, 0=dwt5/3 transform
|
||||
int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
|
||||
char * pos;
|
||||
|
||||
while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) {
|
||||
switch (ch) {
|
||||
case 'd':
|
||||
pixWidth = atoi(optarg);
|
||||
pos = strstr(optarg, "x");
|
||||
if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) {
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
pixHeight = atoi(pos+1);
|
||||
break;
|
||||
case 'c':
|
||||
compCount = atoi(optarg);
|
||||
break;
|
||||
case 'b':
|
||||
bitDepth = atoi(optarg);
|
||||
break;
|
||||
case 'l':
|
||||
dwtLvls = atoi(optarg);
|
||||
break;
|
||||
case 'D':
|
||||
device = atoi(optarg);
|
||||
break;
|
||||
case 'f':
|
||||
forward = 1;
|
||||
break;
|
||||
case 'r':
|
||||
forward = 0;
|
||||
break;
|
||||
case '9':
|
||||
dwt97 = 1;
|
||||
break;
|
||||
case '5':
|
||||
dwt97 = 0;
|
||||
break;
|
||||
case 'w':
|
||||
writeVisual = 1;
|
||||
break;
|
||||
case 'h':
|
||||
usage();
|
||||
return 0;
|
||||
case '?':
|
||||
return -1;
|
||||
default :
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
argc -= optind;
|
||||
argv += optind;
|
||||
|
||||
if (argc == 0) { // at least one filename is expected
|
||||
printf("Please supply src file name\n");
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pixWidth <= 0 || pixHeight <=0) {
|
||||
printf("Wrong or missing dimensions\n");
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (forward == 0) {
|
||||
writeVisual = 0; //do not write visual when RDWT
|
||||
}
|
||||
|
||||
// device init
|
||||
int devCount;
|
||||
cudaSetDevice(0);
|
||||
cudaGetDeviceCount(&devCount);
|
||||
cudaCheckError("Get device count");
|
||||
if (devCount == 0) {
|
||||
printf("No CUDA enabled device\n");
|
||||
return -1;
|
||||
}
|
||||
if (device < 0 || device > devCount -1) {
|
||||
printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
|
||||
device, 0, devCount -1);
|
||||
return -1;
|
||||
}
|
||||
cudaDeviceProp devProp;
|
||||
cudaGetDeviceProperties(&devProp, device);
|
||||
cudaCheckError("Get device properties");
|
||||
// if (devProp.major < 1) {
|
||||
// printf("Device %d does not support CUDA\n", device);
|
||||
// return -1;
|
||||
// }
|
||||
printf("Using device %d: %s\n", device, devProp.name);
|
||||
cudaSetDevice(device);
|
||||
cudaCheckError("Set selected device");
|
||||
|
||||
struct dwt *d;
|
||||
d = (struct dwt *)malloc(sizeof(struct dwt));
|
||||
d->srcImg = NULL;
|
||||
d->pixWidth = pixWidth;
|
||||
d->pixHeight = pixHeight;
|
||||
d->components = compCount;
|
||||
d->dwtLvls = dwtLvls;
|
||||
|
||||
// file names
|
||||
d->srcFilename = (char *)malloc(strlen(argv[0]));
|
||||
strcpy(d->srcFilename, argv[0]);
|
||||
if (argc == 1) { // only one filename supplyed
|
||||
d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
|
||||
strcpy(d->outFilename, d->srcFilename);
|
||||
strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
|
||||
} else {
|
||||
d->outFilename = strdup(argv[1]);
|
||||
}
|
||||
|
||||
//Input review
|
||||
printf("Source file:\t\t%s\n", d->srcFilename);
|
||||
printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
|
||||
printf(" Components count:\t%d\n", compCount);
|
||||
printf(" Bit depth:\t\t%d\n", bitDepth);
|
||||
printf(" DWT levels:\t\t%d\n", dwtLvls);
|
||||
printf(" Forward transform:\t%d\n", forward);
|
||||
printf(" 9/7 transform:\t\t%d\n", dwt97);
|
||||
|
||||
//data sizes
|
||||
int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
|
||||
|
||||
//load img source image
|
||||
cudaMallocHost((void **)&d->srcImg, inputSize);
|
||||
cudaCheckError("Alloc host memory");
|
||||
if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
|
||||
return -1;
|
||||
|
||||
/* DWT */
|
||||
if (forward == 1) {
|
||||
if(dwt97 == 1 )
|
||||
processDWT<float>(d, forward, writeVisual);
|
||||
else // 5/3
|
||||
processDWT<int>(d, forward, writeVisual);
|
||||
}
|
||||
else { // reverse
|
||||
if(dwt97 == 1 )
|
||||
processDWT<float>(d, forward, writeVisual);
|
||||
else // 5/3
|
||||
processDWT<int>(d, forward, writeVisual);
|
||||
}
|
||||
|
||||
//writeComponent(r_cuda, pixWidth, pixHeight, srcFilename, ".g");
|
||||
//writeComponent(g_wave_cuda, 512000, ".g");
|
||||
//writeComponent(g_cuda, componentSize, ".g");
|
||||
//writeComponent(b_wave_cuda, componentSize, ".b");
|
||||
cudaFreeHost(d->srcImg);
|
||||
cudaCheckError("Cuda free host");
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
./dwt2d 4.bmp z.dwt -d 4x4 -f -5 -l 3
|
||||
# ./dwt2d 8.bmp -d 8x8 -f -5 -l 3
|
||||
# ./dwt2d 16.bmp -d 16x16 -f -5 -l 3
|
||||
# ./dwt2d 64.bmp -d 64x64 -f -5 -l 3
|
||||
|
||||
# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
|
@ -0,0 +1,8 @@
|
|||
# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
||||
# ./dwt2d 16.bmp -d 16x16 -f -9 -l 3\
|
||||
./dwt2d 4.bmp -d 4x4 -r -5 -l 3
|
||||
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
|
||||
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 4.bmp -d 4x4 -f -9 -l 3
|
||||
./nvcc_dwt2d 4.bmp -d 4x4 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 8.bmp -d 8x8 -f -9 -l 3
|
||||
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 16.bmp -d 16x16 -r -5 -l 3
|
||||
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -9 -l 3
|
||||
# ./nvcc_dwt2d 4.bmp -d 4x4 -r -9 -l 3
|
||||
# ./nvcc_dwt2d 64.bmp -d 64x64 -f -5 -l 3
|
||||
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
||||
# ls
|
||||
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
|
@ -0,0 +1,51 @@
|
|||
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -I. -I/include -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
|
||||
|
||||
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
|
||||
|
||||
../../build/compilation/kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_50.bc common.bc
|
||||
../../build/compilation/kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_50.bc components.bc
|
||||
../../build/compilation/kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt53.bc
|
||||
|
||||
../../build/compilation/kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_50.bc dwt.bc
|
||||
|
||||
../../build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||
../../build/compilation/hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc
|
||||
../../build/compilation/hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc
|
||||
../../build/compilation/hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc
|
||||
../../build/compilation/hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc
|
||||
|
||||
../../build/compilation/hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc
|
||||
../../build/compilation/hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc
|
||||
../../build/compilation/hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc
|
||||
../../build/compilation/kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt97.bc
|
||||
../../build/compilation/kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt97.bc
|
||||
../../build/compilation/kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt53.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj common.bc
|
||||
llc --relocation-model=pic --filetype=obj components.bc
|
||||
llc --relocation-model=pic --filetype=obj fdwt53.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj dwt.bc
|
||||
|
||||
|
||||
llc --relocation-model=pic --filetype=obj host.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj common_host.bc
|
||||
llc --relocation-model=pic --filetype=obj components_host.bc
|
||||
llc --relocation-model=pic --filetype=obj fdwt53_host.bc
|
||||
|
||||
llc --relocation-model=pic --filetype=obj dwt_host.bc
|
||||
|
||||
|
||||
llc --relocation-model=pic --filetype=obj fdwt97_host.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt97_host.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt53_host.bc
|
||||
llc --relocation-model=pic --filetype=obj fdwt97.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt97.bc
|
||||
llc --relocation-model=pic --filetype=obj rdwt53.bc
|
||||
|
||||
g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o dwt2d -fPIC -no-pie common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lc -lx86Runtime -lthreadPool -lpthread
|
|
@ -0,0 +1,18 @@
|
|||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c main.cu -o main.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt.cu -o dwt.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c components.cu -o components.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt53.cu -o dwt_cuda/fdwt53.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt97.cu -o dwt_cuda/fdwt97.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/common.cu -o dwt_cuda/common.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
|
||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
|
||||
g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -39,6 +39,12 @@ cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
|||
return cudaErrorMemoryAllocation;
|
||||
return cudaSuccess;
|
||||
}
|
||||
cudaError_t cudaMallocHost(void **devPtr, size_t size) {
|
||||
*devPtr = malloc(size);
|
||||
if (devPtr == NULL)
|
||||
return cudaErrorMemoryAllocation;
|
||||
return cudaSuccess;
|
||||
}
|
||||
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
|
||||
memset(devPtr, value, count);
|
||||
return cudaSuccess;
|
||||
|
@ -58,7 +64,7 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
|||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyDeviceToDevice) {
|
||||
|
||||
memcpy(dst, dst, count);
|
||||
memcpy(dst, src, count);
|
||||
} else if (kind == cudaMemcpyDefault) {
|
||||
memcpy(dst, src, count);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue