/*
* Origin: CuPBoP/examples/dwt2d/dwt.cu
* (listing metadata: 386 lines, 13 KiB, plaintext, executable file)
*/

/*
* Copyright (c) 2009, Jiri Matela
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

#include <fstream>
#include <iostream>
#include <string>

#include "dwt_cuda/dwt.h"
#include "dwt_cuda/common.h"
#include "dwt.h"
#include "common.h"
/*
 * float overload of fdwt: prints a banner on stdout, then passes every
 * argument through unchanged to dwt_cuda::fdwt97.
 * nStage2dDWT selects this path when its `forward` flag is true.
 */
inline void fdwt(float *in, float *out, int width, int height, int levels)
{
    fputs(" Running fdwt97 Float \n", stdout);
    dwt_cuda::fdwt97(in, out, width, height, levels);
}
/*
inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut)
{
dwt_cuda::fdwt97(in, out, width, height, levels, diffOut);
}
*/
/*
 * int overload of fdwt: prints a banner on stdout, then passes every
 * argument through unchanged to dwt_cuda::fdwt53.
 * nStage2dDWT selects this path when its `forward` flag is true.
 */
inline void fdwt(int *in, int *out, int width, int height, int levels)
{
    fputs(" Running fdwt53 Int \n", stdout);
    dwt_cuda::fdwt53(in, out, width, height, levels);
}
/*
inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut)
{
dwt_cuda::fdwt53(in, out, width, height, levels, diffOut);
}
*/
/*
 * float overload of rdwt: prints a banner on stdout, then passes every
 * argument through unchanged to dwt_cuda::rdwt97.
 * nStage2dDWT selects this path when its `forward` flag is false.
 */
inline void rdwt(float *in, float *out, int width, int height, int levels)
{
    fputs(" Running rdwt97 Float \n", stdout);
    dwt_cuda::rdwt97(in, out, width, height, levels);
}
/*
 * int overload of rdwt: prints a banner on stdout, then passes every
 * argument through unchanged to dwt_cuda::rdwt53.
 * nStage2dDWT selects this path when its `forward` flag is false.
 */
inline void rdwt(int *in, int *out, int width, int height, int levels)
{
    fputs(" Running rdwt53 Int \n", stdout);
    dwt_cuda::rdwt53(in, out, width, height, levels);
}
/*
 * Runs a multi-stage 2D DWT through the fdwt/rdwt overloads for type T.
 *
 * in         buffer handed to fdwt/rdwt as input; it is first copied to `backup`
 * out        buffer handed to fdwt/rdwt as output
 * backup     receives a device-to-device copy of `in`
 *            (pixWidth * pixHeight elements of T)
 * pixWidth   image width in samples
 * pixHeight  image height in samples
 * stages     number of levels, passed straight to fdwt/rdwt
 * forward    true runs fdwt, false runs rdwt
 *
 * Always returns 0. The backup copy is followed by cudaCheckError; the
 * transform calls themselves are not checked here.
 */
template<typename T>
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
{
    /* Name the direction that actually runs; the banner used to say
       "forward" for the reverse transform as well. */
    printf("\n*** %d stages of 2D %s DWT:\n", stages, forward ? "forward" : "reverse");

    /* Keep a copy of the input, because the transform may overwrite it.
       Widen before multiplying so large images do not overflow int. */
    const size_t size = static_cast<size_t>(pixHeight) * static_cast<size_t>(pixWidth) * sizeof(T);
    cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
    cudaCheckError("Memcopy device to device");

    if (forward)
        fdwt(in, out, pixWidth, pixHeight, stages);
    else
        rdwt(in, out, pixWidth, pixHeight, stages);

    // Disabled benchmark loop (measures the overall DWT time), kept for reference.
    /* #ifdef GPU_DWT_TESTING_1
    dwt_cuda::CudaDWTTester tester;
    for(int i = tester.getNumIterations(); i--; ) {
    // Recover input and measure one overall DWT run.
    cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
    cudaCheckError("Memcopy device to device");
    tester.beginTestIteration();
    if(forward)
    fdwt(in, out, pixWidth, pixHeight, stages);
    else
    rdwt(in, out, pixWidth, pixHeight, stages);
    tester.endTestIteration();
    }
    tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
    #endif // GPU_DWT_TESTING
    cudaCheckAsyncError("DWT Kernel calls");
    */
    return 0;
}
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
/*
template<typename T>
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
{
printf("*** %d stages of 2D forward DWT:\n", stages);
// create backup of input, because each test iteration overwrites it
const int size = pixHeight * pixWidth * sizeof(T);
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device");
// Measure time of individual levels.
if(forward)
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
else
rdwt(in, out, pixWidth, pixHeight, stages);
// Measure overall time of DWT.
#ifdef GPU_DWT_TESTING_1
dwt_cuda::CudaDWTTester tester;
for(int i = tester.getNumIterations(); i--; ) {
// Recover input and measure one overall DWT run.
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device");
tester.beginTestIteration();
if(forward)
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
else
rdwt(in, out, pixWidth, pixHeight, stages);
tester.endTestIteration();
}
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
#endif // GPU_DWT_TESTING
cudaCheckAsyncError("DWT Kernel calls");
return 0;
}
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool, float*);
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool, int*);
*/
/*
 * Converts float samples to 8-bit values and dumps them as text.
 *
 * Each sample is mapped to (src[i] + 0.5f) * 255, clamped to [0, 255] and
 * truncated to unsigned char. The clamped value is also written, one line
 * per sample, to the file "<filename>.txt".
 *
 * dst         output array, at least samplesNum bytes
 * src         input samples, at least samplesNum floats
 * samplesNum  number of samples to convert
 * filename    base name of the text dump; ".txt" is appended
 */
void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename)
{
    /* Build the path with std::string: the previous fixed-size array had no
       room for the terminating NUL, so strcpy wrote one byte past its end. */
    const std::string outfile = std::string(filename) + ".txt";
    std::ofstream outputFile(outfile.c_str());

    for (int i = 0; i < samplesNum; i++) {
        float r = (src[i] + 0.5f) * 255;
        if (r > 255) r = 255;
        /* r != r is true only for NaN; converting NaN to an integer type is
           undefined, so treat it like an underflow. */
        if (r < 0 || r != r) r = 0;
        dst[i] = (unsigned char)r;
        outputFile << "index: " << i << " val: "<< r <<" \n";
    }
    outputFile.close();
}
/*
 * Converts int samples to 8-bit values and dumps them as text.
 *
 * Each sample is shifted by +128 and clamped to [0, 255]. The clamped value
 * is also written, one line per sample, to the file "<filename>.txt" so the
 * output can be checked.
 *
 * dst         output array, at least samplesNum bytes
 * src         input samples, at least samplesNum ints
 * samplesNum  number of samples to convert
 * filename    base name of the text dump; ".txt" is appended
 */
void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename)
{
    /* Build the path with std::string: the previous fixed-size array had no
       room for the terminating NUL, so strcpy wrote one byte past its end. */
    const std::string outfile = std::string(filename) + ".txt";
    std::ofstream outputFile(outfile.c_str());

    for (int i = 0; i < samplesNum; i++) {
        /* Clamp on the raw sample before adding the bias, so that values
           near INT_MAX cannot overflow in src[i] + 128. */
        int r;
        if (src[i] > 127)
            r = 255;
        else if (src[i] < -128)
            r = 0;
        else
            r = src[i] + 128;
        dst[i] = (unsigned char)r;
        outputFile << "index: " << i << " val: "<< r <<" \n";
    }
    outputFile.close();
}
/* Write output in linear order */
/*
 * Copies a pixWidth x pixHeight component from `component_cuda` to the host,
 * converts it to 8-bit samples with samplesToChar, and writes the bytes to
 * the file "<filename><suffix>" in the order they were copied.
 * samplesToChar additionally writes a text dump named "<filename>.txt".
 *
 * component_cuda  source buffer for cudaMemcpy(..., cudaMemcpyDeviceToHost)
 * pixWidth        component width in samples
 * pixHeight       component height in samples
 * filename        base output name
 * suffix          appended to filename to form the binary output path
 *
 * Returns 0 on success, -1 if the arguments are unusable, a host buffer
 * cannot be allocated or the output file cannot be opened, and 1 if the
 * data could not be written completely.
 */
template<typename T>
int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
                const char * filename, const char * suffix)
{
    if (pixWidth <= 0 || pixHeight <= 0) {
        error(0, 0, "invalid component size %d x %d", pixWidth, pixHeight);
        return -1;
    }

    const int samplesNum = pixWidth*pixHeight;
    /* Widen before multiplying by sizeof(T) so the byte count cannot overflow int. */
    const size_t size = static_cast<size_t>(samplesNum) * sizeof(T);

    unsigned char *result = (unsigned char *)malloc(samplesNum);
    if (result == NULL) {
        error(0, ENOMEM, "cannot allocate %d bytes", samplesNum);
        return -1;
    }

    T *gpu_output = NULL;
    cudaMallocHost((void **)&gpu_output, size);
    cudaCheckError("Malloc host");
    memset(gpu_output, 0, size);
    cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost);
    cudaCheckError("Memcopy device to host");

    /* T to char */
    samplesToChar(result, gpu_output, samplesNum, filename);

    /* Write component. The path is built with std::string because the former
       fixed-size array had no room for the terminating NUL. */
    const std::string outfile = std::string(filename) + suffix;
    int status = 0;
    /* O_TRUNC: without it, overwriting a larger existing file would leave
       stale bytes after the new data. */
    const int fd = open(outfile.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644);
    if (fd == -1) {
        error(0, errno, "cannot access %s", outfile.c_str());
        status = -1;
    } else {
        printf("\nWriting to %s (%d x %d)\n", outfile.c_str(), pixWidth, pixHeight);
        const ssize_t written = write(fd, result, samplesNum);
        if (written < 0)
            error(0, errno, "cannot write %s", outfile.c_str());
        close(fd);
        /* A failed (-1) or short write used to be reported as success. */
        if (written != (ssize_t)samplesNum)
            status = 1;
    }

    /* Clean up on every path, including the open() failure path that used
       to return early and leak both buffers. */
    cudaFreeHost(gpu_output);
    cudaCheckError("Cuda free host memory");
    free(result);
    return status;
}
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
/* Write output visual ordered */
/*
 * Copies a multi-stage DWT result from `component_cuda` to the host,
 * rearranges its bands into a pixWidth x pixHeight image, converts the image
 * to 8-bit samples with samplesToChar, and writes it to "<filename><suffix>".
 * samplesToChar additionally writes a text dump named "<filename>.txt".
 *
 * The source is read as consecutive band blocks: the LL band of the last
 * stage first, then, for each stage from last to first, its HL, LH and HH
 * bands (presumably the layout produced by the dwt_cuda transforms; confirm
 * against the producer). In the output image LL goes top-left, HL to its
 * right, LH below it and HH diagonally opposite.
 *
 * component_cuda  source buffer for cudaMemcpy(..., cudaMemcpyDeviceToHost)
 * pixWidth        image width in samples
 * pixHeight       image height in samples
 * stages          number of DWT stages contained in the data (must be >= 1)
 * filename        base output name
 * suffix          appended to filename to form the binary output path
 *
 * Returns 0 on success, -1 if the arguments are unusable, a host buffer
 * cannot be allocated or the output file cannot be opened, and 1 if the
 * data could not be written completely.
 */
template<typename T>
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
                     int stages, const char * filename, const char * suffix)
{
    struct band {
        int dimX;
        int dimY;
    };
    struct dimensions {
        struct band LL;
        struct band HL;
        struct band LH;
        struct band HH;
    };

    /* stages < 1 would index bandDims[-1] and write into a zero-size buffer. */
    if (stages < 1 || pixWidth <= 0 || pixHeight <= 0) {
        error(0, 0, "invalid DWT layout: %d x %d, %d stages", pixWidth, pixHeight, stages);
        return -1;
    }

    const int samplesNum = pixWidth*pixHeight;
    /* Widen before multiplying by sizeof(T) so the byte count cannot overflow int. */
    const size_t bytes = static_cast<size_t>(samplesNum) * sizeof(T);

    struct dimensions *bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
    T *dst = (T *)malloc(bytes);
    unsigned char *result = (unsigned char *)malloc(samplesNum);
    if (bandDims == NULL || dst == NULL || result == NULL) {
        error(0, ENOMEM, "cannot allocate host buffers");
        free(bandDims);
        free(dst);
        free(result);
        return -1;
    }

    /* Band sizes per stage: each stage splits the previous stage's LL band
       (stage 0 splits the full image). */
    int parentW = pixWidth;
    int parentH = pixHeight;
    for (int i = 0; i < stages; i++) {
        bandDims[i].LL.dimX = DIVANDRND(parentW,2);
        bandDims[i].LL.dimY = DIVANDRND(parentH,2);
        bandDims[i].HL.dimX = parentW - bandDims[i].LL.dimX;
        bandDims[i].HL.dimY = bandDims[i].LL.dimY;
        bandDims[i].LH.dimX = bandDims[i].LL.dimX;
        bandDims[i].LH.dimY = parentH - bandDims[i].LL.dimY;
        bandDims[i].HH.dimX = bandDims[i].HL.dimX;
        bandDims[i].HH.dimY = bandDims[i].LH.dimY;
        parentW = bandDims[i].LL.dimX;
        parentH = bandDims[i].LL.dimY;
    }
#if 0
    printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
    for (int i = 0; i < stages; i++) {
        printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
        printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
        printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
        printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
    }
#endif

    T *src = NULL;
    cudaMallocHost((void **)&src, bytes);
    cudaCheckError("Malloc host");
    memset(src, 0, bytes);
    memset(dst, 0, bytes);
    cudaMemcpy(src, component_cuda, bytes, cudaMemcpyDeviceToHost);
    cudaCheckError("Memcopy device to host");

    // LL band of the last stage goes to the top-left corner
    const struct band lastLL = bandDims[stages-1].LL;
    for (int row = 0; row < lastLL.dimY; row++) {
        memcpy(dst + row*pixWidth, src + row*lastLL.dimX, lastLL.dimX * sizeof(T));
    }

    for (int s = stages - 1; s >= 0; s--) {
        const struct dimensions d = bandDims[s];

        // HL band: right of LL
        int offset = d.LL.dimX * d.LL.dimY;
        for (int row = 0; row < d.HL.dimY; row++) {
            memcpy(dst + row*pixWidth + d.LL.dimX,
                   src + offset + row*d.HL.dimX,
                   d.HL.dimX * sizeof(T));
        }

        // LH band: below LL
        /* Bounded by LH.dimY. The loop used to run to HL.dimY, which differs
           from LH.dimY whenever the parent height is odd; it then copied the
           wrong number of rows and could step outside dst. */
        offset += d.HL.dimX * d.HL.dimY;
        for (int row = 0; row < d.LH.dimY; row++) {
            memcpy(dst + (d.LL.dimY + row)*pixWidth,
                   src + offset + row*d.LH.dimX,
                   d.LH.dimX * sizeof(T));
        }

        // HH band: below HL, right of LH
        offset += d.LH.dimX * d.LH.dimY;
        for (int row = 0; row < d.HH.dimY; row++) {
            memcpy(dst + (d.HL.dimY + row)*pixWidth + d.LH.dimX,
                   src + offset + row*d.HH.dimX,
                   d.HH.dimX * sizeof(T));
        }
    }

    /* Write component. The path is built with std::string because the former
       fixed-size array had no room for the terminating NUL. */
    samplesToChar(result, dst, samplesNum, filename);
    const std::string outfile = std::string(filename) + suffix;
    int status = 0;
    /* O_TRUNC: without it, overwriting a larger existing file would leave
       stale bytes after the new data. */
    const int fd = open(outfile.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644);
    if (fd == -1) {
        error(0, errno, "cannot access %s", outfile.c_str());
        status = -1;
    } else {
        printf("\nWriting to %s (%d x %d)\n", outfile.c_str(), pixWidth, pixHeight);
        const ssize_t written = write(fd, result, samplesNum);
        if (written < 0)
            error(0, errno, "cannot write %s", outfile.c_str());
        close(fd);
        /* A failed (-1) or short write used to be reported as success. */
        if (written != (ssize_t)samplesNum)
            status = 1;
    }

    /* Clean up on every path, including the open() failure path that used
       to return early and leak all four buffers. */
    cudaFreeHost(src);
    cudaCheckError("Cuda free host memory");
    free(dst);
    free(result);
    free(bandDims);
    return status;
}
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);