/* * Copyright (c) 2009, Jiri Matela * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include "dwt_cuda/dwt.h" #include "dwt_cuda/common.h" #include "dwt.h" #include "common.h" #include #include inline void fdwt(float *in, float *out, int width, int height, int levels) { printf(" Running fdwt97 Float \n"); dwt_cuda::fdwt97(in, out, width, height, levels); } /* inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut) { dwt_cuda::fdwt97(in, out, width, height, levels, diffOut); } */ inline void fdwt(int *in, int *out, int width, int height, int levels) { printf(" Running fdwt53 Int \n"); dwt_cuda::fdwt53(in, out, width, height, levels); } /* inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut) { dwt_cuda::fdwt53(in, out, width, height, levels, diffOut); } */ inline void rdwt(float *in, float *out, int width, int height, int levels) { printf(" Running rdwt97 Float \n"); dwt_cuda::rdwt97(in, out, width, height, levels); } inline void rdwt(int *in, int *out, int width, int height, int levels) { printf(" Running rdwt53 Int \n"); dwt_cuda::rdwt53(in, out, width, height, levels); } template int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward) { printf("\n*** %d stages of 2D forward DWT:\n", stages); /* create backup of input, because each test iteration overwrites it */ const int size = pixHeight * pixWidth * sizeof(T); cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); /* Measure time of individual levels. */ if(forward) fdwt(in, out, pixWidth, pixHeight, stages); else rdwt(in, out, pixWidth, pixHeight, stages); // Measure overall time of DWT. /* #ifdef GPU_DWT_TESTING_1 dwt_cuda::CudaDWTTester tester; for(int i = tester.getNumIterations(); i--; ) { // Recover input and measure one overall DWT run. cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); tester.beginTestIteration(); if(forward) fdwt(in, out, pixWidth, pixHeight, stages); else rdwt(in, out, pixWidth, pixHeight, stages); tester.endTestIteration(); } tester.showPerformance(" Overall DWT", pixWidth, pixHeight); #endif // GPU_DWT_TESTING cudaCheckAsyncError("DWT Kernel calls"); */ return 0; } template int nStage2dDWT(float*, float*, float*, int, int, int, bool); template int nStage2dDWT(int*, int*, int*, int, int, int, bool); /* template int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut) { printf("*** %d stages of 2D forward DWT:\n", stages); // create backup of input, because each test iteration overwrites it const int size = pixHeight * pixWidth * sizeof(T); cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); // Measure time of individual levels. if(forward) fdwt(in, out, pixWidth, pixHeight, stages, diffOut); else rdwt(in, out, pixWidth, pixHeight, stages); // Measure overall time of DWT. #ifdef GPU_DWT_TESTING_1 dwt_cuda::CudaDWTTester tester; for(int i = tester.getNumIterations(); i--; ) { // Recover input and measure one overall DWT run. cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice); cudaCheckError("Memcopy device to device"); tester.beginTestIteration(); if(forward) fdwt(in, out, pixWidth, pixHeight, stages, diffOut); else rdwt(in, out, pixWidth, pixHeight, stages); tester.endTestIteration(); } tester.showPerformance(" Overall DWT", pixWidth, pixHeight); #endif // GPU_DWT_TESTING cudaCheckAsyncError("DWT Kernel calls"); return 0; } template int nStage2dDWT(float*, float*, float*, int, int, int, bool, float*); template int nStage2dDWT(int*, int*, int*, int, int, int, bool, int*); */ void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename) { int i; std::ofstream outputFile; char outfile[strlen(filename)+strlen(".txt")]; strcpy(outfile, filename); strcpy(outfile+strlen(filename), ".txt"); outputFile.open(outfile); for(i = 0; i < samplesNum; i++) { float r = (src[i]+0.5f) * 255; if (r > 255) r = 255; if (r < 0) r = 0; dst[i] = (unsigned char)r; outputFile << "index: " << i << " val: "<< r <<" \n"; } outputFile.close(); } void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename) { int i; std::ofstream outputFile; char outfile[strlen(filename)+strlen(".txt")]; strcpy(outfile, filename); strcpy(outfile+strlen(filename), ".txt"); outputFile.open(outfile); for(i = 0; i < samplesNum; i++) { int r = src[i]+128; if (r > 255) r = 255; if (r < 0) r = 0; dst[i] = (unsigned char)r; // added this line to output check outputFile << "index: " << i << " val: "<< r <<" \n"; } outputFile.close(); } ///* Write output linear orderd*/ template int writeLinear(T *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix) { unsigned char * result; T *gpu_output; int i; int size; int samplesNum = pixWidth*pixHeight; size = samplesNum*sizeof(T); cudaMallocHost((void **)&gpu_output, size); cudaCheckError("Malloc host"); memset(gpu_output, 0, size); result = (unsigned char *)malloc(samplesNum); cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost); cudaCheckError("Memcopy device to host"); /* T to char */ samplesToChar(result, gpu_output, samplesNum, filename); /* Write component */ char outfile[strlen(filename)+strlen(suffix)]; strcpy(outfile, filename); strcpy(outfile+strlen(filename), suffix); i = open(outfile, O_CREAT|O_WRONLY, 0644); if (i == -1) { error(0,errno,"cannot access %s", outfile); return -1; } printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight); ssize_t x ; x = write(i, result, samplesNum); close(i); /* Clean up */ cudaFreeHost(gpu_output); cudaCheckError("Cuda free host memory"); free(result); if(x == 0) return 1; return 0; } template int writeLinear(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); template int writeLinear(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); /* Write output visual ordered */ template int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix) { struct band { int dimX; int dimY; }; struct dimensions { struct band LL; struct band HL; struct band LH; struct band HH; }; unsigned char * result; T *src, *dst; int i,s; int size; int offset; int yOffset; int samplesNum = pixWidth*pixHeight; struct dimensions * bandDims; bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions)); bandDims[0].LL.dimX = DIVANDRND(pixWidth,2); bandDims[0].LL.dimY = DIVANDRND(pixHeight,2); bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX; bandDims[0].HL.dimY = bandDims[0].LL.dimY; bandDims[0].LH.dimX = bandDims[0].LL.dimX; bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY; bandDims[0].HH.dimX = bandDims[0].HL.dimX; bandDims[0].HH.dimY = bandDims[0].LH.dimY; for (i = 1; i < stages; i++) { bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2); bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2); bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX; bandDims[i].HL.dimY = bandDims[i].LL.dimY; bandDims[i].LH.dimX = bandDims[i].LL.dimX; bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY; bandDims[i].HH.dimX = bandDims[i].HL.dimX; bandDims[i].HH.dimY = bandDims[i].LH.dimY; } #if 0 printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight); for (i = 0; i < stages; i++) { printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY); printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY); printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY); printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY); } #endif size = samplesNum*sizeof(T); cudaMallocHost((void **)&src, size); cudaCheckError("Malloc host"); dst = (T*)malloc(size); memset(src, 0, size); memset(dst, 0, size); result = (unsigned char *)malloc(samplesNum); cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost); cudaCheckError("Memcopy device to host"); // LL Band size = bandDims[stages-1].LL.dimX * sizeof(T); for (i = 0; i < bandDims[stages-1].LL.dimY; i++) { memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size); } for (s = stages - 1; s >= 0; s--) { // HL Band size = bandDims[s].HL.dimX * sizeof(T); offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY; for (i = 0; i < bandDims[s].HL.dimY; i++) { memcpy(dst+i*pixWidth+bandDims[s].LL.dimX, src+offset+i*bandDims[s].HL.dimX, size); } // LH band size = bandDims[s].LH.dimX * sizeof(T); offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY; yOffset = bandDims[s].LL.dimY; for (i = 0; i < bandDims[s].HL.dimY; i++) { memcpy(dst+(yOffset+i)*pixWidth, src+offset+i*bandDims[s].LH.dimX, size); } //HH band size = bandDims[s].HH.dimX * sizeof(T); offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY; yOffset = bandDims[s].HL.dimY; for (i = 0; i < bandDims[s].HH.dimY; i++) { memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX, src+offset+i*bandDims[s].HH.dimX, size); } } /* Write component */ samplesToChar(result, dst, samplesNum, filename); char outfile[strlen(filename)+strlen(suffix)]; strcpy(outfile, filename); strcpy(outfile+strlen(filename), suffix); i = open(outfile, O_CREAT|O_WRONLY, 0644); if (i == -1) { error(0,errno,"cannot access %s", outfile); return -1; } printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight); ssize_t x; x = write(i, result, samplesNum); close(i); cudaFreeHost(src); cudaCheckError("Cuda free host memory"); free(dst); free(result); free(bandDims); if (x == 0) return 1; return 0; } template int writeNStage2DDWT(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); template int writeNStage2DDWT(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);