/// @file    fdwt53.cu
/// @brief   CUDA implementation of forward 5/3 2D DWT.
/// @author  Martin Jirman (207962@mail.muni.cz)
/// @date    2011-02-04 13:23
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
///     * Redistributions of source code must retain the above copyright
///       notice, this list of conditions and the following disclaimer.
///     * Redistributions in binary form must reproduce the above copyright
///       notice, this list of conditions and the following disclaimer in the
///       documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///

#include <cstdio>

#include "common.h"
#include "transform_buffer.h"
#include "io.h"

namespace dwt_cuda {

/// Wraps buffer and methods needed for computing one level of 5/3 FDWT
/// using sliding window approach.
///
/// An instance is placed in shared memory by run(); each thread of the
/// block handles one column of the window (threads 0..2 additionally handle
/// one boundary column each).
///
/// @tparam WIN_SIZE_X  width of sliding window
/// @tparam WIN_SIZE_Y  height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class FDWT53 {
private:

  /// Info needed for processing of one input column.
  /// @tparam CHECKED_LOADER  true if column's loader should check boundaries,
  ///                         false if there are no near boundaries to check
  template <bool CHECKED_LOADER>
  struct FDWT53Column {
    /// loader for the column
    VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;

    /// offset of the column in shared buffer
    int offset;

    /// backup of first 3 loaded pixels (not transformed)
    int pixel0, pixel1, pixel2;

    /// Sets all fields to anything to prevent 'uninitialized' warnings.
    __device__ void clear() {
      offset = pixel0 = pixel1 = pixel2 = 0;
      loader.clear();
    }
  };


  /// Type of shared memory buffer for 5/3 FDWT transforms.
  /// It holds WIN_SIZE_Y + 3 rows: 3 rows carried over from the previous
  /// sliding window step plus WIN_SIZE_Y newly loaded rows.
  /// NOTE(review): the last template argument (2) is presumably the number
  /// of boundary columns - confirm against transform_buffer.h.
  typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> FDWT53Buffer;

  /// Actual shared buffer used for forward 5/3 DWT.
  FDWT53Buffer buffer;

  /// Difference between indices of two vertical neighbors in buffer.
  enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE };


  /// Forward 5/3 DWT predict operation.
  /// @param p  previous (neighboring) sample
  /// @param c  current sample, updated in place
  /// @param n  next (neighboring) sample
  struct Forward53Predict {
    __device__ void operator() (const int p, int & c, const int n) const {
      // F.8, page 126, ITU-T Rec. T.800 final draft
      // NOTE(review): integer '/' truncates toward zero, whereas the cited
      // equation uses floor; the two differ for negative odd (p + n).
      // Left unchanged because the inverse transform must use the very same
      // rounding to stay lossless - confirm against the reverse 5/3 code.
      c -= (p + n) / 2;
    }
  };


  /// Forward 5/3 DWT update operation.
  /// @param p  previous (neighboring) sample
  /// @param c  current sample, updated in place
  /// @param n  next (neighboring) sample
  struct Forward53Update {
    __device__ void operator() (const int p, int & c, const int n) const {
      // F.9, page 126, ITU-T Rec. T.800 final draft
      // NOTE(review): same truncation-vs-floor remark as in the predict
      // step applies here for negative (p + n + 2).
      c += (p + n + 2) / 4;
    }
  };


  /// Initializes one column: computes offset of the column in shared memory
  /// buffer, initializes loader and finally uses it to load first 3 pixels.
  /// @tparam CHECKED  true if loader of the column checks boundaries
  /// @param column    (uninitialized) column info to be initialized
  /// @param input     input image
  /// @param sizeX     width of the input image
  /// @param sizeY     height of the input image
  /// @param colIndex  x-axis coordinate of the column (relative to the left
  ///                  side of this threadblock's block of input pixels)
  /// @param firstY    y-axis coordinate of first image row to be transformed
  template <bool CHECKED>
  __device__ void initColumn(FDWT53Column<CHECKED> & column,
                             const int * const input,
                             const int sizeX, const int sizeY,
                             const int colIndex, const int firstY) {
    // get offset of the column with index 'colIndex'
    column.offset = buffer.getColumnOffset(colIndex);

    // x-coordinate of the first pixel to be loaded
    const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;

    if(blockIdx.y == 0) {
      // topmost block - apply mirroring rules when loading first 3 rows
      column.loader.init(sizeX, sizeY, firstX, firstY);

      // load pixels in mirrored way: the three rows are stored in reverse
      // order, so pixel0/pixel1 act as mirrored copies of rows #2/#1
      column.pixel2 = column.loader.loadFrom(input);  // loaded pixel #0
      column.pixel1 = column.loader.loadFrom(input);  // loaded pixel #1
      column.pixel0 = column.loader.loadFrom(input);  // loaded pixel #2

      // reinitialize loader to start with pixel #1 again
      column.loader.init(sizeX, sizeY, firstX, firstY + 1);
    } else {
      // non-topmost row - regular loading, starting 2 rows above firstY
      column.loader.init(sizeX, sizeY, firstX, firstY - 2);

      // load 3 rows into the column
      column.pixel0 = column.loader.loadFrom(input);
      column.pixel1 = column.loader.loadFrom(input);
      column.pixel2 = column.loader.loadFrom(input);
      // Now, the next pixel, which will be loaded by loader, is pixel #1.
    }
  }


  /// Loads and vertically transforms given column. Assumes that first 3
  /// pixels are already loaded in column fields pixel0 ... pixel2.
  /// On return, pixel0 ... pixel2 hold the (untransformed) last 3 rows of
  /// the window so that the next sliding window step can reuse them.
  /// @tparam CHECKED  true if loader of the column checks boundaries
  /// @param column    column to be loaded and vertically transformed
  /// @param input     pointer to input image data
  template <bool CHECKED>
  __device__ void loadAndVerticallyTransform(FDWT53Column<CHECKED> & column,
                                             const int * const input) {
    // take 3 loaded pixels and put them into shared memory transform buffer
    buffer[column.offset + 0 * STRIDE] = column.pixel0;
    buffer[column.offset + 1 * STRIDE] = column.pixel1;
    buffer[column.offset + 2 * STRIDE] = column.pixel2;

    // load remaining pixels to be able to vertically transform the window
    for(int i = 3; i < (3 + WIN_SIZE_Y); i++)
    {
      buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
    }

    // remember last 3 pixels for use in next iteration
    // (must happen before the in-place transform below overwrites them)
    column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE];
    column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE];
    column.pixel2 = buffer[column.offset + (WIN_SIZE_Y + 2) * STRIDE];

    // vertically transform the column in transform buffer
    buffer.forEachVerticalOdd(column.offset, Forward53Predict());
    buffer.forEachVerticalEven(column.offset, Forward53Update());
  }


  /// Actual implementation of 5/3 FDWT.
  /// Must be executed by all threads of the block: it contains block-wide
  /// barriers (__syncthreads) both before and inside the sliding window loop.
  /// @tparam CHECK_LOADS   true if input loader must check boundaries
  /// @tparam CHECK_WRITES  true if output writer must check boundaries
  /// @param in        input image
  /// @param out       output buffer
  /// @param sizeX     width of the input image
  /// @param sizeY     height of the input image
  /// @param winSteps  number of sliding window steps
  template <bool CHECK_LOADS, bool CHECK_WRITES>
  __device__ void transform(const int * const in, int * const out,
                            const int sizeX, const int sizeY,
                            const int winSteps) {
    // info about one main and one boundary columns processed by this thread
    FDWT53Column<CHECK_LOADS> column;
    FDWT53Column<CHECK_LOADS> boundaryColumn;  // only few threads use this

    // Initialize all column info: initialize loaders, compute offset of
    // column in shared buffer and initialize loader of column.
    const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
    initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY);

    // first 3 threads initialize boundary columns, others do not use them
    boundaryColumn.clear();
    if(threadIdx.x < 3) {
      // index of boundary column (relative x-axis coordinate of the column):
      // thread 0 -> WIN_SIZE_X (right of the window),
      // threads 1 and 2 -> -2 and -1 (left of the window)
      const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3);

      // initialize the column
      initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
    }

    // index of column which will be written into output by this thread
    // NOTE(review): parityIdx is defined outside this file; presumably it
    // reorders columns by parity - confirm in common.h.
    const int outColumnIndex = parityIdx<WIN_SIZE_X>();

    // offset of column which will be written by this thread into output
    const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);

    // initialize output writer for this thread
    const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
    VerticalDWTBandWriter<int, CHECK_WRITES> writer;
    writer.init(sizeX, sizeY, outputFirstX, firstY);
    __syncthreads();

    // Sliding window iterations:
    // Each iteration assumes that first 3 pixels of each column are loaded.
    for(int w = 0; w < winSteps; w++) {

      // For each column (including boundary columns): load and vertically
      // transform another WIN_SIZE_Y lines.
      loadAndVerticallyTransform(column, in);
      if(threadIdx.x < 3) {
        loadAndVerticallyTransform(boundaryColumn, in);
      }

      // wait for all columns to be vertically transformed and transform all
      // output rows horizontally
      __syncthreads();

      // predict step over the output rows (the same rows 2 .. WIN_SIZE_Y + 1
      // are written out below)
      buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
      __syncthreads();

      // update step needs the predict results of neighboring columns,
      // hence the barrier above
      buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());

      // wait for all output rows to be transformed horizontally and write
      // them into output buffer
      __syncthreads();

      for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
        // Write low coefficients from output column into low band ...
        writer.writeLowInto(out, buffer[outColumnOffset + r * STRIDE]);
        // ... and high coefficients into the high band.
        writer.writeHighInto(out, buffer[outColumnOffset + (r+1) * STRIDE]);
      }

      // before proceeding to next iteration, wait for all output columns
      // to be written into the output
      __syncthreads();
    }
  }


public:
  /// Determines, whether this block's pixels touch boundary and selects
  /// right version of algorithm according to it - for many threadblocks, it
  /// selects version which does not deal with boundary mirroring and thus is
  /// slightly faster.
  /// @param in     input image
  /// @param out    output buffer
  /// @param sx     width of the input image
  /// @param sy     height of the input image
  /// @param steps  number of sliding window steps
  __device__ static void run(const int * const in, int * const out,
                             const int sx, const int sy, const int steps) {
    // object with transform buffer in shared memory
    __shared__ FDWT53<WIN_SIZE_X, WIN_SIZE_Y> fdwt53;

    // Compute limits of this threadblock's block of pixels and use them to
    // determine, whether this threadblock will have to deal with boundary.
    // (The +1 in next expressions is a margin past the block's own pixels;
    // presumably it covers the neighbor needed by the 5/3 filter - the
    // original comment mentioned the 9/7 FDWT here.)
    const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
    const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
    const bool atRightBoundary = maxX >= sx;
    const bool atBottomBoundary = maxY >= sy;

    // Select specialized version of code according to distance of this
    // threadblock's pixels from image boundary. The condition depends only
    // on blockIdx, so all threads of a block take the same branch.
    if(atBottomBoundary)
    {
      // near bottom boundary => check both writing and reading
      fdwt53.transform<true, true>(in, out, sx, sy, steps);
    } else if(atRightBoundary)
    {
      // near right boundary only => check writing only
      fdwt53.transform<false, true>(in, out, sx, sy, steps);
    } else
    {
      // no nearby boundary => check nothing
      fdwt53.transform<false, false>(in, out, sx, sy, steps);
    }
  }

}; // end of class FDWT53


/// Main GPU 5/3 FDWT entry point. Simply hands all arguments over to
/// FDWT53<WIN_SX, WIN_SY>::run.
///
/// Launch bounds: at most WIN_SX threads per block; the requested minimum
/// number of resident blocks is derived from SHM_SIZE and the size of the
/// FDWT53 object, limited to 8.
/// NOTE(review): CTMIN and SHM_SIZE are defined outside this file
/// (presumably a compile-time minimum and the shared memory size) - confirm.
///
/// @tparam WIN_SX    width of sliding window to be used
/// @tparam WIN_SY    height of sliding window to be used
/// @param input      input image
/// @param output     output buffer
/// @param sizeX      width of the input image
/// @param sizeY      height of the input image
/// @param winSteps   number of sliding window steps
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53<WIN_SX, WIN_SY>), 8))
__global__ void fdwt53Kernel(const int * const input, int * const output,
                             const int sizeX, const int sizeY,
                             const int winSteps) {
  // transform type matching this kernel's window dimensions
  typedef FDWT53<WIN_SX, WIN_SY> Transform;

  Transform::run(input, output, sizeX, sizeY, winSteps);
}


/// Only computes optimal number of sliding window steps,
/// number of threadblocks and then launches the 5/3 FDWT kernel.
///
/// The kernel is launched with a 1D block of WIN_SX threads on a
/// gx x gy grid. The launch is asynchronous: this function does not wait
/// for the kernel to complete, it only reports launch errors to stderr.
/// Does nothing if either image dimension is not positive.
///
/// @tparam WIN_SX  width of sliding window
/// @tparam WIN_SY  height of sliding window
/// @param in       input image
/// @param out      output buffer
/// @param sx       width of the input image
/// @param sy       height of the input image
template <int WIN_SX, int WIN_SY>
void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
  // Nothing to transform for an empty image. This also guards the second
  // divRndUp call below: for sy == 0 the step count would presumably come
  // out as 0 and WIN_SY * steps would be a zero divisor.
  if(sx <= 0 || sy <= 0) {
    return;
  }

  // compute optimal number of steps of each sliding window
  const int steps = divRndUp(sy, 15 * WIN_SY);

  // grid dimensions: number of windows per row / per column of the image
  const int gx = divRndUp(sx, WIN_SX);
  const int gy = divRndUp(sy, WIN_SY * steps);

  printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);

  // prepare grid size
  const dim3 gSize(gx, gy);

  // run kernel and check that the launch itself was accepted
  fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);

  const cudaError_t launchStatus = cudaGetLastError();
  if(launchStatus != cudaSuccess) {
    std::fprintf(stderr, "FDWT 5/3 kernel launch failed: %s\n",
                 cudaGetErrorString(launchStatus));
  }

  // NOTE: the kernel may still be running at this point; the message only
  // marks that the launch call has returned.
  printf("fdwt53Kernel in launchFDWT53Kernel has finished");
}


/// Forward 5/3 2D DWT, computed level by level.
///
/// Each level picks a sliding window width according to the current image
/// width, runs one transform pass and - unless it was the last level -
/// copies the LL band of the output back into the input buffer, which then
/// serves as the (half-sized, rounded up) image of the next level.
/// At least one level is always computed, even if levels is less than 1.
///
/// @param in      Expected to be normalized into range [-128, 127].
///                Will not be preserved (will be overwritten).
/// @param out     output buffer on GPU
/// @param sizeX   width of input image (in pixels)
/// @param sizeY   height of input image (in pixels)
/// @param levels  number of recursive DWT levels
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
  for(;;) {
    // select right width of kernel for the size of the image
    if(sizeX < 480) {
      launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
    } else if(sizeX < 960) {
      launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
    } else {
      launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
    }

    // stop once the last requested level has been transformed
    if(levels <= 1) {
      return;
    }

    // size of the LL band = image size of the next level
    const int nextSizeX = divRndUp(sizeX, 2);
    const int nextSizeY = divRndUp(sizeY, 2);

    // copy output's LL band back into input buffer
    // (memCopy is declared outside this file, in common.h per the original
    // note)
    memCopy(in, out, nextSizeX, nextSizeY);

    // continue with the remaining levels on the LL band
    sizeX = nextSizeX;
    sizeY = nextSizeY;
    --levels;
  }
}


} // end of namespace dwt_cuda