361 lines
15 KiB
Plaintext
Executable File
361 lines
15 KiB
Plaintext
Executable File
///
|
|
/// @file rdwt53.cu
|
|
/// @brief CUDA implementation of reverse 5/3 2D DWT.
|
|
/// @author Martin Jirman (207962@mail.muni.cz)
|
|
/// @date 2011-02-04 14:19
|
|
///
|
|
///
|
|
/// Copyright (c) 2011 Martin Jirman
|
|
/// All rights reserved.
|
|
///
|
|
/// Redistribution and use in source and binary forms, with or without
|
|
/// modification, are permitted provided that the following conditions are met:
|
|
///
|
|
/// * Redistributions of source code must retain the above copyright
|
|
/// notice, this list of conditions and the following disclaimer.
|
|
/// * Redistributions in binary form must reproduce the above copyright
|
|
/// notice, this list of conditions and the following disclaimer in the
|
|
/// documentation and/or other materials provided with the distribution.
|
|
///
|
|
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
/// POSSIBILITY OF SUCH DAMAGE.
|
|
///
|
|
|
|
|
|
#include "common.h"
|
|
#include "transform_buffer.h"
|
|
#include "io.h"
|
|
|
|
|
|
namespace dwt_cuda {
|
|
|
|
|
|
|
|
/// Wraps shared momory buffer and algorithms needed for computing 5/3 RDWT
|
|
/// using sliding window and lifting schema.
|
|
/// @tparam WIN_SIZE_X width of sliding window
|
|
/// @tparam WIN_SIZE_Y height of sliding window
|
|
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
|
class RDWT53 {
|
|
private:
|
|
|
|
/// Shared memory buffer used for 5/3 DWT transforms.
|
|
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
|
|
|
|
/// Shared buffer used for reverse 5/3 DWT.
|
|
RDWT53Buffer buffer;
|
|
|
|
/// Difference between indices of two vertically neighboring items in buffer.
|
|
enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE };
|
|
|
|
|
|
/// Info needed for loading of one input column from input image.
|
|
/// @tparam CHECKED true if loader should check boundaries
|
|
template <bool CHECKED>
|
|
struct RDWT53Column {
|
|
/// loader of pixels from column in input image
|
|
VerticalDWTBandLoader<int, CHECKED> loader;
|
|
|
|
/// Offset of corresponding column in shared buffer.
|
|
int offset;
|
|
|
|
/// Sets all fields to some values to avoid 'uninitialized' warnings.
|
|
__device__ void clear() {
|
|
offset = 0;
|
|
loader.clear();
|
|
}
|
|
};
|
|
|
|
|
|
/// 5/3 DWT reverse update operation.
|
|
struct Reverse53Update {
|
|
__device__ void operator() (const int p, int & c, const int n) const {
|
|
c -= (p + n + 2) / 4; // F.3, page 118, ITU-T Rec. T.800 final draft
|
|
}
|
|
};
|
|
|
|
|
|
/// 5/3 DWT reverse predict operation.
|
|
struct Reverse53Predict {
|
|
__device__ void operator() (const int p, int & c, const int n) const {
|
|
c += (p + n) / 2; // F.4, page 118, ITU-T Rec. T.800 final draft
|
|
}
|
|
};
|
|
|
|
|
|
/// Horizontal 5/3 RDWT on specified lines of transform buffer.
|
|
/// @param lines number of lines to be transformed
|
|
/// @param firstLine index of the first line to be transformed
|
|
__device__ void horizontalTransform(const int lines, const int firstLine) {
|
|
__syncthreads();
|
|
buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update());
|
|
__syncthreads();
|
|
buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict());
|
|
__syncthreads();
|
|
}
|
|
|
|
|
|
/// Using given loader, it loads another WIN_SIZE_Y coefficients
|
|
/// into specified column.
|
|
/// @tparam CHECKED true if loader should check image boundaries
|
|
/// @param input input coefficients to load from
|
|
/// @param col info about loaded column
|
|
template <bool CHECKED>
|
|
inline __device__ void loadWindowIntoColumn(const int * const input,
|
|
RDWT53Column<CHECKED> & col) {
|
|
for(int i = 3; i < (3 + WIN_SIZE_Y); i += 2) {
|
|
buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
|
|
buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
|
|
}
|
|
}
|
|
|
|
|
|
/// Initializes one column of shared transform buffer with 7 input pixels.
|
|
/// Those 7 pixels will not be transformed. Also initializes given loader.
|
|
/// @tparam CHECKED true if loader should check image boundaries
|
|
/// @param columnX x coordinate of column in shared transform buffer
|
|
/// @param input input image
|
|
/// @param sizeX width of the input image
|
|
/// @param sizeY height of the input image
|
|
/// @param loader (uninitialized) info about loaded column
|
|
template <bool CHECKED>
|
|
__device__ void initColumn(const int columnX, const int * const input,
|
|
const int sizeX, const int sizeY,
|
|
RDWT53Column<CHECKED> & column,
|
|
const int firstY) {
|
|
// coordinates of the first coefficient to be loaded
|
|
const int firstX = blockIdx.x * WIN_SIZE_X + columnX;
|
|
|
|
// offset of the column with index 'colIndex' in the transform buffer
|
|
column.offset = buffer.getColumnOffset(columnX);
|
|
|
|
if(blockIdx.y == 0) {
|
|
// topmost block - apply mirroring rules when loading first 3 rows
|
|
column.loader.init(sizeX, sizeY, firstX, firstY);
|
|
|
|
// load pixels in mirrored way
|
|
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
|
buffer[column.offset + 0 * STRIDE] =
|
|
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
|
} else {
|
|
// non-topmost row - regular loading:
|
|
column.loader.init(sizeX, sizeY, firstX, firstY - 1);
|
|
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
|
|
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
|
|
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
|
|
}
|
|
// Now, the next coefficient, which will be loaded by loader, is #2.
|
|
}
|
|
|
|
|
|
/// Actual GPU 5/3 RDWT implementation.
|
|
/// @tparam CHECKED_LOADS true if boundaries must be checked when reading
|
|
/// @tparam CHECKED_WRITES true if boundaries must be checked when writing
|
|
/// @param in input image (5/3 transformed coefficients)
|
|
/// @param out output buffer (for reverse transformed image)
|
|
/// @param sizeX width of the output image
|
|
/// @param sizeY height of the output image
|
|
/// @param winSteps number of sliding window steps
|
|
template<bool CHECKED_LOADS, bool CHECKED_WRITES>
|
|
__device__ void transform(const int * const in, int * const out,
|
|
const int sizeX, const int sizeY,
|
|
const int winSteps) {
|
|
// info about one main and one boundary column
|
|
RDWT53Column<CHECKED_LOADS> column, boundaryColumn;
|
|
|
|
// index of first row to be transformed
|
|
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
|
|
|
// some threads initialize boundary columns
|
|
boundaryColumn.clear();
|
|
if(threadIdx.x < 3) {
|
|
// First 3 threads also handle boundary columns. Thread #0 gets right
|
|
// column #0, thread #1 get right column #1 and thread #2 left column.
|
|
const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
|
|
|
|
// Thread initializes offset of the boundary column (in shared
|
|
// buffer), first 3 pixels of the column and a loader for this column.
|
|
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
|
|
}
|
|
|
|
// All threads initialize central columns.
|
|
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
|
|
|
|
// horizontally transform first 3 rows
|
|
horizontalTransform(3, 0);
|
|
|
|
// writer of output pixels - initialize it
|
|
const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
|
|
VerticalDWTPixelWriter<int, CHECKED_WRITES> writer;
|
|
writer.init(sizeX, sizeY, outX, firstY);
|
|
|
|
// offset of column (in transform buffer) saved by this thread
|
|
const int outputColumnOffset = buffer.getColumnOffset(threadIdx.x);
|
|
|
|
// (Each iteration assumes that first 3 rows of transform buffer are
|
|
// already loaded with horizontally transformed pixels.)
|
|
for(int w = 0; w < winSteps; w++) {
|
|
// Load another WIN_SIZE_Y lines of this thread's column
|
|
// into the transform buffer.
|
|
loadWindowIntoColumn(in, column);
|
|
|
|
// possibly load boundary columns
|
|
if(threadIdx.x < 3) {
|
|
loadWindowIntoColumn(in, boundaryColumn);
|
|
}
|
|
|
|
// horizontally transform all newly loaded lines
|
|
horizontalTransform(WIN_SIZE_Y, 3);
|
|
|
|
// Using 3 registers, remember current values of last 3 rows
|
|
// of transform buffer. These rows are transformed horizontally
|
|
// only and will be used in next iteration.
|
|
int last3Lines[3];
|
|
last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
|
|
last3Lines[1] = buffer[outputColumnOffset + (WIN_SIZE_Y + 1) * STRIDE];
|
|
last3Lines[2] = buffer[outputColumnOffset + (WIN_SIZE_Y + 2) * STRIDE];
|
|
|
|
// vertically transform all central columns
|
|
buffer.forEachVerticalOdd(outputColumnOffset, Reverse53Update());
|
|
buffer.forEachVerticalEven(outputColumnOffset, Reverse53Predict());
|
|
|
|
// Save all results of current window. Results are in transform buffer
|
|
// at rows from #1 to #(1 + WIN_SIZE_Y). Other rows are invalid now.
|
|
// (They only served as a boundary for vertical RDWT.)
|
|
for(int i = 1; i < (1 + WIN_SIZE_Y); i++) {
|
|
writer.writeInto(out, buffer[outputColumnOffset + i * STRIDE]);
|
|
}
|
|
|
|
// Use last 3 remembered lines as first 3 lines for next iteration.
|
|
// As expected, these lines are already horizontally transformed.
|
|
buffer[outputColumnOffset + 0 * STRIDE] = last3Lines[0];
|
|
buffer[outputColumnOffset + 1 * STRIDE] = last3Lines[1];
|
|
buffer[outputColumnOffset + 2 * STRIDE] = last3Lines[2];
|
|
|
|
// Wait for all writing threads before proceeding to loading new
|
|
// coeficients in next iteration. (Not to overwrite those which
|
|
// are not written yet.)
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
|
|
public:
|
|
/// Main GPU 5/3 RDWT entry point.
|
|
/// @param in input image (5/3 transformed coefficients)
|
|
/// @param out output buffer (for reverse transformed image)
|
|
/// @param sizeX width of the output image
|
|
/// @param sizeY height of the output image
|
|
/// @param winSteps number of sliding window steps
|
|
__device__ static void run(const int * const input, int * const output,
|
|
const int sx, const int sy, const int steps) {
|
|
// prepare instance with buffer in shared memory
|
|
__shared__ RDWT53<WIN_SIZE_X, WIN_SIZE_Y> rdwt53;
|
|
|
|
// Compute limits of this threadblock's block of pixels and use them to
|
|
// determine, whether this threadblock will have to deal with boundary.
|
|
// (1 in next expressions is for radius of impulse response of 5/3 RDWT.)
|
|
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
|
|
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
|
|
const bool atRightBoudary = maxX >= sx;
|
|
const bool atBottomBoudary = maxY >= sy;
|
|
|
|
// Select specialized version of code according to distance of this
|
|
// threadblock's pixels from image boundary.
|
|
if(atBottomBoudary) {
|
|
// near bottom boundary => check both writing and reading
|
|
rdwt53.transform<true, true>(input, output, sx, sy, steps);
|
|
} else if(atRightBoudary) {
|
|
// near right boundary only => check writing only
|
|
rdwt53.transform<false, true>(input, output, sx, sy, steps);
|
|
} else {
|
|
// no nearby boundary => check nothing
|
|
rdwt53.transform<false, false>(input, output, sx, sy, steps);
|
|
}
|
|
}
|
|
|
|
}; // end of class RDWT53
|
|
|
|
|
|
|
|
/// Main GPU 5/3 RDWT entry point.
|
|
/// @param in input image (5/3 transformed coefficients)
|
|
/// @param out output buffer (for reverse transformed image)
|
|
/// @param sizeX width of the output image
|
|
/// @param sizeY height of the output image
|
|
/// @param winSteps number of sliding window steps
|
|
template <int WIN_SX, int WIN_SY>
|
|
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53<WIN_SX, WIN_SY>), 8))
|
|
__global__ void rdwt53Kernel(const int * const in, int * const out,
|
|
const int sx, const int sy, const int steps) {
|
|
RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
|
|
}
|
|
|
|
|
|
|
|
/// Only computes optimal number of sliding window steps,
|
|
/// number of threadblocks and then lanches the 5/3 RDWT kernel.
|
|
/// @tparam WIN_SX width of sliding window
|
|
/// @tparam WIN_SY height of sliding window
|
|
/// @param in input image
|
|
/// @param out output buffer
|
|
/// @param sx width of the input image
|
|
/// @param sy height of the input image
|
|
template <int WIN_SX, int WIN_SY>
|
|
void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
|
|
// compute optimal number of steps of each sliding window
|
|
const int steps = divRndUp(sy, 15 * WIN_SY);
|
|
|
|
// prepare grid size
|
|
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
|
|
|
|
// finally transform this level
|
|
PERF_BEGIN
|
|
rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
|
|
PERF_END(" RDWT53", sx, sy)
|
|
CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
|
|
}
|
|
|
|
|
|
|
|
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
|
/// @param in Input DWT coefficients. Format described in common rules.
|
|
/// Will not be preserved (will be overwritten).
|
|
/// @param out output buffer on GPU - will contain original image
|
|
/// in normalized range [-128, 127].
|
|
/// @param sizeX width of input image (in pixels)
|
|
/// @param sizeY height of input image (in pixels)
|
|
/// @param levels number of recursive DWT levels
|
|
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
|
|
if(levels > 1) {
|
|
// let this function recursively reverse transform deeper levels first
|
|
const int llSizeX = divRndUp(sizeX, 2);
|
|
const int llSizeY = divRndUp(sizeY, 2);
|
|
rdwt53(in, out, llSizeX, llSizeY, levels - 1);
|
|
|
|
// copy reverse transformed LL band from output back into the input
|
|
memCopy(in, out, llSizeX, llSizeY);
|
|
}
|
|
|
|
// select right width of kernel for the size of the image
|
|
if(sizeX >= 960) {
|
|
launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
|
|
} else if (sizeX >= 480) {
|
|
launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
|
|
} else {
|
|
launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
|
|
}
|
|
}
|
|
|
|
|
|
} // end of namespace dwt_cuda
|