CuPBoP/examples/dwt2d/dwt_cuda/transform_buffer.h

339 lines
15 KiB
C++

/// line 248 the index
/// @file transform_buffer.h
/// @brief Buffer with separated even and odd columns and related algorithms.
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-01-20 18:33
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#ifndef TRANSFORM_BUFFER_H
#define TRANSFORM_BUFFER_H
namespace dwt_cuda {
/// Buffer (in shared memory of GPU) where block of input image is stored,
/// but odd and even columns are separated. (Generates fewer bank conflicts
/// when using the lifting scheme.) All operations expect SIZE_X threads.
/// Also implements basic building blocks of the lifting scheme.
/// @tparam T type of the items stored in the buffer
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
/// a number of threads participating in all operations.)
/// Must be divisible by 4.
/// @tparam SIZE_Y height of buffer (total number of lines)
/// @tparam BOUNDARY_X number of extra pixels at the left and right side;
/// boundary is expected to be smaller than half SIZE_X.
/// Must be divisible by 2.
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
class TransformBuffer {
 public:
  enum {
    /// difference between pointers to two vertical neighbors, i.e. number of
    /// items in one line of either the even or the odd half of the buffer
    VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
  };

 private:
  enum {
    /// number of shared memory banks - needed for correct padding
#ifdef __CUDA_ARCH__
    SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
#else
    SHM_BANKS = 16,  // for host code only - can be anything, won't be used
#endif

    /// size of one of two buffers (odd or even)
    BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,

    /// unused space between two buffers; always in range 1 .. SHM_BANKS and
    /// chosen so that ODD_OFFSET % SHM_BANKS == SHM_BANKS / 2
    PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),

    /// offset of the odd columns buffer from the beginning of data buffer
    ODD_OFFSET = BUFFER_SIZE + PADDING
  };

  /// buffer for both even and odd columns: even half first, then PADDING
  /// unused items, then the odd half
  T data[2 * BUFFER_SIZE + PADDING];

  /// Applies specified function to all central elements while also passing
  /// previous and next elements as parameters. Items are distributed among
  /// threads by threadIdx.x with a step of SIZE_X items. Both neighbors are
  /// read into temporaries before the function gets a writable reference
  /// to the central element.
  /// @param count       count of central elements to apply function to
  /// @param prevOffset  offset of first central element's predecessor
  /// @param midOffset   offset of first central element
  /// @param nextOffset  offset of first central element's successor
  /// @param function    the function itself; called as
  ///                    function(previous, center, next), where center
  ///                    is passed by non-const reference
  template <typename FUNC>
  __device__ void horizontalStep(const int count, const int prevOffset,
                                 const int midOffset, const int nextOffset,
                                 const FUNC &function) {
    // number of unchecked iterations
    const int STEPS = count / SIZE_X;

    // items remaining after last unchecked iteration
    const int finalCount = count % SIZE_X;

    // offset of items processed in last (checked) iteration
    const int finalOffset = count - finalCount;

    // all threads perform fixed number of iterations ...
    for (int i = 0; i < STEPS; i++) {
      const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
      const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
      T &center = data[midOffset + i * SIZE_X + threadIdx.x];
      function(previous, center, next);
    }

    // ... but not all threads participate on final iteration
    if (threadIdx.x < finalCount) {
      const T previous = data[prevOffset + finalOffset + threadIdx.x];
      const T next = data[nextOffset + finalOffset + threadIdx.x];
      T &center = data[midOffset + finalOffset + threadIdx.x];
      function(previous, center, next);
    }
  }

 public:
  /// Debugging aid: prints index and value of every item of the underlying
  /// array (both halves and the padding between them). Every thread which
  /// calls this method prints the whole array.
  __device__ void getPrintData() {
    for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
      // "%f" consumes a double, so convert explicitly: passing an integral
      // T straight through the variadic argument list would be misread.
      printf(" index: %d data: %f \n ", i, static_cast<double>(data[i]));
    }
  }

  /// Gets offset of the column with given index. Central columns have
  /// indices from 0 to SIZE_X - 1, left boundary columns have negative
  /// indices and right boundary columns indices start with SIZE_X.
  /// @param columnIndex  index of column to get pointer to
  /// @return  offset of the first item of column with specified index
  __device__ int getColumnOffset(int columnIndex) {
    columnIndex += BOUNDARY_X;              // skip boundary
    return columnIndex / 2                  // select right column
           + (columnIndex & 1) * ODD_OFFSET;  // select odd or even buffer
  }

  /// Provides access to data of the transform buffer.
  /// @param index  index of the item to work with
  /// @return reference to item at given index
  __device__ T &operator[](const int index) { return data[index]; }

  /// Applies specified function to all horizontally even elements in
  /// specified lines. (Including even elements in boundaries except
  /// first even element in first left boundary.) SIZE_X threads participate
  /// and synchronization is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all even elements
  ///                   parameters: previous (odd) element, the even
  ///                   element itself and finally next (odd) element
  template <typename FUNC>
  __device__ void forEachHorizontalEven(const int firstLine,
                                        const int numLines,
                                        const FUNC &func) {
    // number of even elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;

    // offset of first even element (the very first even item is skipped)
    const int centerOffset = firstLine * VERTICAL_STRIDE + 1;

    // offset of odd predecessor of first even element
    const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;

    // offset of odd successor of first even element
    const int nextOffset = prevOffset + 1;

    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }

  /// Applies given function to all horizontally odd elements in specified
  /// lines. (Including odd elements in boundaries except last odd element
  /// in last right boundary.) SIZE_X threads participate and synchronization
  /// is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all odd elements
  ///                   parameters: previous (even) element, the odd
  ///                   element itself and finally next (even) element
  template <typename FUNC>
  __device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
                                       const FUNC &func) {
    // number of odd elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;

    // offset of even predecessor of first odd element
    const int prevOffset = firstLine * VERTICAL_STRIDE;

    // offset of first odd element
    const int centerOffset = prevOffset + ODD_OFFSET;

    // offset of even successor of first odd element
    const int nextOffset = prevOffset + 1;

    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }

  /// Applies specified function to all even elements (except element #0)
  /// of given column. Each thread takes care of one column, so there's
  /// no need for synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all even elements
  ///                      parameters: previous (odd) element, the even
  ///                      element itself and finally next (odd) element
  template <typename F>
  __device__ void forEachVerticalEven(const int columnOffset, const F &f) {
    if (SIZE_Y > 3) {  // makes no sense otherwise
      const int steps = SIZE_Y / 2 - 1;
      for (int i = 0; i < steps; i++) {
        // rows 2, 4, 6, ... - row 0 has no predecessor and is skipped
        const int row = 2 + i * 2;
        const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
        const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
        f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
      }
    }
  }

  /// Applies specified function to all odd elements of given column.
  /// Each thread takes care of one column, so there's no need for
  /// synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all odd elements
  ///                      parameters: previous (even) element, the odd
  ///                      element itself and finally next (even) element
  template <typename F>
  __device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
    const int steps = (SIZE_Y - 1) / 2;
    for (int i = 0; i < steps; i++) {
      // rows 1, 3, 5, ... - only those which have a successor row
      const int row = i * 2 + 1;
      const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
      const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
      f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
    }
  }

  /// Scales elements at specified lines. Items are distributed among
  /// threads by threadIdx.x with a step of SIZE_X items.
  /// @param evenScale  scaling factor for horizontally even elements
  /// @param oddScale   scaling factor for horizontally odd elements
  /// @param firstLine  index of first line to scale elements in
  /// @param numLines   number of lines, whose elements should be scaled
  __device__ void scaleHorizontal(const T evenScale, const T oddScale,
                                  const int firstLine, const int numLines) {
    const int offset = firstLine * VERTICAL_STRIDE;
    const int count = numLines * VERTICAL_STRIDE;
    const int steps = count / SIZE_X;
    const int finalCount = count % SIZE_X;
    const int finalOffset = count - finalCount;

    // run iterations, where all threads participate
    for (int i = 0; i < steps; i++) {
      data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
      data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
    }

    // some threads also finish remaining unscaled items
    if (threadIdx.x < finalCount) {
      data[threadIdx.x + finalOffset + offset] *= evenScale;
      data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
    }
  }

  /// Scales elements in specified column.
  /// @param evenScale     scaling factor for vertically even elements
  /// @param oddScale      scaling factor for vertically odd elements
  /// @param columnOffset  offset of the column to work with
  /// @param numLines      number of lines, whose elements should be scaled
  /// @param firstLine     index of first line to scale elements in
  __device__ void scaleVertical(const T evenScale, const T oddScale,
                                const int columnOffset, const int numLines,
                                const int firstLine) {
    for (int i = firstLine; i < (numLines + firstLine); i++) {
      if (i & 1) {
        data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
      } else {
        data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
      }
    }
  }

  //**************** Test helpers: expose internal layout constants ********
  /// @return number of items in one line of either half of the buffer
  __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
  /// @return number of shared memory banks assumed for padding
  __device__ int getSHM_BANKS() { return SHM_BANKS; }
  /// @return number of items in one (even or odd) half of the buffer
  __device__ int getBuffersize() { return BUFFER_SIZE; }
  /// @return number of unused items between the even and the odd half
  __device__ int getPADDING() { return PADDING; }
  /// @return offset of the odd half from the beginning of the buffer
  __device__ int getODD_OFFSET() { return ODD_OFFSET; }
  //**************** Test helpers: expose internal layout constants ********
};  // end of class TransformBuffer
} // namespace dwt_cuda
#endif // TRANSFORM_BUFFER_H