/// line 248 the index
/// @file    transform_buffer.h
/// @brief   Buffer with separated even and odd columns and related algorithms.
/// @author  Martin Jirman (207962@mail.muni.cz)
/// @date    2011-01-20 18:33
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
///     * Redistributions of source code must retain the above copyright
///       notice, this list of conditions and the following disclaimer.
///     * Redistributions in binary form must reproduce the above copyright
///       notice, this list of conditions and the following disclaimer in the
///       documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///

#ifndef TRANSFORM_BUFFER_H
#define TRANSFORM_BUFFER_H

#include <cstdio>  // printf, used by the getPrintData() debugging helper

namespace dwt_cuda {

/// Buffer (in shared memory of GPU) where block of input image is stored,
/// but odd and even lines are separated. (Generates less bank conflicts when
/// using lifting schema.) All operations expect SIZE_X threads.
/// Also implements basic building blocks of lifting schema.
///
/// Layout of the underlying array: all horizontally even items come first
/// (SIZE_Y lines of VERTICAL_STRIDE items each), then PADDING unused items,
/// then all horizontally odd items in the same line-major arrangement.
///
/// @tparam T           type of the items stored in the buffer
/// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
///                     a number of threads participating on all operations.)
///                     Must be divisible by 4.
/// @tparam SIZE_Y      height of buffer (total number of lines)
/// @tparam BOUNDARY_X  number of extra pixels at the left and right side
///                     boundary is expected to be smaller than half SIZE_X
///                     Must be divisible by 2.
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
class TransformBuffer {
 public:
  enum {
    /// difference between pointers to two vertical neighbors
    VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
  };

 private:
  enum {
/// number of shared memory banks - needed for correct padding
#ifdef __CUDA_ARCH__
    SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
#else
    SHM_BANKS = 16,  // for host code only - can be anything, won't be used
#endif

    /// size of one of two buffers (odd or even)
    BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,

    /// unused space between two buffers
    PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),

    /// offset of the odd columns buffer from the beginning of data buffer
    ODD_OFFSET = BUFFER_SIZE + PADDING
  };

  /// buffer for both even and odd columns
  /// (even part, then PADDING unused items, then odd part)
  T data[2 * BUFFER_SIZE + PADDING];

  /// Applies specified function to all central elements while also passing
  /// previous and next elements as parameters.
  /// Work is distributed by threadIdx.x in chunks of SIZE_X items: first
  /// count / SIZE_X full chunks are processed by every thread, then the
  /// remaining count % SIZE_X items are processed by the lowest threads only.
  /// @param count       count of central elements to apply function to
  /// @param prevOffset  offset of first central element's predecessor
  /// @param midOffset   offset of first central element
  /// @param nextOffset  offset of first central element's successor
  /// @param function    the function itself; it is called as
  ///                    function(previous, center, next) where previous and
  ///                    next are copies and center refers to the buffer item
  template <typename FUNC>
  __device__ void horizontalStep(const int count, const int prevOffset,
                                 const int midOffset, const int nextOffset,
                                 const FUNC &function) {
    // number of unchecked iterations
    const int STEPS = count / SIZE_X;

    // items remaining after last unchecked iteration
    const int finalCount = count % SIZE_X;

    // offset of items processed in last (checked) iteration
    const int finalOffset = count - finalCount;

    // all threads perform fixed number of iterations ...
    for (int i = 0; i < STEPS; i++) {
      // for(int i = 0; i < 3; i++) {
      const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
      const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
      T &center = data[midOffset + i * SIZE_X + threadIdx.x];
      // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
      function(previous, center, next);  // the real one
    }

    // ... but not all threads participate on final iteration
    if (threadIdx.x < finalCount) {
      const T previous = data[prevOffset + finalOffset + threadIdx.x];
      const T next = data[nextOffset + finalOffset + threadIdx.x];
      T &center = data[midOffset + finalOffset + threadIdx.x];
      // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
      // kaixi
      function(previous, center, next);  // the real one
    }
  }

 public:
  /// Debugging helper: prints index and value of every item of the
  /// underlying array (including the padding between the two halves).
  /// Every thread that calls this prints the whole array.
  __device__ void getPrintData() {
    for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
      // "%f" expects a double argument; convert explicitly so that the
      // output is well defined for integral T too.
      printf(" index: %d data: %f \n ", i, static_cast<double>(data[i]));
    }
  }

  /// Gets offset of the column with given index. Central columns have
  /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
  /// indices and right boundary columns indices start with NUM_LINES.
  /// @param columnIndex  index of column to get pointer to
  /// @return  offset of the first item of column with specified index
  __device__ int getColumnOffset(int columnIndex) {
    columnIndex += BOUNDARY_X;             // skip boundary
    return columnIndex / 2                 // select right column
        + (columnIndex & 1) * ODD_OFFSET;  // select odd or even buffer
  }

  /// Provides access to data of the transform buffer.
  /// @param index  index of the item to work with
  /// @return reference to item at given index
  __device__ T &operator[](const int index) { return data[index]; }

  /// Applies specified function to all horizontally even elements in
  /// specified lines. (Including even elements in boundaries except
  /// first even element in first left boundary.) SIZE_X threads participate
  /// and synchronization is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all even elements
  ///                   parameters: previous (odd) element, the even
  ///                   element itself and finally next (odd) element
  template <typename FUNC>
  __device__ void forEachHorizontalEven(const int firstLine,
                                        const int numLines,
                                        const FUNC &func) {
    // number of even elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
    // offset of first even element
    const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
    // offset of odd predecessor of first even element
    const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
    // offset of odd successor of first even element
    const int nextOffset = prevOffset + 1;

    // if(threadIdx.x == 0) {
    //   printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d
    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
    // }

    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }

  /// Applies given function to all horizontally odd elements in specified
  /// lines. (Including odd elements in boundaries except last odd element
  /// in last right boundary.) SIZE_X threads participate and synchronization
  /// is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all odd elements
  ///                   parameters: previous (even) element, the odd
  ///                   element itself and finally next (even) element
  template <typename FUNC>
  __device__ void forEachHorizontalOdd(const int firstLine,
                                       const int numLines,
                                       const FUNC &func) {
    // number of odd elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
    // offset of even predecessor of first odd element
    const int prevOffset = firstLine * VERTICAL_STRIDE;
    // offset of first odd element
    const int centerOffset = prevOffset + ODD_OFFSET;
    // offset of even successor of first odd element
    const int nextOffset = prevOffset + 1;

    // if(threadIdx.x == 0) {
    //   printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d
    //   nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
    // }

    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }

  /// Applies specified function to all even elements (except element #0)
  /// of given column. Each thread takes care of one column, so there's
  /// no need for synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all even elements
  ///                      parameters: previous (odd) element, the even
  ///                      element itself and finally next (odd) element
  template <typename F>
  __device__ void forEachVerticalEven(const int columnOffset, const F &f) {
    if (SIZE_Y > 3) {  // makes no sense otherwise
      const int steps = SIZE_Y / 2 - 1;
      for (int i = 0; i < steps; i++) {
        // rows 2, 4, 6, ... - each one has both vertical neighbors
        const int row = 2 + i * 2;
        const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
        const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
        f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);

        //--------------- FOR TEST -----------------
        /* __syncthreads();
           if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
             diffOut[2500]++;
             diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
           }
           __syncthreads();
        */
        //--------------- FOR TEST -----------------
      }
    }
  }

  /// Applies specified function to all odd elements of given column.
  /// Each thread takes care of one column, so there's no need for
  /// synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all odd elements
  ///                      parameters: previous (even) element, the odd
  ///                      element itself and finally next (even) element
  template <typename F>
  __device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
    const int steps = (SIZE_Y - 1) / 2;
    for (int i = 0; i < steps; i++) {
      // rows 1, 3, 5, ... - each one has both vertical neighbors
      const int row = i * 2 + 1;
      const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
      const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
      f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);

      //--------------- FOR TEST -----------------
      /* __syncthreads();
         if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
           diffOut[2500]++;
           diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
         }
         __syncthreads();
      */
      //--------------- FOR TEST -----------------
    }
  }

  /// Scales elements at specified lines.
  /// Items are distributed among threads by threadIdx.x in chunks of SIZE_X;
  /// each participating thread scales one item of the even half and the item
  /// at the same position of the odd half.
  /// @param evenScale  scaling factor for horizontally even elements
  /// @param oddScale   scaling factor for horizontally odd elements
  /// @param firstLine  index of first line to scale elements in
  /// @param numLines   number of lines, whose elements should be scaled
  __device__ void scaleHorizontal(const T evenScale, const T oddScale,
                                  const int firstLine, const int numLines) {
    const int offset = firstLine * VERTICAL_STRIDE;
    const int count = numLines * VERTICAL_STRIDE;
    const int steps = count / SIZE_X;
    const int finalCount = count % SIZE_X;
    const int finalOffset = count - finalCount;

    // printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d,
    // finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
    // finalCount, finalOffset);

    // run iterations, where all threads participate
    for (int i = 0; i < steps; i++) {
      data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
      // if(threadIdx.x + i * SIZE_X + offset == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
    }

    // some threads also finish remaining unscaled items
    if (threadIdx.x < finalCount) {
      data[threadIdx.x + finalOffset + offset] *= evenScale;
      // if(threadIdx.x + finalOffset + offset == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      // if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
      //   printf("threadidx 531: %d \n", threadIdx.x);
      // }
      data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
    }
  }

  /// Scales elements in specified column.
  /// @param evenScale     scaling factor for vertically even elements
  /// @param oddScale      scaling factor for vertically odd elements
  /// @param columnOffset  offset of the column to work with
  /// @param numLines      number of lines, whose elements should be scaled
  /// @param firstLine     index of first line to scale elements in
  __device__ void scaleVertical(const T evenScale, const T oddScale,
                                const int columnOffset, const int numLines,
                                const int firstLine) {
    for (int i = firstLine; i < (numLines + firstLine); i++) {
      // parity of the absolute line index selects the scaling factor
      if (i & 1) {
        data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
      } else {
        data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
      }
    }
  }

  //****************For Test(Feb23), test inter parameters*************
  /// @return value of the VERTICAL_STRIDE compile-time constant
  __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
  /// @return value of the SHM_BANKS compile-time constant
  __device__ int getSHM_BANKS() { return SHM_BANKS; }
  /// @return value of the BUFFER_SIZE compile-time constant
  __device__ int getBuffersize() { return BUFFER_SIZE; }
  /// @return value of the PADDING compile-time constant
  __device__ int getPADDING() { return PADDING; }
  /// @return value of the ODD_OFFSET compile-time constant
  __device__ int getODD_OFFSET() { return ODD_OFFSET; }
  //****************For Test(Feb23), test inter parameters*************

};  // end of class TransformBuffer

}  // namespace dwt_cuda

#endif  // TRANSFORM_BUFFER_H