CuPBoP/examples/dwt2d/dwt_cuda/transform_buffer.h

/// line 248 the index
/// @file    transform_buffer.h
/// @brief   Buffer with separated even and odd columns and related algorithms.
/// @author  Martin Jirman (207962@mail.muni.cz)
/// @date    2011-01-20 18:33
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
/// 
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
/// 
///     * Redistributions of source code must retain the above copyright
///       notice, this list of conditions and the following disclaimer.
///     * Redistributions in binary form must reproduce the above copyright
///       notice, this list of conditions and the following disclaimer in the
///       documentation and/or other materials provided with the distribution.
/// 
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///


#ifndef TRANSFORM_BUFFER_H
#define	TRANSFORM_BUFFER_H


namespace dwt_cuda {
  
  
  /// Buffer (in shared memory of GPU) where block of input image is stored,
  /// but odd and even lines are separated. (Generates less bank conflicts when 
  /// using lifting schema.) All operations expect SIZE_X threads.
  /// Also implements basic building blocks of lifting schema.
  /// @tparam SIZE_X      width of the buffer excluding two boundaries (Also
  ///                     a number of threads participating on all operations.)
  ///                     Must be divisible by 4.
  /// @tparam SIZE_Y      height of buffer (total number of lines)
  /// @tparam BOUNDARY_X  number of extra pixels at the left and right side
  ///                     boundary is expected to be smaller than half SIZE_X
  ///                     Must be divisible by 2.
  template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
  class TransformBuffer {
  public:
    enum {
      /// difference between pointers to two vertical neigbors
      VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
    };
    
  private:
    enum {
      /// number of shared memory banks - needed for correct padding
      #ifdef __CUDA_ARCH__
      SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
      #else
      SHM_BANKS = 16,  // for host code only - can be anything, won't be used
      #endif
      
      /// size of one of two buffers (odd or even)
      BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
      
      /// unused space between two buffers
      PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
      
      /// offset of the odd columns buffer from the beginning of data buffer
      ODD_OFFSET = BUFFER_SIZE + PADDING,
    };

    /// buffer for both even and odd columns
    T data[2 * BUFFER_SIZE + PADDING];
    
    
    /// Applies specified function to all central elements while also passing
    /// previous and next elements as parameters.
    /// @param count         count of central elements to apply function to
    /// @param prevOffset    offset of first central element
    /// @param midOffset     offset of first central element's predecessor
    /// @param nextOffset    offset of first central element's successor
    /// @param function      the function itself
    template <typename FUNC>
    __device__ void horizontalStep(const int count, const int prevOffset, 
                                   const int midOffset, const int nextOffset,
                                   const FUNC & function) {
      // number of unchecked iterations
      const int STEPS = count / SIZE_X;
      
      // items remaining after last unchecked iteration
      const int finalCount = count % SIZE_X; 
      
      // offset of items processed in last (checked) iteration
      const int finalOffset = count - finalCount;  
      
      // all threads perform fixed number of iterations ...
      for(int i = 0; i < STEPS; i++) {
      // for(int i = 0; i < 3; i++) {
        const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
        const T next     = data[nextOffset + i * SIZE_X + threadIdx.x];
        T & center       = data[midOffset  + i * SIZE_X + threadIdx.x];
        // function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
        function(previous, center, next);// the real one
      }
      
      // ... but not all threads participate on final iteration
      if(threadIdx.x < finalCount) {
        const T previous = data[prevOffset + finalOffset + threadIdx.x];
        const T next     = data[nextOffset + finalOffset + threadIdx.x];
        T & center = data[midOffset + finalOffset + threadIdx.x];
        // function(previous, center, (nextOffset+finalOffset+threadIdx.x));
        // kaixi
        function(previous, center, next);//the real one
      }
    }

  public:

    __device__ void getPrintData() {
      //
      for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {     
          printf(" index: %d  data: %f \n ", i ,data[i]);  
      }

   }

    
    /// Gets offset of the column with given index. Central columns have 
    /// indices from 0 to NUM_LINES - 1, left boundary columns have negative 
    /// indices and right boundary columns indices start with NUM_LINES.
    /// @param columnIndex  index of column to get pointer to
    /// @return  offset of the first item of column with specified index
    __device__ int getColumnOffset(int columnIndex) {
      columnIndex += BOUNDARY_X;             // skip boundary
      return columnIndex / 2                 // select right column
          + (columnIndex & 1) * ODD_OFFSET;  // select odd or even buffer
    }
    
    
    /// Provides access to data of the transform buffer.
    /// @param index  index of the item to work with
    /// @return reference to item at given index
    __device__ T & operator[] (const int index) {
      return data[index];
    }
    
            
    /// Applies specified function to all horizontally even elements in 
    /// specified lines. (Including even elements in boundaries except 
    /// first even element in first left boundary.) SIZE_X threads participate 
    /// and synchronization is needed before result can be used.
    /// @param firstLine  index of first line
    /// @param numLines   count of lines
    /// @param func       function to be applied on all even elements
    ///                   parameters: previous (odd) element, the even
    ///                   element itself and finally next (odd) element
    template <typename FUNC>
    __device__ void forEachHorizontalEven(const int firstLine,
                                          const int numLines,
                                          const FUNC & func) {
      // number of even elemens to apply function to
      const int count = numLines * VERTICAL_STRIDE - 1;
      // offset of first even element
      const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
      // offset of odd predecessor of first even element
      const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
      // offset of odd successor of first even element
      const int nextOffset = prevOffset + 1;

      // if(threadIdx.x == 0) {

      //   printf("forEachHorizontalEven count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
      // }
      
      // call generic horizontal step function
      horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
    }
    
    
    /// Applies given function to all horizontally odd elements in specified
    /// lines. (Including odd elements in boundaries except last odd element
    /// in last right boundary.) SIZE_X threads participate and synchronization
    /// is needed before result can be used.
    /// @param firstLine  index of first line
    /// @param numLines   count of lines
    /// @param func       function to be applied on all odd elements
    ///                   parameters: previous (even) element, the odd
    ///                   element itself and finally next (even) element
    template <typename FUNC>
    __device__ void forEachHorizontalOdd(const int firstLine,
                                         const int numLines,
                                         const FUNC & func) {
      // numbet of odd elements to apply function to
      const int count = numLines * VERTICAL_STRIDE - 1;
      // offset of even predecessor of first odd element
      const int prevOffset = firstLine * VERTICAL_STRIDE;
      // offset of first odd element
      const int centerOffset = prevOffset + ODD_OFFSET;
      // offset of even successor of first odd element
      const int nextOffset = prevOffset + 1;

      //  if(threadIdx.x == 0) {
      //   printf("forEachHorizontalOdd count  %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
      // }
      
      
      // call generic horizontal step function
      horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
    }
    
    
    /// Applies specified function to all even elements (except element #0)
    /// of given column. Each thread takes care of one column, so there's 
    /// no need for synchronization.
    /// @param columnOffset  offset of thread's column
    /// @param f             function to be applied on all even elements
    ///                      parameters: previous (odd) element, the even
    ///                      element itself and finally next (odd) element
    template <typename F>
    __device__ void forEachVerticalEven(const int columnOffset, const F & f) {
      if(SIZE_Y > 3) { // makes no sense otherwise
        const int steps = SIZE_Y / 2 - 1;
        for(int i = 0; i < steps; i++) {
          const int row = 2 + i * 2;
          const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
          const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
          f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next);
		  
		  //--------------- FOR TEST -----------------
/*		__syncthreads();
		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
			diffOut[2500]++;
			diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
		}	
		__syncthreads();
*/		  //--------------- FOR TEST -----------------
		  
		  
        }
      }
    }
    
    
    /// Applies specified function to all odd elements of given column.
    /// Each thread takes care of one column, so there's no need for
    /// synchronization.
    /// @param columnOffset  offset of thread's column
    /// @param f             function to be applied on all odd elements
    ///                      parameters: previous (even) element, the odd
    ///                      element itself and finally next (even) element
    template <typename F>
    __device__ void forEachVerticalOdd(const int columnOffset, const F & f) {
      const int steps = (SIZE_Y - 1) / 2;
      for(int i = 0; i < steps; i++) {
        const int row = i * 2 + 1;
        const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
        const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];

		f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
		
		
		  //--------------- FOR TEST -----------------
/*		__syncthreads();
		if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
			diffOut[2500]++;
			diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
		}

		__syncthreads();
*/		  //--------------- FOR TEST -----------------
      }
    }
    
    
    /// Scales elements at specified lines.
    /// @param evenScale  scaling factor for horizontally even elements
    /// @param oddScale   scaling factor for horizontally odd elements
    /// @param numLines   number of lines, whose elements should be scaled
    /// @param firstLine  index of first line to scale elements in
    __device__ void scaleHorizontal(const T evenScale, const T oddScale,
                                    const int firstLine, const int numLines) {
      const int offset = firstLine * VERTICAL_STRIDE;
      const int count = numLines * VERTICAL_STRIDE;
      const int steps = count / SIZE_X;
      const int finalCount = count % SIZE_X;
      const int finalOffset = count - finalCount;

      // printf("scaleHorizontal sizeX: %d  offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
      
      // run iterations, whete all threads participate
      for(int i = 0; i < steps; i++) {
        data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
        // if(threadIdx.x + i * SIZE_X + offset == 531) {
        //   printf("threadidx 531: %d \n", threadIdx.x);
        // }
        // if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
        //   printf("threadidx 531: %d \n", threadIdx.x);
        // }
        data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
      }
      
      // some threads also finish remaining unscaled items
      if(threadIdx.x < finalCount) {
        data[threadIdx.x + finalOffset + offset] *= evenScale;
        // if(threadIdx.x + finalOffset + offset == 531) {
        //   printf("threadidx 531: %d \n", threadIdx.x);
        // }
        //  if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
        //   printf("threadidx 531: %d \n", threadIdx.x);
        // }
        data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
      }

    }
    
    
    /// Scales elements in specified column.
    /// @param evenScale     scaling factor for vertically even elements
    /// @param oddScale      scaling factor for vertically odd elements
    /// @param columnOffset  offset of the column to work with
    /// @param numLines      number of lines, whose elements should be scaled
    /// @param firstLine     index of first line to scale elements in
    __device__ void scaleVertical(const T evenScale, const T oddScale,
                                  const int columnOffset, const int numLines,
                                  const int firstLine) {
      for(int i = firstLine; i < (numLines + firstLine); i++) {
        if(i & 1) {
          data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
        } else {
          data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
        }
      }
    }
	
	
	//****************For Test(Feb23), test inter parameters*************
	__device__ int getVERTICAL_STRIDE(){
		return VERTICAL_STRIDE;
	}
	__device__ int getSHM_BANKS(){
		return SHM_BANKS;
	}
	__device__ int  getBuffersize(){		
		return BUFFER_SIZE;
	}
	__device__ int getPADDING(){
		return PADDING;
	}
	__device__ int getODD_OFFSET(){
		return ODD_OFFSET;
	}


    //****************For Test(Feb23), test inter parameters*************
	
	
  };  // end of class TransformBuffer


} // namespace dwt_cuda


#endif	// TRANSFORM_BUFFER_H
add dwt2d example and fixes 2022-05-22 03:55:49 +08:00			`/// line 248 the index`
			`/// @file transform_buffer.h`
			`/// @brief Buffer with separated even and odd columns and related algorithms.`
			`/// @author Martin Jirman (207962@mail.muni.cz)`
			`/// @date 2011-01-20 18:33`
			`///`
			`///`
			`/// Copyright (c) 2011 Martin Jirman`
			`/// All rights reserved.`
			`///`
			`/// Redistribution and use in source and binary forms, with or without`
			`/// modification, are permitted provided that the following conditions are met:`
			`///`
			`/// * Redistributions of source code must retain the above copyright`
			`/// notice, this list of conditions and the following disclaimer.`
			`/// * Redistributions in binary form must reproduce the above copyright`
			`/// notice, this list of conditions and the following disclaimer in the`
			`/// documentation and/or other materials provided with the distribution.`
			`///`
			`/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE`
			`/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`/// POSSIBILITY OF SUCH DAMAGE.`
			`///`


			`#ifndef TRANSFORM_BUFFER_H`
			`#define TRANSFORM_BUFFER_H`


			`namespace dwt_cuda {`


			`/// Buffer (in shared memory of GPU) where block of input image is stored,`
			`/// but odd and even lines are separated. (Generates less bank conflicts when`
			`/// using lifting schema.) All operations expect SIZE_X threads.`
			`/// Also implements basic building blocks of lifting schema.`
			`/// @tparam SIZE_X width of the buffer excluding two boundaries (Also`
			`/// a number of threads participating on all operations.)`
			`/// Must be divisible by 4.`
			`/// @tparam SIZE_Y height of buffer (total number of lines)`
			`/// @tparam BOUNDARY_X number of extra pixels at the left and right side`
			`/// boundary is expected to be smaller than half SIZE_X`
			`/// Must be divisible by 2.`
			`template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>`
			`class TransformBuffer {`
			`public:`
			`enum {`
			`/// difference between pointers to two vertical neigbors`
			`VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)`
			`};`

			`private:`
			`enum {`
			`/// number of shared memory banks - needed for correct padding`
			`#ifdef __CUDA_ARCH__`
			`SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),`
			`#else`
			`SHM_BANKS = 16, // for host code only - can be anything, won't be used`
			`#endif`

			`/// size of one of two buffers (odd or even)`
			`BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,`

			`/// unused space between two buffers`
			`PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),`

			`/// offset of the odd columns buffer from the beginning of data buffer`
			`ODD_OFFSET = BUFFER_SIZE + PADDING,`
			`};`

			`/// buffer for both even and odd columns`
			`T data[2 * BUFFER_SIZE + PADDING];`



			`/// Applies specified function to all central elements while also passing`
			`/// previous and next elements as parameters.`
			`/// @param count count of central elements to apply function to`
			`/// @param prevOffset offset of first central element`
			`/// @param midOffset offset of first central element's predecessor`
			`/// @param nextOffset offset of first central element's successor`
			`/// @param function the function itself`
			`template <typename FUNC>`
			`__device__ void horizontalStep(const int count, const int prevOffset,`
			`const int midOffset, const int nextOffset,`
			`const FUNC & function) {`
			`// number of unchecked iterations`
			`const int STEPS = count / SIZE_X;`

			`// items remaining after last unchecked iteration`
			`const int finalCount = count % SIZE_X;`

			`// offset of items processed in last (checked) iteration`
			`const int finalOffset = count - finalCount;`

			`// all threads perform fixed number of iterations ...`
			`for(int i = 0; i < STEPS; i++) {`
			`// for(int i = 0; i < 3; i++) {`
			`const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];`
			`const T next = data[nextOffset + i * SIZE_X + threadIdx.x];`
			`T & center = data[midOffset + i * SIZE_X + threadIdx.x];`
			`// function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));`
			`function(previous, center, next);// the real one`
			`}`

			`// ... but not all threads participate on final iteration`
			`if(threadIdx.x < finalCount) {`
			`const T previous = data[prevOffset + finalOffset + threadIdx.x];`
			`const T next = data[nextOffset + finalOffset + threadIdx.x];`
			`T & center = data[midOffset + finalOffset + threadIdx.x];`
			`// function(previous, center, (nextOffset+finalOffset+threadIdx.x));`
			`// kaixi`
			`function(previous, center, next);//the real one`
			`}`
			`}`

			`public:`

			`__device__ void getPrintData() {`
			`//`
			`for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {`
			`printf(" index: %d data: %f \n ", i ,data[i]);`
			`}`

			`}`


			`/// Gets offset of the column with given index. Central columns have`
			`/// indices from 0 to NUM_LINES - 1, left boundary columns have negative`
			`/// indices and right boundary columns indices start with NUM_LINES.`
			`/// @param columnIndex index of column to get pointer to`
			`/// @return offset of the first item of column with specified index`
			`__device__ int getColumnOffset(int columnIndex) {`
			`columnIndex += BOUNDARY_X; // skip boundary`
			`return columnIndex / 2 // select right column`
			`+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer`
			`}`


			`/// Provides access to data of the transform buffer.`
			`/// @param index index of the item to work with`
			`/// @return reference to item at given index`
			`__device__ T & operator[] (const int index) {`
			`return data[index];`
			`}`


			`/// Applies specified function to all horizontally even elements in`
			`/// specified lines. (Including even elements in boundaries except`
			`/// first even element in first left boundary.) SIZE_X threads participate`
			`/// and synchronization is needed before result can be used.`
			`/// @param firstLine index of first line`
			`/// @param numLines count of lines`
			`/// @param func function to be applied on all even elements`
			`/// parameters: previous (odd) element, the even`
			`/// element itself and finally next (odd) element`
			`template <typename FUNC>`
			`__device__ void forEachHorizontalEven(const int firstLine,`
			`const int numLines,`
			`const FUNC & func) {`
			`// number of even elemens to apply function to`
			`const int count = numLines * VERTICAL_STRIDE - 1;`
			`// offset of first even element`
			`const int centerOffset = firstLine * VERTICAL_STRIDE + 1;`
			`// offset of odd predecessor of first even element`
			`const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;`
			`// offset of odd successor of first even element`
			`const int nextOffset = prevOffset + 1;`

			`// if(threadIdx.x == 0) {`

			`// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);`
			`// }`

			`// call generic horizontal step function`
			`horizontalStep(count, prevOffset, centerOffset, nextOffset, func);`
			`}`


			`/// Applies given function to all horizontally odd elements in specified`
			`/// lines. (Including odd elements in boundaries except last odd element`
			`/// in last right boundary.) SIZE_X threads participate and synchronization`
			`/// is needed before result can be used.`
			`/// @param firstLine index of first line`
			`/// @param numLines count of lines`
			`/// @param func function to be applied on all odd elements`
			`/// parameters: previous (even) element, the odd`
			`/// element itself and finally next (even) element`
			`template <typename FUNC>`
			`__device__ void forEachHorizontalOdd(const int firstLine,`
			`const int numLines,`
			`const FUNC & func) {`
			`// numbet of odd elements to apply function to`
			`const int count = numLines * VERTICAL_STRIDE - 1;`
			`// offset of even predecessor of first odd element`
			`const int prevOffset = firstLine * VERTICAL_STRIDE;`
			`// offset of first odd element`
			`const int centerOffset = prevOffset + ODD_OFFSET;`
			`// offset of even successor of first odd element`
			`const int nextOffset = prevOffset + 1;`

			`// if(threadIdx.x == 0) {`
			`// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);`
			`// }`


			`// call generic horizontal step function`
			`horizontalStep(count, prevOffset, centerOffset, nextOffset, func);`
			`}`


			`/// Applies specified function to all even elements (except element #0)`
			`/// of given column. Each thread takes care of one column, so there's`
			`/// no need for synchronization.`
			`/// @param columnOffset offset of thread's column`
			`/// @param f function to be applied on all even elements`
			`/// parameters: previous (odd) element, the even`
			`/// element itself and finally next (odd) element`
			`template <typename F>`
			`__device__ void forEachVerticalEven(const int columnOffset, const F & f) {`
			`if(SIZE_Y > 3) { // makes no sense otherwise`
			`const int steps = SIZE_Y / 2 - 1;`
			`for(int i = 0; i < steps; i++) {`
			`const int row = 2 + i * 2;`
			`const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];`
			`const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];`
			`f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next);`

			`//--------------- FOR TEST -----------------`
			`/* __syncthreads();`
			`if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){`
			`diffOut[2500]++;`
			`diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];`
			`}`
			`__syncthreads();`
			`*/ //--------------- FOR TEST -----------------`


			`}`
			`}`
			`}`


			`/// Applies specified function to all odd elements of given column.`
			`/// Each thread takes care of one column, so there's no need for`
			`/// synchronization.`
			`/// @param columnOffset offset of thread's column`
			`/// @param f function to be applied on all odd elements`
			`/// parameters: previous (even) element, the odd`
			`/// element itself and finally next (even) element`
			`template <typename F>`
			`__device__ void forEachVerticalOdd(const int columnOffset, const F & f) {`
			`const int steps = (SIZE_Y - 1) / 2;`
			`for(int i = 0; i < steps; i++) {`
			`const int row = i * 2 + 1;`
			`const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];`
			`const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];`

			`f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);`


			`//--------------- FOR TEST -----------------`
			`/* __syncthreads();`
			`if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){`
			`diffOut[2500]++;`
			`diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];`
			`}`

			`__syncthreads();`
			`*/ //--------------- FOR TEST -----------------`
			`}`
			`}`



			`/// Scales elements at specified lines.`
			`/// @param evenScale scaling factor for horizontally even elements`
			`/// @param oddScale scaling factor for horizontally odd elements`
			`/// @param numLines number of lines, whose elements should be scaled`
			`/// @param firstLine index of first line to scale elements in`
			`__device__ void scaleHorizontal(const T evenScale, const T oddScale,`
			`const int firstLine, const int numLines) {`
			`const int offset = firstLine * VERTICAL_STRIDE;`
			`const int count = numLines * VERTICAL_STRIDE;`
			`const int steps = count / SIZE_X;`
			`const int finalCount = count % SIZE_X;`
			`const int finalOffset = count - finalCount;`

			`// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);`

			`// run iterations, whete all threads participate`
			`for(int i = 0; i < steps; i++) {`
			`data[threadIdx.x + i * SIZE_X + offset] *= evenScale;`
			`// if(threadIdx.x + i * SIZE_X + offset == 531) {`
			`// printf("threadidx 531: %d \n", threadIdx.x);`
			`// }`
			`// if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {`
			`// printf("threadidx 531: %d \n", threadIdx.x);`
			`// }`
			`data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;`
			`}`

			`// some threads also finish remaining unscaled items`
			`if(threadIdx.x < finalCount) {`
			`data[threadIdx.x + finalOffset + offset] *= evenScale;`
			`// if(threadIdx.x + finalOffset + offset == 531) {`
			`// printf("threadidx 531: %d \n", threadIdx.x);`
			`// }`
			`// if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {`
			`// printf("threadidx 531: %d \n", threadIdx.x);`
			`// }`
			`data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;`
			`}`

			`}`


			`/// Scales elements in specified column.`
			`/// @param evenScale scaling factor for vertically even elements`
			`/// @param oddScale scaling factor for vertically odd elements`
			`/// @param columnOffset offset of the column to work with`
			`/// @param numLines number of lines, whose elements should be scaled`
			`/// @param firstLine index of first line to scale elements in`
			`__device__ void scaleVertical(const T evenScale, const T oddScale,`
			`const int columnOffset, const int numLines,`
			`const int firstLine) {`
			`for(int i = firstLine; i < (numLines + firstLine); i++) {`
			`if(i & 1) {`
			`data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;`
			`} else {`
			`data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;`
			`}`
			`}`
			`}`


			`//**************For Test(Feb23), test inter parameters***********`
			`__device__ int getVERTICAL_STRIDE(){`
			`return VERTICAL_STRIDE;`
			`}`
			`__device__ int getSHM_BANKS(){`
			`return SHM_BANKS;`
			`}`
			`__device__ int getBuffersize(){`
			`return BUFFER_SIZE;`
			`}`
			`__device__ int getPADDING(){`
			`return PADDING;`
			`}`
			`__device__ int getODD_OFFSET(){`
			`return ODD_OFFSET;`
			`}`


			`//**************For Test(Feb23), test inter parameters***********`


			`}; // end of class TransformBuffer`


			`} // namespace dwt_cuda`


			`#endif // TRANSFORM_BUFFER_H`