CuPBoP/examples/dwt2d/dwt_cuda/fdwt97.cu

///
/// @file    fdwt97.cu
/// @brief   CUDA implementation of forward 9/7 2D DWT.
/// @author  Martin Jirman (207962@mail.muni.cz)
/// @date    2011-01-20 13:18
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
///     * Redistributions of source code must retain the above copyright
///       notice, this list of conditions and the following disclaimer.
///     * Redistributions in binary form must reproduce the above copyright
///       notice, this list of conditions and the following disclaimer in the
///       documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///


#include "common.h"
#include "transform_buffer.h"
#include "io.h"


namespace dwt_cuda {


  /// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
  /// of specified size. Template arguments specify this size.
  /// @tparam WIN_SIZE_X  width of sliding window
  /// @tparam WIN_SIZE_Y  height of sliding window
  template <int WIN_SIZE_X, int WIN_SIZE_Y>
  class FDWT97 {
  private:
    /// Type of shared memory buffer used for 9/7 DWT.
    typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> FDWT97Buffer;

    /// Actual shared buffer used for forward 9/7 DWT.
    FDWT97Buffer buffer;

    /// Difference of indices of two vertically neighboring items in buffer.
    enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE };


    /// One thread's info about loading input image
    /// @tparam CHECKED  true if loader should check for image boundaries
    template <bool CHECKED>
    struct FDWT97ColumnLoadingInfo {
      /// Loader of pixels from some input image.
      VerticalDWTPixelLoader<float, CHECKED> loader;

      /// Offset of column loaded by loader. (Offset in shared buffer.)
      int offset;
    };


    /// Horizontal 9/7 FDWT on specified lines of transform buffer.
    /// @param lines      number of lines to be transformed
    /// @param firstLine  index of the first line to be transformed
    __device__ void horizontalFDWT97(const int lines, const int firstLine) {
      __syncthreads();

      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1));
      __syncthreads();
      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1));
      __syncthreads();
      buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2));
      __syncthreads();
      buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2));
      __syncthreads();

      buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines);

      __syncthreads();

    }


    /// Initializes one column of shared transform buffer with 7 input pixels.
    /// Those 7 pixels will not be transformed. Also initializes given loader.
    /// @tparam CHECKED     true if loader should check for image boundaries
    /// @param column       (uninitialized) object for loading input pixels
    /// @param columnIndex  index (not offset!) of the column to be loaded
    ///                     (relative to threadblock's first column)
    /// @param input        pointer to input image in GPU memory
    /// @param sizeX        width of the input image
    /// @param sizeY        height of the input image
    /// @param firstY       index of first row to be loaded from image
    template <bool CHECKED>
    __device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
                              const int columnIndex, const float * const input,
                              const int sizeX, const int sizeY,
                              const int firstY) {
      // get offset of the column with index 'columnIndex'
      column.offset = buffer.getColumnOffset(columnIndex);

      // printf(" offset: %d  , threadIdx: %d, blockIdx.y %d\n ", column.offset, threadIdx.x, blockIdx.y);

      // x-coordinate of the first pixel to be loaded by given loader
      const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;

      if(blockIdx.y == 0) {
        // topmost block - apply mirroring rules when loading first 7 rows
        column.loader.init(sizeX, sizeY, firstX, firstY);

        // load pixels in mirrored way
        buffer[column.offset + 4 * STRIDE] = column.loader.loadFrom(input);
        buffer[column.offset + 3 * STRIDE] =
        buffer[column.offset + 5 * STRIDE] = column.loader.loadFrom(input);
        buffer[column.offset + 2 * STRIDE] =
        buffer[column.offset + 6 * STRIDE] = column.loader.loadFrom(input);
        buffer[column.offset + 1 * STRIDE] = column.loader.loadFrom(input);
        buffer[column.offset + 0 * STRIDE] = column.loader.loadFrom(input);

        // reinitialize loader to start with pixel #3 again
        column.loader.init(sizeX, sizeY, firstX, firstY + 3);

      } else {
        // non-topmost row - regular loading:
        column.loader.init(sizeX, sizeY, firstX, firstY - 4);

        // load 7 rows into the transform buffer
        for(int i = 0; i < 7; i++) {
          buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);

        }
      }
      // Now, the next pixel, which will be loaded by loader, is pixel #3.
    }


    /// Loads another WIN_SIZE_Y pixels into given column using given loader.
    /// @tparam CHECKED  true if loader should check for image boundaries
    /// @param input     input image to load from
    /// @param column    loader and offset of loaded column in shared buffer
    template <bool CHECKED>
    inline __device__ void loadWindowIntoColumn(const float * const input,
                                  FDWT97ColumnLoadingInfo<CHECKED> & column) {
      for(int i = 7; i < (7 + WIN_SIZE_Y); i++) {
        buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
      }
    }


    /// Main GPU 9/7 FDWT entry point.
    /// @tparam CHECK_LOADS   true if boundaries should be checked when loading
    /// @tparam CHECK_WRITES  true if boundaries should be checked when writing
    /// @param in        input image
    /// @param out       output buffer
    /// @param sizeX     width of the input image
    /// @param sizeY     height of the input image
    /// @param winSteps  number of steps of sliding window
    template <bool CHECK_LOADS, bool CHECK_WRITES>
    __device__ void transform(const float * const in, float * const out,
                              const int sizeX, const int sizeY,
                              const int winSteps) {
      // info about columns loaded by this thread: one main column and possibly
      // one boundary column. (Only some threads load some boundary column.)
      FDWT97ColumnLoadingInfo<CHECK_LOADS> loadedColumn;
      FDWT97ColumnLoadingInfo<CHECK_LOADS> boundaryColumn;

      // Initialize first 7 lines of transform buffer.
      const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
      initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY);

      // Some threads initialize boundary columns.
      boundaryColumn.offset = 0;
      boundaryColumn.loader.clear();
      if(threadIdx.x < 7) {
        // each thread among first 7 ones gets index of one of boundary columns
        const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7);

        // Thread initializes offset of the boundary column (in shared buffer),
        // first 7 pixels of the column and a loader for this column.
        initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY);
      }

      // horizontally transform first 7 rows in all columns
      horizontalFDWT97(7, 0);

      // Index of column handled by this thread. (First half of threads handle
      // even columns and others handle odd columns.)
      const int outColumnIndex = parityIdx<WIN_SIZE_X>();

      // writer of output linear bands - initialize it
      const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
      VerticalDWTBandWriter<float, CHECK_WRITES> writer;
      writer.init(sizeX, sizeY, firstX, firstY);

      // transform buffer offset of column transformed and saved by this thread
      const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);

      // (Each iteration of this loop assumes that first 7 rows of transform
      // buffer are already loaded with horizontally transformed coefficients.)
      for(int w = 0; w < winSteps; w++) {
        // Load another WIN_SIZE_Y lines of thread's column into the buffer.
        loadWindowIntoColumn(in, loadedColumn);

        // some threads also load boundary columns
        if(threadIdx.x < 7) {
          loadWindowIntoColumn(in, boundaryColumn);
        }

        // horizontally transform all newly loaded lines
        horizontalFDWT97(WIN_SIZE_Y, 7);

        // Using 7 registers, remember current values of last 7 rows of
        // transform buffer. These rows are transformed horizontally only
        // and will be used in next iteration.
        float last7Lines[7];
        for(int i = 0; i < 7; i++) {
          last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
        }

        // vertically transform all central columns (do not scale yet)
        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1));
        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1));
        buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2));
        buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2));

        // Save all results of current window. Results are in transform buffer
        // at rows from #4 to #(4 + WIN_SIZE_Y). Other rows are invalid now.
        // (They only served as a boundary for vertical FDWT.)

        for(int i = 4; i < (4 + WIN_SIZE_Y); i += 2) {
          const int index = outColumnOffset + i * STRIDE;
          // Write low coefficients from column into low band ...
          writer.writeLowInto(out, buffer[index] * scale97Div);
          // ... and high coeficients into the high band.
          writer.writeHighInto(out, buffer[index + STRIDE] * scale97Mul);
        }

        // Use last 7 remembered lines as first 7 lines for next iteration.
        // As expected, these lines are already horizontally transformed.
        for(int i = 0; i < 7; i++) {
          buffer[outColumnOffset + i * STRIDE] = last7Lines[i];

        }

        // Wait for all writing threads before proceeding to loading new
        // pixels in next iteration. (Not to overwrite those which
        // are not written yet.)
        __syncthreads();
      }

    }


  public:
    /// Runs one of specialized variants of 9/7 FDWT according to distance of
    /// processed pixels to image boudnary. Some variants do not check for
    /// boudnary and thus are slightly faster.
    /// @param in     input image
    /// @param out    output buffer
    /// @param sx     width of the input image
    /// @param sy     height of the input image
    /// @param steps  number of steps of sliding window
    __device__ static void run(const float * const input, float * const output,
                               const int sx, const int sy, const int steps) {
      // object with transform buffer in shared memory
      __shared__ FDWT97<WIN_SIZE_X, WIN_SIZE_Y> fdwt97;

      // Compute limits of this threadblock's block of pixels and use them to
      // determine, whether this threadblock will have to deal with boundary.
      // (3 in next expressions is for radius of impulse response of 9/7 FDWT.)
      const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
      const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
      const bool atRightBoudary = maxX >= sx;
      const bool atBottomBoudary = maxY >= sy;

      // Select specialized version of code according to distance of this
      // threadblock's pixels from image boundary.
      if(atBottomBoudary) {
        // near bottom boundary => check both writing and reading
        // printf("\n atBottomBoudary \n ");
        fdwt97.transform<true, true>(input, output, sx, sy, steps);
      } else if(atRightBoudary) {

        // near right boundary only => check writing only
        fdwt97.transform<false, true>(input, output, sx, sy, steps);
      } else {

        // no nearby boundary => check nothing
        fdwt97.transform<false, false>(input, output, sx, sy, steps);
      }
    }

  }; // end of class FDWT97


  /// Main GPU 9/7 FDWT entry point.
  /// @param input   input image
  /// @parma output  output buffer
  /// @param sx      width of the input image
  /// @param sy      height of the input image
  /// @param steps   number of steps of sliding window
  template <int WIN_SX, int WIN_SY>
  __launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97<WIN_SX, WIN_SY>), 8))
  __global__ void fdwt97Kernel(const float * const input, float * const output,
                               const int sx, const int sy, const int steps) {
    // Excuse me, dear reader of this code - this call have to be here. If you
    // try to simply put contents of following method right here, CUDA compiler
    // (version 3.2) will spit tons of nonsense messy errors ...
    // Hope they will not break it even more in future releases.
    FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
  }


  /// Only computes optimal number of sliding window steps,
  /// number of threadblocks and then lanches the 9/7 FDWT kernel.
  /// @tparam WIN_SX  width of sliding window
  /// @tparam WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
  /// @param sx       width of the input image
  /// @param sy       height of the input image
  template <int WIN_SX, int WIN_SY>
  void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
    // compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);

    // prepare grid size
    dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
    printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);

    // run kernel, possibly measure time and finally check the call
    PERF_BEGIN
    fdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
    PERF_END("        FDWT97", sx, sy)
    CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
  }


  /// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
  /// @param in      Input DWT coefficients. Should be normalized (in range
  ///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
  /// @param out     output buffer on GPU - format specified in common rules
  /// @param sizeX   width of input image (in pixels)
  /// @param sizeY   height of input image (in pixels)
  /// @param levels  number of recursive DWT levels
  void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
    // select right width of kernel for the size of the image
    if(sizeX >= 960) {
      launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
    } else if (sizeX >= 480) {
      launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
    } else {
      launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
    }

    // if this was not the last level, continue recursively with other levels
    if(levels > 1) {
      // copy output's LL band back into input buffer
      const int llSizeX = divRndUp(sizeX, 2);
      const int llSizeY = divRndUp(sizeY, 2);
      memCopy(in, out, llSizeX, llSizeY);

      // run remaining levels of FDWT
      fdwt97(in, out, llSizeX, llSizeY, levels - 1);
    }
  }


} // end of namespace dwt_cuda