CuPBoP/examples/huffman/scanLargeArray_kernel.cu

/*
 * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.  This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

#ifndef _SCAN_BEST_KERNEL_CU_
#define _SCAN_BEST_KERNEL_CU_

// Define this to more rigorously avoid bank conflicts,
// even at the lower (root) levels of the tree
// Note that due to the higher addressing overhead, performance
// is lower with ZERO_BANK_CONFLICTS enabled.  It is provided
// as an example.
//#define ZERO_BANK_CONFLICTS

// 16 banks on G80
#define NUM_BANKS 16
#define LOG_NUM_BANKS 4

#ifdef ZERO_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(index)                                            \
  ((index) >> LOG_NUM_BANKS + (index) >> (2 * LOG_NUM_BANKS))
#else
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#endif

///////////////////////////////////////////////////////////////////////////////
// Work-efficient compute implementation of scan, one thread per 2 elements
// Work-efficient: O(log(n)) steps, and O(n) adds.
// Also shared storage efficient: Uses n + n/NUM_BANKS shared memory -- no
// ping-ponging Also avoids most bank conflicts using single-element offsets
// every NUM_BANKS elements.
//
// In addition, If ZERO_BANK_CONFLICTS is defined, uses
//     n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS)
// shared memory. If ZERO_BANK_CONFLICTS is defined, avoids ALL bank conflicts
// using single-element offsets every NUM_BANKS elements, plus additional
// single-element offsets after every NUM_BANKS^2 elements.
//
// Uses a balanced tree type algorithm.  See Blelloch, 1990 "Prefix Sums
// and Their Applications", or Prins and Chatterjee PRAM course notes:
// http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf
//
// This work-efficient version is based on the algorithm presented in Guy
// Blelloch's excellent paper "Prefix sums and their applications".
// http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html
//
// Pro: Work Efficient, very few bank conflicts (or zero if ZERO_BANK_CONFLICTS
// is defined) Con: More instructions to compute bank-conflict-free shared
// memory addressing, and slightly more shared memory storage used.
//

template <bool isNP2>
__device__ static void
loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n,
                       int baseIndex, int &ai, int &bi, int &mem_ai,
                       int &mem_bi, int &bankOffsetA, int &bankOffsetB) {
  int thid = threadIdx.x;
  mem_ai = baseIndex + threadIdx.x;
  mem_bi = mem_ai + blockDim.x;

  ai = thid;
  bi = thid + blockDim.x;

  // compute spacing to avoid bank conflicts
  bankOffsetA = CONFLICT_FREE_OFFSET(ai);
  bankOffsetB = CONFLICT_FREE_OFFSET(bi);

  // Cache the computational window in shared memory
  // pad values beyond n with zeros
  s_data[ai + bankOffsetA] = g_idata[mem_ai];

  if (isNP2) // compile-time decision
  {
    s_data[bi + bankOffsetB] = (bi < n) ? g_idata[mem_bi] : 0;
  } else {
    s_data[bi + bankOffsetB] = g_idata[mem_bi];
  }
}

template <bool isNP2>
__device__ static void
storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n,
                      int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA,
                      int bankOffsetB) {
  __syncthreads();

  // write results to global memory
  g_odata[mem_ai] = s_data[ai + bankOffsetA];
  if (isNP2) // compile-time decision
  {
    if (bi < n)
      g_odata[mem_bi] = s_data[bi + bankOffsetB];
  } else {
    g_odata[mem_bi] = s_data[bi + bankOffsetB];
  }
}

template <bool storeSum>
__device__ static void clearLastElement(unsigned int *s_data,
                                        unsigned int *g_blockSums,
                                        int blockIndex) {
  if (threadIdx.x == 0) {
    int index = (blockDim.x << 1) - 1;
    index += CONFLICT_FREE_OFFSET(index);

    if (storeSum) // compile-time decision
    {
      // write this block's total sum to the corresponding index in the
      // blockSums array
      g_blockSums[blockIndex] = s_data[index];
    }

    // zero the last element in the scan so it will propagate back to the front
    s_data[index] = 0;
  }
}

__device__ static unsigned int buildSum(unsigned int *s_data) {
  unsigned int thid = threadIdx.x;
  unsigned int stride = 1;

  // build the sum in place up the tree
  for (int d = blockDim.x; d > 0; d >>= 1) {
    __syncthreads();

    if (thid < d) {
      int i = __mul24(__mul24(2, stride), thid);
      int ai = i + stride - 1;
      int bi = ai + stride;

      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);

      s_data[bi] += s_data[ai];
    }

    stride *= 2;
  }

  return stride;
}

__device__ static void scanRootToLeaves(unsigned int *s_data,
                                        unsigned int stride) {
  unsigned int thid = threadIdx.x;

  // traverse down the tree building the scan in place
  for (int d = 1; d <= blockDim.x; d *= 2) {
    stride >>= 1;

    __syncthreads();

    if (thid < d) {
      int i = __mul24(__mul24(2, stride), thid);
      int ai = i + stride - 1;
      int bi = ai + stride;

      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);

      unsigned int t = s_data[ai];
      s_data[ai] = s_data[bi];
      s_data[bi] += t;
    }
  }
}

template <bool storeSum>
__device__ static void prescanBlock(unsigned int *data, int blockIndex,
                                    unsigned int *blockSums) {
  int stride = buildSum(data); // build the sum in place up the tree
  clearLastElement<storeSum>(data, blockSums,
                             (blockIndex == 0) ? blockIdx.x : blockIndex);
  scanRootToLeaves(data, stride); // traverse down tree to build the scan
}

template <bool storeSum, bool isNP2>
__global__ static void
prescan(unsigned int *g_odata, const unsigned int *g_idata,
        unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) {
  int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
  __shared__ unsigned int s_data[3072];

  // load data into shared memory
  loadSharedChunkFromMem<isNP2>(
      s_data, g_idata, n,
      (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex, ai,
      bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB);
  // scan the data in each block
  prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
  // write results to device memory
  storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi,
                               bankOffsetA, bankOffsetB);
}

__global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms,
                                  int n, int blockOffset, int baseIndex) {
  __shared__ unsigned int uni;
  if (threadIdx.x == 0)
    uni = uniforms[blockIdx.x + blockOffset];

  unsigned int address =
      __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;

  __syncthreads();

  // note two adds per thread
  g_data[address] += uni;
  g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni;
}

#endif // #ifndef _SCAN_BEST_KERNEL_CU_