/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

#ifndef _SCAN_BEST_KERNEL_CU_
#define _SCAN_BEST_KERNEL_CU_

// Define this to more rigorously avoid bank conflicts,
// even at the lower (root) levels of the tree.
// Note that due to the higher addressing overhead, performance
// is lower with ZERO_BANK_CONFLICTS enabled. It is provided
// as an example.
//#define ZERO_BANK_CONFLICTS

// 16 banks on G80
#define NUM_BANKS 16
#define LOG_NUM_BANKS 4

#ifdef ZERO_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(index)                                            \
  (((index) >> LOG_NUM_BANKS) + ((index) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#endif
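
// Worked example (added illustration): with LOG_NUM_BANKS == 4 the padding
// below amounts to one extra shared-memory word per NUM_BANKS data elements
// (plus one more per NUM_BANKS^2 elements when ZERO_BANK_CONFLICTS is on):
//   CONFLICT_FREE_OFFSET(15) == 0   -> element 15 stays at s_data[15]
//   CONFLICT_FREE_OFFSET(16) == 1   -> element 16 is stored at s_data[17]
//   CONFLICT_FREE_OFFSET(47) == 2   -> element 47 is stored at s_data[49]
// Two threads whose logical indices differ by a multiple of NUM_BANKS would
// otherwise hit the same shared-memory bank on the 16-bank G80; the offset
// spreads those accesses across different banks.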

///////////////////////////////////////////////////////////////////////////////
// Work-efficient compute implementation of scan, one thread per 2 elements.
// Work-efficient: O(log(n)) steps, and O(n) adds.
// Also shared-storage efficient: uses n + n/NUM_BANKS shared memory -- no
// ping-ponging. Also avoids most bank conflicts using single-element offsets
// every NUM_BANKS elements.
//
// In addition, if ZERO_BANK_CONFLICTS is defined, it uses
// n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS)
// shared memory and avoids ALL bank conflicts,
// using single-element offsets every NUM_BANKS elements plus additional
// single-element offsets after every NUM_BANKS^2 elements.
//
// Uses a balanced-tree type algorithm. See Blelloch, 1990, "Prefix Sums
// and Their Applications", or Prins and Chatterjee PRAM course notes:
// http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf
//
// This work-efficient version is based on the algorithm presented in Guy
// Blelloch's excellent paper "Prefix sums and their applications".
// http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html
//
// Pro: work-efficient, with very few bank conflicts (or zero if
// ZERO_BANK_CONFLICTS is defined).
// Con: more instructions to compute bank-conflict-free shared memory
// addressing, and slightly more shared memory storage used.
//
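// Worked example (added illustration, not in the original header): an
// exclusive scan of the 8-element block [3 1 7 0 4 1 6 3] with 4 threads,
// ignoring the bank-conflict offsets for clarity.
//
//   Up-sweep (buildSum) reduces pairs in place:
//     stride 1: [3 4 7  7 4  5 6  9]
//     stride 2: [3 4 7 11 4  5 6 14]
//     stride 4: [3 4 7 11 4  5 6 25]        block total = 25
//
//   clearLastElement zeros the root: [3 4 7 11 4 5 6 0]
//   (with storeSum, the 25 is first written to g_blockSums)
//
//   Down-sweep (scanRootToLeaves) swaps and adds back down the tree:
//     stride 4: [3 4 7  0  4  5  6 11]
//     stride 2: [3 0 7  4  4 11  6 16]
//     stride 1: [0 3 4 11 11 15 16 22]
//
// which is the exclusive prefix sum of the input.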

template <bool isNP2>
__device__ static void
loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n,
                       int baseIndex, int &ai, int &bi, int &mem_ai,
                       int &mem_bi, int &bankOffsetA, int &bankOffsetB) {
  int thid = threadIdx.x;
  mem_ai = baseIndex + threadIdx.x;
  mem_bi = mem_ai + blockDim.x;

  ai = thid;
  bi = thid + blockDim.x;

  // compute spacing to avoid bank conflicts
  bankOffsetA = CONFLICT_FREE_OFFSET(ai);
  bankOffsetB = CONFLICT_FREE_OFFSET(bi);

  // Cache the computational window in shared memory
  // pad values beyond n with zeros
  s_data[ai + bankOffsetA] = g_idata[mem_ai];

  if (isNP2) // compile-time decision
  {
    s_data[bi + bankOffsetB] = (bi < n) ? g_idata[mem_bi] : 0;
  } else {
    s_data[bi + bankOffsetB] = g_idata[mem_bi];
  }
}

template <bool isNP2>
__device__ static void
storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n,
                      int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA,
                      int bankOffsetB) {
  __syncthreads();

  // write results to global memory
  g_odata[mem_ai] = s_data[ai + bankOffsetA];
  if (isNP2) // compile-time decision
  {
    if (bi < n)
      g_odata[mem_bi] = s_data[bi + bankOffsetB];
  } else {
    g_odata[mem_bi] = s_data[bi + bankOffsetB];
  }
}

template <bool storeSum>
__device__ static void clearLastElement(unsigned int *s_data,
                                        unsigned int *g_blockSums,
                                        int blockIndex) {
  if (threadIdx.x == 0) {
    int index = (blockDim.x << 1) - 1;
    index += CONFLICT_FREE_OFFSET(index);

    if (storeSum) // compile-time decision
    {
      // write this block's total sum to the corresponding index in the
      // blockSums array
      g_blockSums[blockIndex] = s_data[index];
    }

    // zero the last element in the scan so it will propagate back to the front
    s_data[index] = 0;
  }
}

__device__ static unsigned int buildSum(unsigned int *s_data) {
  unsigned int thid = threadIdx.x;
  unsigned int stride = 1;

  // build the sum in place up the tree
  for (int d = blockDim.x; d > 0; d >>= 1) {
    __syncthreads();

    if (thid < d) {
      int i = __mul24(__mul24(2, stride), thid);
      int ai = i + stride - 1;
      int bi = ai + stride;

      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);

      s_data[bi] += s_data[ai];
    }

    stride *= 2;
  }

  return stride;
}

__device__ static void scanRootToLeaves(unsigned int *s_data,
                                        unsigned int stride) {
  unsigned int thid = threadIdx.x;

  // traverse down the tree building the scan in place
  for (int d = 1; d <= blockDim.x; d *= 2) {
    stride >>= 1;

    __syncthreads();

    if (thid < d) {
      int i = __mul24(__mul24(2, stride), thid);
      int ai = i + stride - 1;
      int bi = ai + stride;

      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);

      unsigned int t = s_data[ai];
      s_data[ai] = s_data[bi];
      s_data[bi] += t;
    }
  }
}

template <bool storeSum>
__device__ static void prescanBlock(unsigned int *data, int blockIndex,
                                    unsigned int *blockSums) {
  int stride = buildSum(data); // build the sum in place up the tree
  clearLastElement<storeSum>(data, blockSums,
                             (blockIndex == 0) ? blockIdx.x : blockIndex);
  scanRootToLeaves(data, stride); // traverse down tree to build the scan
}

template <bool storeSum, bool isNP2>
__global__ static void
prescan(unsigned int *g_odata, const unsigned int *g_idata,
        unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) {
  int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
  __shared__ unsigned int s_data[3072];

  // load data into shared memory
  loadSharedChunkFromMem<isNP2>(
      s_data, g_idata, n,
      (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex, ai,
      bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB);
  // scan the data in each block
  prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
  // write results to device memory
  storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi,
                               bankOffsetA, bankOffsetB);
}

__global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms,
                                  int n, int blockOffset, int baseIndex) {
  __shared__ unsigned int uni;
  if (threadIdx.x == 0)
    uni = uniforms[blockIdx.x + blockOffset];

  unsigned int address =
      __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;

  __syncthreads();

  // note two adds per thread
  g_data[address] += uni;
  g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni;
}
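
///////////////////////////////////////////////////////////////////////////////
// Host-side usage sketch (added illustration; the shipped SDK sample drives
// these kernels from scan_app.cu, not from this file). It shows one way the
// kernels above fit together for n = numBlocks * 512 unsigned ints, where
// numBlocks is a power of two with 2 <= numBlocks <= 512. The function name
// and parameter layout are illustrative assumptions, not the SDK's API.
static void scanLargeArraySketch(unsigned int *d_out, const unsigned int *d_in,
                                 unsigned int *d_blockSums, int numBlocks) {
  const int threads = 256;               // each thread scans 2 elements
  const int elemsPerBlock = 2 * threads; // 512 elements per block
  const int n = numBlocks * elemsPerBlock;

  // 1. Scan each 512-element chunk independently; each block also stores its
  //    total in d_blockSums (storeSum = true).
  prescan<true, false><<<numBlocks, threads>>>(d_out, d_in, d_blockSums,
                                               elemsPerBlock, 0, 0);

  // 2. Exclusive-scan the per-block totals in place with a single block, so
  //    d_blockSums[b] becomes the sum of all elements before block b.
  prescan<false, false><<<1, numBlocks / 2>>>(d_blockSums, d_blockSums, 0,
                                              numBlocks, 0, 0);

  // 3. Add each block's starting offset back into its chunk of the output.
  uniformAdd<<<numBlocks, threads>>>(d_out, d_blockSums, n, 0, 0);
}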

#endif // #ifndef _SCAN_BEST_KERNEL_CU_