// CuPBoP/examples/huffman/scanLargeArray_kernel.cu

/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef _SCAN_BEST_KERNEL_CU_
#define _SCAN_BEST_KERNEL_CU_
// Define this to more rigorously avoid bank conflicts,
// even at the lower (root) levels of the tree.
// Note that due to the higher addressing overhead, performance
// is lower with ZERO_BANK_CONFLICTS enabled. It is provided
// as an example.
//#define ZERO_BANK_CONFLICTS
// 16 banks on G80
#define NUM_BANKS 16
#define LOG_NUM_BANKS 4
#ifdef ZERO_BANK_CONFLICTS
// Fully parenthesized so both shifts happen before the add: '+' binds more
// tightly than '>>', so the unparenthesized form computes the wrong offset.
#define CONFLICT_FREE_OFFSET(index)                                           \
  (((index) >> LOG_NUM_BANKS) + ((index) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#endif
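// Worked example of the padding: with NUM_BANKS == 16, shared-memory word
// indices 0 and 16 both map to bank 0 (16 % 16 == 0) and would conflict if
// accessed by the same half-warp. After padding, index 16 becomes
// 16 + CONFLICT_FREE_OFFSET(16) == 16 + 1 == 17, which maps to bank 1, so
// the tree's strided accesses spread across the banks.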
///////////////////////////////////////////////////////////////////////////////
// Work-efficient compute implementation of scan, one thread per 2 elements.
// Work-efficient: O(log(n)) steps and O(n) adds.
// Also shared-storage efficient: uses n + n/NUM_BANKS shared memory -- no
// ping-ponging. Avoids most bank conflicts by inserting a single-element
// offset every NUM_BANKS elements.
//
// If ZERO_BANK_CONFLICTS is defined, it instead uses
//     n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS)
// shared memory and avoids ALL bank conflicts, by adding a further
// single-element offset after every NUM_BANKS^2 elements.
//
// Uses a balanced-tree algorithm, as presented in Guy Blelloch's paper
// "Prefix Sums and Their Applications" (CMU-CS-90-190):
// http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html
// See also the Prins and Chatterjee PRAM course notes:
// http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf
//
// Pro: work-efficient, with very few bank conflicts (none if
// ZERO_BANK_CONFLICTS is defined).
// Con: more instructions to compute the bank-conflict-free shared memory
// addressing, and slightly more shared memory used.
//
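// As a concrete illustration (padding offsets omitted), scanning the
// 8-element array [3 1 7 0 4 1 6 3] proceeds as:
//   up-sweep (buildSum):    [3 4  7 11  4  5  6 25]   (25 is the total)
//   clearLastElement:       [3 4  7 11  4  5  6  0]
//   down-sweep
//   (scanRootToLeaves):     [3 4  7  0  4  5  6 11]
//                           [3 0  7  4  4 11  6 16]
//                           [0 3  4 11 11 15 16 22]   == exclusive scan
//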
template <bool isNP2>
__device__ static void
loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n,
                       int baseIndex, int &ai, int &bi, int &mem_ai,
                       int &mem_bi, int &bankOffsetA, int &bankOffsetB) {
  int thid = threadIdx.x;
  mem_ai = baseIndex + threadIdx.x;
  mem_bi = mem_ai + blockDim.x;
  ai = thid;
  bi = thid + blockDim.x;
  // compute spacing to avoid bank conflicts
  bankOffsetA = CONFLICT_FREE_OFFSET(ai);
  bankOffsetB = CONFLICT_FREE_OFFSET(bi);
  // Cache the computational window in shared memory,
  // padding values beyond n with zeros
  s_data[ai + bankOffsetA] = g_idata[mem_ai];
  if (isNP2) // compile-time decision
  {
    s_data[bi + bankOffsetB] = (bi < n) ? g_idata[mem_bi] : 0;
  } else {
    s_data[bi + bankOffsetB] = g_idata[mem_bi];
  }
}
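// For example, with blockDim.x == 128 and the default CONFLICT_FREE_OFFSET,
// thread 0 of block 0 loads g_idata[0] and g_idata[128] into padded shared
// slots 0 and 128 + (128 >> 4) == 136.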
template <bool isNP2>
__device__ static void
storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n,
                      int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA,
                      int bankOffsetB) {
  __syncthreads();
  // write results to global memory
  g_odata[mem_ai] = s_data[ai + bankOffsetA];
  if (isNP2) // compile-time decision
  {
    if (bi < n)
      g_odata[mem_bi] = s_data[bi + bankOffsetB];
  } else {
    g_odata[mem_bi] = s_data[bi + bankOffsetB];
  }
}
template <bool storeSum>
__device__ static void clearLastElement(unsigned int *s_data,
                                        unsigned int *g_blockSums,
                                        int blockIndex) {
  if (threadIdx.x == 0) {
    int index = (blockDim.x << 1) - 1;
    index += CONFLICT_FREE_OFFSET(index);
    if (storeSum) // compile-time decision
    {
      // write this block's total sum to the corresponding index in the
      // blockSums array
      g_blockSums[blockIndex] = s_data[index];
    }
    // zero the last element in the scan so it will propagate back to the front
    s_data[index] = 0;
  }
}
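// With blockDim.x == 128 (and the default offset macro), for instance, the
// block's last element sits at padded index 255 + (255 >> 4) == 270.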
__device__ static unsigned int buildSum(unsigned int *s_data) {
  unsigned int thid = threadIdx.x;
  unsigned int stride = 1;
  // build the sum in place up the tree
  for (int d = blockDim.x; d > 0; d >>= 1) {
    __syncthreads();
    if (thid < d) {
      int i = __mul24(__mul24(2, stride), thid);
      int ai = i + stride - 1;
      int bi = ai + stride;
      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);
      s_data[bi] += s_data[ai];
    }
    stride *= 2;
  }
  return stride;
}
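// Tracing the pair indices for blockDim.x == 4 (8 elements, padding offsets
// omitted): d == 4 adds pairs (0,1) (2,3) (4,5) (6,7); d == 2 adds (1,3)
// (5,7); d == 1 adds (3,7), leaving the block total in element 7.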
__device__ static void scanRootToLeaves(unsigned int *s_data,
                                        unsigned int stride) {
  unsigned int thid = threadIdx.x;
  // traverse down the tree building the scan in place
  for (int d = 1; d <= blockDim.x; d *= 2) {
    stride >>= 1;
    __syncthreads();
    if (thid < d) {
      int i = __mul24(__mul24(2, stride), thid);
      int ai = i + stride - 1;
      int bi = ai + stride;
      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);
      unsigned int t = s_data[ai];
      s_data[ai] = s_data[bi];
      s_data[bi] += t;
    }
  }
}
template <bool storeSum>
__device__ static void prescanBlock(unsigned int *data, int blockIndex,
                                    unsigned int *blockSums) {
  unsigned int stride = buildSum(data); // build the sum in place up the tree
  clearLastElement<storeSum>(data, blockSums,
                             (blockIndex == 0) ? blockIdx.x : blockIndex);
  scanRootToLeaves(data, stride); // traverse down tree to build the scan
}
template <bool storeSum, bool isNP2>
__global__ static void
prescan(unsigned int *g_odata, const unsigned int *g_idata,
        unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) {
  int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
  // room for 2*blockDim.x elements plus the conflict-avoidance padding
  __shared__ unsigned int s_data[3072];
  // load data into shared memory
  loadSharedChunkFromMem<isNP2>(
      s_data, g_idata, n,
      (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex, ai,
      bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB);
  // scan the data in each block
  prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
  // write results to device memory
  storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi,
                               bankOffsetA, bankOffsetB);
}
__global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms,
                                  int n, int blockOffset, int baseIndex) {
  __shared__ unsigned int uni;
  if (threadIdx.x == 0)
    uni = uniforms[blockIdx.x + blockOffset];
  unsigned int address =
      __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;
  __syncthreads();
  // note two adds per thread
  g_data[address] += uni;
  // branchless guard: the comparison yields 0 or 1, so the second add
  // contributes nothing when that element lies past the end of the array
  g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni;
}
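// ---------------------------------------------------------------------------
// The kernels above each scan only one block's worth of data. A large array
// is scanned by composing them: scan every chunk while saving its total
// (storeSum == true), recursively scan the array of chunk totals, then fold
// each scanned total back into its chunk with uniformAdd. The driver below is
// a minimal host-side sketch of that composition, not part of the original
// file: the name prescanArraySketch, the thread count, and the assumption
// that d_sums has scratch room for every recursion level's totals are all
// illustrative, and n is assumed to be a power of two (>= 2) so the isNP2
// and partial-block paths can be omitted.
static void prescanArraySketch(unsigned int *d_out, const unsigned int *d_in,
                               unsigned int *d_sums, int n) {
  const int numThreads = 128; // illustrative; each thread scans 2 elements
  if (n <= 2 * numThreads) {
    // One block covers the whole array; no block sums are needed.
    prescan<false, false><<<1, n / 2>>>(d_out, d_in, 0, n, 0, 0);
    return;
  }
  int numBlocks = n / (2 * numThreads);
  // Scan each 2*numThreads chunk, recording each chunk's total in d_sums.
  prescan<true, false><<<numBlocks, numThreads>>>(d_out, d_in, d_sums, n, 0,
                                                  0);
  // Exclusive-scan the chunk totals in place (safe: each block reads its own
  // chunk into shared memory before writing it back); the next level's
  // totals are assumed to live just past this level's in d_sums.
  prescanArraySketch(d_sums, d_sums, d_sums + numBlocks, numBlocks);
  // Add each chunk's scanned total to every element of that chunk.
  uniformAdd<<<numBlocks, numThreads>>>(d_out, d_sums, n, 0, 0);
}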
#endif // #ifndef _SCAN_BEST_KERNEL_CU_