/* * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. * * NOTICE TO USER: * * This source code is subject to NVIDIA ownership rights under U.S. and * international Copyright laws. * * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE * OR PERFORMANCE OF THIS SOURCE CODE. * * U.S. Government End Users. This source code is a "commercial item" as * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of * "commercial computer software" and "commercial computer software * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) * and is provided to the U.S. Government only as a commercial end item. * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the * source code with only those rights set forth herein. */ #ifndef _SCAN_BEST_KERNEL_CU_ #define _SCAN_BEST_KERNEL_CU_ // Define this to more rigorously avoid bank conflicts, // even at the lower (root) levels of the tree // Note that due to the higher addressing overhead, performance // is lower with ZERO_BANK_CONFLICTS enabled. It is provided // as an example. //#define ZERO_BANK_CONFLICTS // 16 banks on G80 #define NUM_BANKS 16 #define LOG_NUM_BANKS 4 #ifdef ZERO_BANK_CONFLICTS #define CONFLICT_FREE_OFFSET(index) \ ((index) >> LOG_NUM_BANKS + (index) >> (2 * LOG_NUM_BANKS)) #else #define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS) #endif /////////////////////////////////////////////////////////////////////////////// // Work-efficient compute implementation of scan, one thread per 2 elements // Work-efficient: O(log(n)) steps, and O(n) adds. // Also shared storage efficient: Uses n + n/NUM_BANKS shared memory -- no // ping-ponging Also avoids most bank conflicts using single-element offsets // every NUM_BANKS elements. // // In addition, If ZERO_BANK_CONFLICTS is defined, uses // n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS) // shared memory. If ZERO_BANK_CONFLICTS is defined, avoids ALL bank conflicts // using single-element offsets every NUM_BANKS elements, plus additional // single-element offsets after every NUM_BANKS^2 elements. // // Uses a balanced tree type algorithm. See Blelloch, 1990 "Prefix Sums // and Their Applications", or Prins and Chatterjee PRAM course notes: // http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf // // This work-efficient version is based on the algorithm presented in Guy // Blelloch's excellent paper "Prefix sums and their applications". // http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html // // Pro: Work Efficient, very few bank conflicts (or zero if ZERO_BANK_CONFLICTS // is defined) Con: More instructions to compute bank-conflict-free shared // memory addressing, and slightly more shared memory storage used. // template __device__ static void loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n, int baseIndex, int &ai, int &bi, int &mem_ai, int &mem_bi, int &bankOffsetA, int &bankOffsetB) { int thid = threadIdx.x; mem_ai = baseIndex + threadIdx.x; mem_bi = mem_ai + blockDim.x; ai = thid; bi = thid + blockDim.x; // compute spacing to avoid bank conflicts bankOffsetA = CONFLICT_FREE_OFFSET(ai); bankOffsetB = CONFLICT_FREE_OFFSET(bi); // Cache the computational window in shared memory // pad values beyond n with zeros s_data[ai + bankOffsetA] = g_idata[mem_ai]; if (isNP2) // compile-time decision { s_data[bi + bankOffsetB] = (bi < n) ? g_idata[mem_bi] : 0; } else { s_data[bi + bankOffsetB] = g_idata[mem_bi]; } } template __device__ static void storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n, int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA, int bankOffsetB) { __syncthreads(); // write results to global memory g_odata[mem_ai] = s_data[ai + bankOffsetA]; if (isNP2) // compile-time decision { if (bi < n) g_odata[mem_bi] = s_data[bi + bankOffsetB]; } else { g_odata[mem_bi] = s_data[bi + bankOffsetB]; } } template __device__ static void clearLastElement(unsigned int *s_data, unsigned int *g_blockSums, int blockIndex) { if (threadIdx.x == 0) { int index = (blockDim.x << 1) - 1; index += CONFLICT_FREE_OFFSET(index); if (storeSum) // compile-time decision { // write this block's total sum to the corresponding index in the // blockSums array g_blockSums[blockIndex] = s_data[index]; } // zero the last element in the scan so it will propagate back to the front s_data[index] = 0; } } __device__ static unsigned int buildSum(unsigned int *s_data) { unsigned int thid = threadIdx.x; unsigned int stride = 1; // build the sum in place up the tree for (int d = blockDim.x; d > 0; d >>= 1) { __syncthreads(); if (thid < d) { int i = __mul24(__mul24(2, stride), thid); int ai = i + stride - 1; int bi = ai + stride; ai += CONFLICT_FREE_OFFSET(ai); bi += CONFLICT_FREE_OFFSET(bi); s_data[bi] += s_data[ai]; } stride *= 2; } return stride; } __device__ static void scanRootToLeaves(unsigned int *s_data, unsigned int stride) { unsigned int thid = threadIdx.x; // traverse down the tree building the scan in place for (int d = 1; d <= blockDim.x; d *= 2) { stride >>= 1; __syncthreads(); if (thid < d) { int i = __mul24(__mul24(2, stride), thid); int ai = i + stride - 1; int bi = ai + stride; ai += CONFLICT_FREE_OFFSET(ai); bi += CONFLICT_FREE_OFFSET(bi); unsigned int t = s_data[ai]; s_data[ai] = s_data[bi]; s_data[bi] += t; } } } template __device__ static void prescanBlock(unsigned int *data, int blockIndex, unsigned int *blockSums) { int stride = buildSum(data); // build the sum in place up the tree clearLastElement(data, blockSums, (blockIndex == 0) ? blockIdx.x : blockIndex); scanRootToLeaves(data, stride); // traverse down tree to build the scan } template __global__ static void prescan(unsigned int *g_odata, const unsigned int *g_idata, unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) { int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB; __shared__ unsigned int s_data[3072]; // load data into shared memory loadSharedChunkFromMem( s_data, g_idata, n, (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex, ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB); // scan the data in each block prescanBlock(s_data, blockIndex, g_blockSums); // write results to device memory storeSharedChunkToMem(g_odata, s_data, n, ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB); } __global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms, int n, int blockOffset, int baseIndex) { __shared__ unsigned int uni; if (threadIdx.x == 0) uni = uniforms[blockIdx.x + blockOffset]; unsigned int address = __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x; __syncthreads(); // note two adds per thread g_data[address] += uni; g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni; } #endif // #ifndef _SCAN_BEST_KERNEL_CU_