/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

#ifndef _PRESCAN_CU_
#define _PRESCAN_CU_

// includes, kernels
#include "cutil.h"
#include "scanLargeArray_kernel.cu"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

// Arguments and result are fully parenthesized so that expressions such as
// max(1, a - b) expand safely regardless of operator precedence.
// Note: each argument may be evaluated twice; do not pass side effects.
#define max(a, b) ((a) > (b) ? (a) : (b))

// Returns true when n has at most one bit set. Note that this also returns
// true for n == 0; callers that need a strictly positive power of two must
// check n separately.
inline bool isPowerOfTwo(int n)
{
    return ((n & (n - 1)) == 0);
}

// Returns the largest power of two that is <= n, or 0 when n <= 0.
// Implemented with integer arithmetic only: converting n to float first
// rounds values above 2^24 and can yield a result larger than n.
inline int floorPow2(int n)
{
    if (n <= 0)
        return 0;

    int pow2 = 1;
    // Compare against n / 2 so that pow2 can never overflow when doubled.
    while (pow2 <= n / 2)
        pow2 <<= 1;
    return pow2;
}

#define BLOCK_SIZE 256

// Per-level device buffers holding one partial sum per thread block.
static unsigned int **g_scanBlockSums;
static unsigned int g_numEltsAllocated = 0;
static unsigned int g_numLevelsAllocated = 0;

// Number of thread blocks needed to scan numElts elements when every block
// processes 2 * BLOCK_SIZE elements; never less than 1. Uses integer
// ceil-division: the float-based ceil() under-counts for element counts
// above 2^24 that are not exactly representable in single precision.
static inline unsigned int scanNumBlocks(unsigned int numElts)
{
    const unsigned int eltsPerBlock = 2 * BLOCK_SIZE;
    const unsigned int numBlocks =
        numElts / eltsPerBlock + ((numElts % eltsPerBlock) != 0 ? 1u : 0u);
    return numBlocks > 0 ? numBlocks : 1u;
}

// Allocates the block-sum buffers required to scan arrays of up to
// maxNumElements elements. One device buffer is allocated for every
// recursion level that needs more than one thread block. Must not be called
// again before deallocBlockSums().
static void preallocBlockSums(unsigned int maxNumElements)
{
    assert(g_numEltsAllocated == 0); // shouldn't be called

    g_numEltsAllocated = maxNumElements;

    // First pass: count the levels that need a block-sum buffer.
    unsigned int numElts = maxNumElements;
    int level = 0;
    do {
        unsigned int numBlocks = scanNumBlocks(numElts);
        if (numBlocks > 1)
            level++;
        numElts = numBlocks;
    } while (numElts > 1);

    g_scanBlockSums = (unsigned int **)malloc(level * sizeof(unsigned int *));
    if (level > 0 && g_scanBlockSums == NULL) {
        fprintf(stderr, "preallocBlockSums: host allocation failed\n");
        exit(EXIT_FAILURE);
    }
    g_numLevelsAllocated = level;

    // Second pass: allocate one device buffer per level.
    numElts = maxNumElements;
    level = 0;
    do {
        unsigned int numBlocks = scanNumBlocks(numElts);
        if (numBlocks > 1) {
            // Keep the increment out of the macro argument so it is
            // evaluated exactly once however the macro expands.
            CUDA_SAFE_CALL(cudaMalloc((void **)&g_scanBlockSums[level],
                                      numBlocks * sizeof(unsigned int)));
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    CUT_CHECK_ERROR("preallocBlockSums");
}

// Releases every buffer created by preallocBlockSums() and resets the
// bookkeeping so that preallocBlockSums() may be called again.
static void deallocBlockSums()
{
    for (unsigned int i = 0; i < g_numLevelsAllocated; i++) {
        CUDA_SAFE_CALL(cudaFree(g_scanBlockSums[i]));
    }

    CUT_CHECK_ERROR("deallocBlockSums");

    free((void **)g_scanBlockSums);

    g_scanBlockSums = 0;
    g_numEltsAllocated = 0;
    g_numLevelsAllocated = 0;
}

// Scans numElements values from inArray into outArray using the prescan and
// uniformAdd kernels. When more than one thread block is required, the
// per-block sums are written to g_scanBlockSums[level], scanned by a
// recursive call at level + 1, and then added back to every block.
//
// outArray, inArray : device pointers (they are passed straight to kernels)
// numElements       : number of elements to scan; <= 0 is a no-op
// level             : recursion depth, used to index g_scanBlockSums
//
// preallocBlockSums() must have been called with a sufficiently large size
// whenever numElements exceeds 2 * BLOCK_SIZE.
static void prescanArrayRecursive(unsigned int *outArray,
                                  const unsigned int *inArray,
                                  int numElements,
                                  int level)
{
    // Nothing to scan; also avoids launching a kernel with zero threads.
    if (numElements <= 0)
        return;

    unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
    unsigned int numBlocks = scanNumBlocks((unsigned int)numElements);

    // The power-of-two single-block kernel uses numElements / 2 threads, so
    // it needs at least two elements. A single element is routed through the
    // NP2 kernel with one thread, exactly as a one-element last block is
    // handled in the multi-block case below.
    const bool singleBlockPow2 =
        (numBlocks == 1) && (numElements >= 2) && isPowerOfTwo(numElements);

    unsigned int numThreads;
    if (numBlocks > 1)
        numThreads = blockSize;
    else if (singleBlockPow2)
        numThreads = numElements / 2;
    else
        numThreads = floorPow2(numElements);

    unsigned int numEltsPerBlock = numThreads * 2;

    // if this is a non-power-of-2 array, the last block will be non-full
    // compute the smallest power of 2 able to compute its scan.
    unsigned int numEltsLastBlock =
        numElements - (numBlocks - 1) * numEltsPerBlock;
    unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2);
    unsigned int np2LastBlock = 0;
    unsigned int sharedMemLastBlock = 0;

    if (numEltsLastBlock != numEltsPerBlock) {
        np2LastBlock = 1;

        if (!isPowerOfTwo(numEltsLastBlock))
            numThreadsLastBlock = floorPow2(numEltsLastBlock);

        unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
        sharedMemLastBlock =
            sizeof(unsigned int) * (2 * numThreadsLastBlock + extraSpace);
    }

    // padding space is used to avoid shared memory bank conflicts
    unsigned int extraSpace = numEltsPerBlock / NUM_BANKS;
    unsigned int sharedMemSize =
        sizeof(unsigned int) * (numEltsPerBlock + extraSpace);

#ifdef DEBUG
    if (numBlocks > 1) {
        assert(g_numEltsAllocated >= numElements);
    }
#endif

    // setup execution parameters
    // if NP2, we process the last block separately
    dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1);
    dim3 threads(numThreads, 1, 1);

    // make sure there are no CUDA errors before we start
    CUT_CHECK_ERROR("prescanArrayRecursive before kernels");

    // execute the scan
    //
    // The shared-memory sizes computed above are passed as the dynamic
    // shared-memory launch argument of the prescan kernels.
    // NOTE(review): the kernels live in scanLargeArray_kernel.cu, which is
    // not visible here. Presumably they use extern __shared__ storage of
    // this size; if they use statically sized storage instead, the extra
    // allocation is harmless. Confirm against the kernel source.
    if (numBlocks > 1) {
        // The block sums for this level must have been preallocated.
        assert(level >= 0 && (unsigned int)level < g_numLevelsAllocated);

        prescan<true, false><<<grid, threads, sharedMemSize>>>(
            outArray, inArray, g_scanBlockSums[level], numThreads * 2, 0, 0);
        cudaThreadSynchronize();
        CUT_CHECK_ERROR("prescanWithBlockSums");

        if (np2LastBlock) {
            prescan<true, true><<<1, numThreadsLastBlock, sharedMemLastBlock>>>(
                outArray, inArray, g_scanBlockSums[level], numEltsLastBlock,
                numBlocks - 1, numElements - numEltsLastBlock);
            cudaThreadSynchronize();
            CUT_CHECK_ERROR("prescanNP2WithBlockSums");
        }

        // After scanning all the sub-blocks, we are mostly done. But now we
        // need to take all of the last values of the sub-blocks and scan
        // those. This will give us a new value that must be added to each
        // block to get the final results.
        // recursive (CPU) call
        prescanArrayRecursive(g_scanBlockSums[level], g_scanBlockSums[level],
                              numBlocks, level + 1);

        uniformAdd<<<grid, threads>>>(outArray, g_scanBlockSums[level],
                                      numElements - numEltsLastBlock, 0, 0);
        cudaThreadSynchronize();
        CUT_CHECK_ERROR("uniformAdd");

        if (np2LastBlock) {
            uniformAdd<<<1, numThreadsLastBlock>>>(
                outArray, g_scanBlockSums[level], numEltsLastBlock,
                numBlocks - 1, numElements - numEltsLastBlock);
            cudaThreadSynchronize();
            CUT_CHECK_ERROR("uniformAdd");
        }
    } else if (singleBlockPow2) {
        prescan<false, false><<<grid, threads, sharedMemSize>>>(
            outArray, inArray, 0, numThreads * 2, 0, 0);
        cudaThreadSynchronize();
        CUT_CHECK_ERROR("prescan");
    } else {
        prescan<false, true><<<grid, threads, sharedMemSize>>>(
            outArray, inArray, 0, numElements, 0, 0);
        cudaThreadSynchronize();
        CUT_CHECK_ERROR("prescanNP2");
    }
}

// Entry point: scans numElements values from inArray into outArray, starting
// the recursion at level 0.
static void prescanArray(unsigned int *outArray,
                         unsigned int *inArray,
                         int numElements)
{
    prescanArrayRecursive(outArray, inArray, numElements, 0);
}

#endif // _PRESCAN_CU_