CuPBoP/examples/huffman/scan.cu

217 lines
7.4 KiB
Plaintext
Executable File

/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef _PRESCAN_CU_
#define _PRESCAN_CU_
// includes, kernels
#include "cutil.h"
#include "scanLargeArray_kernel.cu"
#include <assert.h>
#include <stdio.h>
#define max(a, b) (a > b ? a : b)
inline bool isPowerOfTwo(int n) { return ((n & (n - 1)) == 0); }
inline int floorPow2(int n) {
#ifdef WIN32
// method 2
return 1 << (int)logb((float)n);
#else
// method 1
// float nf = (float)n;
// return 1 << (((*(int*)&nf) >> 23) - 127);
int exp;
frexp((float)n, &exp);
return 1 << (exp - 1);
#endif
}
#define BLOCK_SIZE 256
static unsigned int **g_scanBlockSums;
static unsigned int g_numEltsAllocated = 0;
static unsigned int g_numLevelsAllocated = 0;
static void preallocBlockSums(unsigned int maxNumElements) {
assert(g_numEltsAllocated == 0); // shouldn't be called
g_numEltsAllocated = maxNumElements;
unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
unsigned int numElts = maxNumElements;
int level = 0;
do {
unsigned int numBlocks =
max(1, (int)ceil((float)numElts / (2.f * blockSize)));
if (numBlocks > 1)
level++;
numElts = numBlocks;
} while (numElts > 1);
g_scanBlockSums = (unsigned int **)malloc(level * sizeof(unsigned int *));
g_numLevelsAllocated = level;
numElts = maxNumElements;
level = 0;
do {
unsigned int numBlocks =
max(1, (int)ceil((float)numElts / (2.f * blockSize)));
if (numBlocks > 1)
CUDA_SAFE_CALL(cudaMalloc((void **)&g_scanBlockSums[level++],
numBlocks * sizeof(unsigned int)));
numElts = numBlocks;
} while (numElts > 1);
CUT_CHECK_ERROR("preallocBlockSums");
}
static void deallocBlockSums() {
for (unsigned int i = 0; i < g_numLevelsAllocated; i++) {
cudaFree(g_scanBlockSums[i]);
}
CUT_CHECK_ERROR("deallocBlockSums");
free((void **)g_scanBlockSums);
g_scanBlockSums = 0;
g_numEltsAllocated = 0;
g_numLevelsAllocated = 0;
}
static void prescanArrayRecursive(unsigned int *outArray,
const unsigned int *inArray, int numElements,
int level) {
unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
unsigned int numBlocks =
max(1, (int)ceil((float)numElements / (2.f * blockSize)));
unsigned int numThreads;
if (numBlocks > 1)
numThreads = blockSize;
else if (isPowerOfTwo(numElements))
numThreads = numElements / 2;
else
numThreads = floorPow2(numElements);
unsigned int numEltsPerBlock = numThreads * 2;
// if this is a non-power-of-2 array, the last block will be non-full
// compute the smallest power of 2 able to compute its scan.
unsigned int numEltsLastBlock =
numElements - (numBlocks - 1) * numEltsPerBlock;
unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2);
unsigned int np2LastBlock = 0;
unsigned int sharedMemLastBlock = 0;
if (numEltsLastBlock != numEltsPerBlock) {
np2LastBlock = 1;
if (!isPowerOfTwo(numEltsLastBlock))
numThreadsLastBlock = floorPow2(numEltsLastBlock);
unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
sharedMemLastBlock =
sizeof(unsigned int) * (2 * numThreadsLastBlock + extraSpace);
}
// padding space is used to avoid shared memory bank conflicts
unsigned int extraSpace = numEltsPerBlock / NUM_BANKS;
unsigned int sharedMemSize =
sizeof(unsigned int) * (numEltsPerBlock + extraSpace);
#ifdef DEBUG
if (numBlocks > 1) {
assert(g_numEltsAllocated >= numElements);
}
#endif
// setup execution parameters
// if NP2, we process the last block separately
dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1);
dim3 threads(numThreads, 1, 1);
// make sure there are no CUDA errors before we start
CUT_CHECK_ERROR("prescanArrayRecursive before kernels");
// execute the scan
if (numBlocks > 1) {
prescan<true, false><<<grid, threads>>>(
outArray, inArray, g_scanBlockSums[level], numThreads * 2, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescanWithBlockSums");
if (np2LastBlock) {
prescan<true, true><<<1, numThreadsLastBlock>>>(
outArray, inArray, g_scanBlockSums[level], numEltsLastBlock,
numBlocks - 1, numElements - numEltsLastBlock);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescanNP2WithBlockSums");
}
// After scanning all the sub-blocks, we are mostly done. But now we
// need to take all of the last values of the sub-blocks and scan those.
// This will give us a new value that must be sdded to each block to
// get the final results.
// recursive (CPU) call
prescanArrayRecursive(g_scanBlockSums[level], g_scanBlockSums[level],
numBlocks, level + 1);
uniformAdd<<<grid, threads>>>(outArray, g_scanBlockSums[level],
numElements - numEltsLastBlock, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("uniformAdd");
if (np2LastBlock) {
uniformAdd<<<1, numThreadsLastBlock>>>(outArray, g_scanBlockSums[level],
numEltsLastBlock, numBlocks - 1,
numElements - numEltsLastBlock);
cudaThreadSynchronize();
CUT_CHECK_ERROR("uniformAdd");
}
} else if (isPowerOfTwo(numElements)) {
prescan<false, false>
<<<grid, threads>>>(outArray, inArray, 0, numThreads * 2, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescan");
} else {
prescan<false, true>
<<<grid, threads>>>(outArray, inArray, 0, numElements, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescanNP2");
}
}
static void prescanArray(unsigned int *outArray, unsigned int *inArray,
int numElements) {
prescanArrayRecursive(outArray, inArray, numElements, 0);
}
#endif // _PRESCAN_CU_