CuPBoP/examples/huffman/pack_kernels.cu

44 lines
1.7 KiB
Plaintext
Executable File

#ifndef _PACK_KERNELS_H_
#define _PACK_KERNELS_H_
#include "parameters.h"
/**
 * pack2 - concatenate per-thread bit strings into one contiguous bit stream.
 *
 * Each thread copies `cindex[tid]` bits, read MSB-first starting at
 * srcData[tid * original_num_block_elements], to bit position `cindex2[tid]`
 * (also counted MSB-first) of dstData.
 *
 * Launch layout: 1-D grid and block, one thread per entry of cindex/cindex2.
 * There is no bounds check on the thread index, so the launch must supply
 * exactly as many threads as there are entries.
 *
 * Preconditions (not checked here; confirm against the caller):
 *  - dstData is presumably zero-initialised: the words shared with the
 *    neighbouring bit strings are only ever OR-ed into.
 *  - the unused low bits of a thread's last source dword are presumably zero;
 *    that dword is copied whole, without masking.
 *  - the destination bit ranges of different threads presumably do not
 *    overlap; the plain (non-atomic) stores in the loop depend on that.
 *
 * @param srcData  source dwords, original_num_block_elements per thread
 * @param cindex   per-thread length of the bit string, in bits
 * @param cindex2  per-thread destination position, in bits
 * @param dstData  packed output stream
 * @param original_num_block_elements  stride, in dwords, between the source
 *                                     regions of consecutive threads
 */
__global__ static void pack2(unsigned int *srcData, unsigned int *cindex,
                             unsigned int *cindex2, unsigned int *dstData,
                             unsigned int original_num_block_elements) {
  const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  const unsigned int bitsize = cindex[tid];
  // An empty bit string contributes nothing; do not read or write anything.
  if (bitsize == 0)
    return;

  // source index
  const unsigned int offset = tid * original_num_block_elements;
  const unsigned int full_words = bitsize / 32;  // complete source dwords
  const unsigned int tail_bits = bitsize % 32;   // bits in the partial dword

  // destination index
  const unsigned int pos = cindex2[tid];
  const unsigned int dword = pos / 32;
  const unsigned int bit = pos % 32;

  unsigned int i, dw, tmp;

  // First source dword: its high part lands in a destination word that may be
  // shared with the preceding bit string, hence the atomic OR.
  dw = srcData[offset];
  atomicOr(&dstData[dword], dw >> bit);
  // Keep the bits that did not fit. A shift by 32 is undefined, so the
  // aligned case (bit == 0, nothing spills) is handled explicitly.
  tmp = (bit == 0) ? 0 : (dw << (32 - bit));

  // Destination words dword+1 .. dword+full_words-1 lie entirely inside this
  // thread's bit range, so they are written with plain stores.
  for (i = 1; i < full_words; i++) {
    dw = srcData[offset + i];
    tmp |= dw >> bit;          // complete the pending destination word
    dstData[dword + i] = tmp;
    tmp = (bit == 0) ? 0 : (dw << (32 - bit));
  }
  // Here i == max(1, full_words).

  // Flush the bits carried over from the last dword handled above. When
  // bit == 0 the carry is always zero, so there is nothing to write.
  if (bit != 0)
    atomicOr(&dstData[dword + i], tmp);

  // Trailing partial dword. It exists as a separate source word only when at
  // least one full dword precedes it; for bitsize < 32 the first dword above
  // already was the partial one, and srcData[offset + 1] holds no bits of
  // this string, so it must not be copied.
  if (tail_bits != 0 && full_words != 0) {
    dw = srcData[offset + i];
    atomicOr(&dstData[dword + i], dw >> bit);
    if (bit != 0)
      atomicOr(&dstData[dword + i + 1], dw << (32 - bit));
  }
}
#endif