#ifndef _PACK_KERNELS_H_
#define _PACK_KERNELS_H_

#include "parameters.h"

/*
 * pack2 - concatenate per-thread bit strings into one contiguous bit stream.
 *
 * Thread `tid` (flat 1D index: blockDim.x * blockIdx.x + threadIdx.x) reads
 * its bit string from srcData starting at word tid * original_num_block_elements
 * and ORs it into dstData starting at bit position cindex2[tid].
 *
 * Parameters:
 *   srcData   source words; thread tid reads from srcData[tid * original_num_block_elements]
 *   cindex    cindex[tid]  = number of bits thread tid has to copy
 *   cindex2   cindex2[tid] = destination bit position of thread tid
 *             (word index = pos / 32, bit offset inside the word = pos % 32)
 *   dstData   destination bit stream
 *   original_num_block_elements
 *             distance, in 32-bit words, between the source slots of
 *             consecutive threads
 *
 * Bit order: a source word is shifted right by the destination bit offset, so
 * the most significant bit of a source word lands first in the stream.
 *
 * Preconditions (not checked here; the kernel has no element-count argument):
 *   - the launch must not create more threads than cindex/cindex2 have entries;
 *   - boundary words are combined with atomicOr, so dstData is expected to be
 *     cleared before the launch (NOTE(review): confirm against the caller);
 *   - bits of the last source word beyond cindex[tid] are expected to be zero,
 *     because whole words are ORed into the stream (NOTE(review): confirm).
 *
 * Words lying completely inside one thread's bit range are written with a
 * plain store; only words that may be shared with a neighbouring thread use
 * atomicOr. Atomics whose operand is zero are skipped: they cannot change
 * memory, and skipping them avoids touching words past the end of the stream.
 */
__global__ static void pack2(unsigned int *srcData, unsigned int *cindex,
                             unsigned int *cindex2, unsigned int *dstData,
                             unsigned int original_num_block_elements) {
  const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  const unsigned int bitsize = cindex[tid];
  if (bitsize == 0) {
    return;  // nothing to copy for this thread
  }

  // Source index; computed in size_t so large inputs do not wrap at 2^32.
  const size_t offset = (size_t)tid * original_num_block_elements;

  // Destination index.
  const unsigned int pos = cindex2[tid];
  const unsigned int dword = pos / 32;
  const unsigned int bit = pos % 32;

  const unsigned int fullWords = bitsize / 32;  // completely filled source words
  const unsigned int tailBits = bitsize % 32;   // bits in the trailing partial word

  // First source word: its upper part goes into the (possibly shared) initial
  // destination word, the bits that do not fit are carried to the next word.
  unsigned int dw = srcData[offset];
  unsigned int head = dw >> bit;
  if (head != 0) {
    atomicOr(&dstData[dword], head);
  }
  // A shift by 32 is undefined, hence the explicit bit == 0 case.
  unsigned int carry = (bit == 0) ? 0u : (dw << (32 - bit));

  // Destination words dword+1 .. dword+fullWords-1 lie entirely inside this
  // thread's bit range, so they are written without atomics.
  unsigned int i;
  for (i = 1; i < fullWords; i++) {
    dw = srcData[offset + i];
    dstData[dword + i] = carry | (dw >> bit);
    carry = (bit == 0) ? 0u : (dw << (32 - bit));
  }

  // Flush the bits carried out of the last word processed so far. This word
  // may be shared with the next thread, so it is ORed in atomically.
  if (carry != 0) {
    atomicOr(&dstData[dword + i], carry);
  }

  // Trailing partial source word. It exists as a separate word only when at
  // least one full word precedes it; for bitsize < 32 every valid bit was
  // already in the first word handled above, and i (== 1) would point at a
  // source word that holds no data of this thread.
  if (fullWords != 0 && tailBits != 0) {
    dw = srcData[offset + i];
    head = dw >> bit;
    if (head != 0) {
      atomicOr(&dstData[dword + i], head);
    }
    if (bit != 0) {
      const unsigned int spill = dw << (32 - bit);
      if (spill != 0) {
        atomicOr(&dstData[dword + i + 1], spill);
      }
    }
  }
}

#endif