44 lines
1.7 KiB
Plaintext
Executable File
44 lines
1.7 KiB
Plaintext
Executable File
#ifndef _PACK_KERNELS_H_
|
|
#define _PACK_KERNELS_H_
|
|
#include "parameters.h"
|
|
|
|
/**
 * Concatenates per-thread bitstreams into one MSB-first bitstream.
 *
 * Thread `tid` copies the first `cindex[tid]` bits found at
 * `srcData[tid * original_num_block_elements]` into `dstData`, starting at
 * absolute bit position `cindex2[tid]` (bit 0 = most significant bit of
 * `dstData[0]`).
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per source slot. The
 * kernel receives no element count and therefore cannot guard `tid`; the
 * launch must cover exactly the valid slots.
 *
 * Assumptions that cannot be checked from inside the kernel (confirm against
 * the caller):
 *  - destination words that may be shared with a neighbouring stream are
 *    merged with atomicOr, so they presumably have to be zero before launch;
 *  - destination words lying entirely inside one stream are written with plain
 *    stores, i.e. streams are assumed not to overlap;
 *  - bits after the first `cindex[tid]` bits of the last source dword are
 *    copied verbatim and are presumably expected to be zero.
 *
 * @param srcData  per-thread source slots, `original_num_block_elements`
 *                 dwords apart
 * @param cindex   number of bits to copy for each thread
 * @param cindex2  destination bit position for each thread
 * @param dstData  packed output bitstream
 * @param original_num_block_elements  stride, in dwords, between source slots
 */
__global__ static void pack2(unsigned int *srcData, unsigned int *cindex,
                             unsigned int *cindex2, unsigned int *dstData,
                             unsigned int original_num_block_elements) {
  const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  // Source side: how many bits this thread owns.
  const unsigned int bitsize = cindex[tid];
  if (bitsize == 0)
    return;  // empty stream: no source dword is valid, nothing to emit

  // Widen before multiplying so the slot offset cannot wrap in 32 bits.
  const unsigned int *src =
      srcData + (size_t)tid * original_num_block_elements;
  const unsigned int fullWords = bitsize / 32;
  const bool hasPartialWord = (bitsize % 32) != 0;

  // Destination side: first word touched and bit offset inside it.
  const unsigned int pos = cindex2[tid];
  unsigned int *dst = dstData + pos / 32;
  const unsigned int bit = pos % 32;

  // First source dword. Its destination word may also hold the tail of the
  // previous stream, hence the atomic.
  unsigned int dw = src[0];
  atomicOr(&dst[0], dw >> bit);
  // Bits that did not fit; a 32-bit shift by 32 is undefined, so bit == 0 is
  // handled explicitly.
  unsigned int carry = (bit == 0) ? 0u : (dw << (32 - bit));

  // Destination words 1 .. fullWords-1 lie completely inside this stream, so
  // plain stores are sufficient.
  unsigned int i = 1;
  for (; i < fullWords; i++) {
    dw = src[i];
    dst[i] = carry | (dw >> bit);
    carry = (bit == 0) ? 0u : (dw << (32 - bit));
  }

  // Trailing partial dword. It only exists as a separate source dword when at
  // least one full dword precedes it; for bitsize < 32 the first dword handled
  // above already was the partial one, so src[1] must not be read.
  if (hasPartialWord && fullWords != 0) {
    dw = src[i];  // i == fullWords here
    atomicOr(&dst[i], carry | (dw >> bit));
    carry = (bit == 0) ? 0u : (dw << (32 - bit));
    i++;
  }

  // Flush whatever spilled over. Skipping a zero carry avoids touching a word
  // past the end of this stream (possibly past the end of dstData).
  if (carry != 0)
    atomicOr(&dst[i], carry);
}
|
|
|
|
#endif
|