#ifndef _VLC_SM64HUFF_KERNEL_H_
#define _VLC_SM64HUFF_KERNEL_H_

#include "pabio_kernels_v2.cu"
#include "parameters.h"
/* NOTE(review): the original file had a bare `#include` with no header name
   (mangled text). <stdio.h> restored for the debug printf below — confirm
   against the upstream source. */
#include <stdio.h>

#ifdef SMATOMICS

/* HUFFMAN-FRIENDLY PAVLE CHARACTERISTICS:
   1. CACHE CW_LUT INTO SM, LOAD AS 2 INT ARRAYS
   2. PARALLEL PREFIX SUM
   3. PARALLEL BIT I/O USING SHARED-MEMORY ATOMIC OPERATIONS
      (COMPATIBLE WITH CUDA 1.3+)

   NOTES & ASSUMPTIONS:
   - HUFFMAN-CODING FRIENDLY, SUPPORTS CODEWORDS OF 2X SIZE OF ORIGINAL
     SYMBOLS (BYTES), i.e. up to 16 bits per byte, up to 64 bits per word.
   - NUMBER OF THREADS PER BLOCK IS 256; IF YOU WANT TO PLAY WITH DIFFERENT
     NUMBERS, THE CW CACHING SHOULD BE MODIFIED (SEE DPT* KERNELS)
   - SM usage: 1x size of the input data (REUSE) + size of CWLUT

   TURN ON CACHING FOR HIGH ENTROPY DATA!
*/

/**
 * Variable-length (Huffman-friendly) encoder, one 32-bit input word per thread.
 *
 * Launch contract (from the code below):
 *   - 1-D grid/block; blockDim.x MUST be 256: each thread loads one of the
 *     NUM_SYMBOLS codeword-LUT entries, and the scan indices (ai/bi) are
 *     unsigned char, which only covers block sizes <= 256.
 *   - Static shared memory: 3072 uints (12 KB) reused for the LUT cache and
 *     the in-place prefix scan / output assembly buffer.
 *
 * @param data             input words, one per thread (read at index kn)
 * @param gm_codewords     per-byte codewords (NUM_SYMBOLS entries)
 * @param gm_codewordlens  per-byte codeword lengths in bits
 * @param out              packed bitstream, written per-block at out[kn]
 * @param outidx           per-block total bit count, written by the last thread
 *
 * The TESTING parameters (cw32, cw32len, cw32idx) are accepted but not used
 * by this kernel — presumably consumed by a debug variant; verify before removal.
 */
__global__ static void vlc_encode_kernel_sm64huff(unsigned int *data,
                                                  const unsigned int *gm_codewords,
                                                  const unsigned int *gm_codewordlens,
#ifdef TESTING
                                                  unsigned int *cw32,
                                                  unsigned int *cw32len,
                                                  unsigned int *cw32idx,
#endif
                                                  unsigned int *out,
                                                  unsigned int *outidx)
{
    unsigned int kn = blockIdx.x * blockDim.x + threadIdx.x; /* global word index */
    unsigned int k = threadIdx.x;                            /* index within block */
    unsigned int kc, startbit, wrbits;
    unsigned long long cw64 = 0;    /* packed codewords for this word (<= 64 bits) */
    unsigned int val32, codewordlen = 0;
    unsigned char tmpbyte, tmpcwlen;
    unsigned int tmpcw32;

    __shared__ unsigned int sm[3072]; /* LUT cache + scan/output scratch */
    __shared__ unsigned int kcmax;    /* last 32-bit output word index used by this block */

#ifdef CACHECWLUT
    /* Carve the shared buffer: [codewords | codewordlens | scan/output array] */
    unsigned int *codewords = (unsigned int *)sm;
    unsigned int *codewordlens = (unsigned int *)(sm + NUM_SYMBOLS);
    unsigned int *as = (unsigned int *)(sm + 2 * NUM_SYMBOLS);

    /* Load the codewords and the original data (one LUT entry per thread) */
    codewords[k] = gm_codewords[k];
    codewordlens[k] = gm_codewordlens[k];
    val32 = data[kn];
    __syncthreads();

    /* Encode the 4 bytes of val32, most significant byte first,
       appending each codeword at the low end of cw64. */
    for (unsigned int i = 0; i < 4; i++) {
        tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
        tmpcw32 = codewords[tmpbyte];
        tmpcwlen = codewordlens[tmpbyte];
        cw64 = (cw64 << tmpcwlen) | tmpcw32;
        codewordlen += tmpcwlen;
    }
#else
    unsigned int *as = (unsigned int *)sm;

    val32 = data[kn];
    /* Same encoding loop, but reading the LUT straight from global memory. */
    for (unsigned int i = 0; i < 4; i++) {
        tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
        tmpcw32 = gm_codewords[tmpbyte];
        tmpcwlen = gm_codewordlens[tmpbyte];
        cw64 = (cw64 << tmpcwlen) | tmpcw32;
        codewordlen += tmpcwlen;
    }
#endif
    as[k] = codewordlen;
    __syncthreads();

    /* Exclusive prefix sum of codeword lengths (in bits), in place
       (Blelloch work-efficient scan).
       NOTE(review): ai/bi are unsigned char, so the scan only addresses
       indices 0..255 — correct for blockDim.x == 256, broken for larger. */
    unsigned int offset = 1;

    /* Up-sweep: build partial sums up the tree. */
    for (unsigned int d = (blockDim.x) >> 1; d > 0; d >>= 1) {
        __syncthreads();
        if (k < d) {
            unsigned char ai = offset * (2 * k + 1) - 1;
            unsigned char bi = offset * (2 * k + 2) - 1;
            as[bi] += as[ai];
        }
        offset *= 2;
    }

    /* Down-sweep: clear the last element, then traverse back down
       distributing the partial sums to form the exclusive scan. */
    if (k == 0)
        as[blockDim.x - 1] = 0;

    for (unsigned int d = 1; d < blockDim.x; d *= 2) {
        offset >>= 1;
        __syncthreads();
        if (k < d) {
            unsigned char ai = offset * (2 * k + 1) - 1;
            unsigned char bi = offset * (2 * k + 2) - 1;
            unsigned int t = as[ai];
            as[ai] = as[bi];
            as[bi] += t;
        }
    }
    __syncthreads();

    /* Last thread holds the block's total bit count: record it and the index
       of the last 32-bit output word this block touches. */
    if (k == blockDim.x - 1) {
        outidx[blockIdx.x] = as[k] + codewordlen;
        kcmax = (as[k] + codewordlen) / 32;
        // printf("kcmax: %d\n", kcmax);
    }

    /* Write the codes: each thread's bits start at absolute bit offset as[k],
       i.e. word kc, bit startbit within that word (MSB-first packing).
       The scan array is then reused (zeroed) as the output assembly buffer. */
    kc = as[k] / 32;
    startbit = as[k] % 32;
    as[k] = 0U;
    __syncthreads();

    /* Part 1: fill the remainder of word kc. */
    wrbits = codewordlen > (32 - startbit) ? (32 - startbit) : codewordlen;
    tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits));
    // if (wrbits == 32) as[kc] = tmpcw32;   // unnecessary overhead; increases number of branches
    // else
    atomicOr(&as[kc], tmpcw32 << (32 - startbit - wrbits)); /* left-align if shorter than the space */
    codewordlen -= wrbits;

    /* Part 2: next full (or partial) word. */
    if (codewordlen) {
        wrbits = codewordlen > 32 ? 32 : codewordlen;
        /* BUGFIX: the original mask ((1 << wrbits) - 1) is undefined behavior
           when wrbits == 32 (shift count == type width) and overflows signed
           int at wrbits == 31; build the mask safely instead. */
        unsigned int mask = (wrbits >= 32) ? 0xFFFFFFFFu : ((1u << wrbits) - 1u);
        tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits)) & mask;
        atomicOr(&as[kc + 1], tmpcw32 << (32 - wrbits));
        codewordlen -= wrbits;
    }

    /* Part 3: any remaining tail bits (codewordlen is 1..31 here, since at
       most 64 bits total were produced and >= 33 have been written). */
    if (codewordlen) {
        /* 1u (not 1) avoids signed overflow at codewordlen == 31. */
        tmpcw32 = (unsigned int)(cw64 & ((1u << codewordlen) - 1u));
        atomicOr(&as[kc + 2], tmpcw32 << (32 - codewordlen));
    }
    __syncthreads();

    /* Flush the assembled words to global memory; kcmax was published by the
       last thread before the preceding barriers. */
    if (k <= kcmax)
        out[kn] = as[k];
}
//////////////////////////////////////////////////////////////////////////////

#endif /* SMATOMICS */
#endif /* _VLC_SM64HUFF_KERNEL_H_ */