226 lines
8.0 KiB
Plaintext
Executable File
226 lines
8.0 KiB
Plaintext
Executable File
/*
 * PAVLE - Parallel Variable-Length Encoder for CUDA. Main file.
 *
 * Copyright (C) 2009 Ana Balevic <ana.balevic@gmail.com>
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the MIT License. Read the full licence:
 * http://www.opensource.org/licenses/mit-license.php
 *
 * If you find this program useful, please contact me and reference PAVLE home
 * page in your work.
 *
 */
|
|
|
|
#include "comparison_helpers.h"
|
|
#include "cuda_helpers.h"
|
|
#include "load_data.h"
|
|
#include "print_helpers.h"
|
|
#include "stats_logger.h"
|
|
#include "stdafx.h"
|
|
#include <cuda_runtime.h>
|
|
#include <sys/time.h>
|
|
|
|
//#include "vlc_kernel_gm32.cu"
|
|
//#include "vlc_kernel_sm32.cu"
|
|
#include "vlc_kernel_sm64huff.cu"
|
|
//#include "vlc_kernel_dpt.cu"
|
|
//#include "vlc_kernel_dptt.cu"
|
|
//#include "scan_kernel.cu"
|
|
#include "cpuencode.h"
|
|
#include "pack_kernels.cu"
|
|
#include "scan.cu"
|
|
|
|
/**
 * Returns the current wall-clock time in microseconds, as reported by
 * gettimeofday().
 *
 * The seconds field is widened to long long before it is scaled, so the
 * multiplication cannot overflow on platforms where time_t is only 32 bits
 * wide.
 */
long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (long long)tv.tv_sec * 1000000LL + (long long)tv.tv_usec;
}
|
|
// Forward declaration of the test driver defined later in this file.
// num_blocks defaults to 1 when the caller does not supply it.
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1);

// CPU encoder, declared with C linkage. runVLCTest() calls it to produce the
// reference output and its size (outdata / outsize).
extern "C" void cpu_vlc_encode(unsigned int *indata,
                               unsigned int num_elements,
                               unsigned int *outdata,
                               unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens);
|
|
|
|
/**
 * Program entry point.
 *
 * Initializes CUDA, then runs the VLC test once per command-line argument
 * (each argument is passed to runVLCTest() as the file name). When no
 * arguments are given, a single test is run with a NULL file name and 1024
 * blocks.
 *
 * Returns 0 on success and a non-zero status when CUDA initialization fails,
 * so that scripts invoking the program can detect the failure.
 */
int main(int argc, char *argv[]) {
  if (!InitCUDA()) {
    return 1;
  }

  const unsigned int num_block_threads = 256;

  if (argc > 1) {
    for (int i = 1; i < argc; i++) {
      runVLCTest(argv[i], num_block_threads);
    }
  } else {
    runVLCTest(NULL, num_block_threads, 1024);
  }
  return 0;
}
|
|
|
|
/**
 * Runs one VLC encoding test: loads the input, encodes it on the CPU as a
 * reference, launches the sm64huff encoding kernel NT times and, when TESTING
 * is defined, scans/packs the GPU output and compares it with the CPU result.
 *
 * @param file_name          forwarded to initParams() and loadData(); main()
 *                           passes NULL when no input file was given.
 * @param num_block_threads  x-dimension of the thread block used for the
 *                           encoding kernel.
 * @param num_blocks         x-dimension of the grid used for the encoding
 *                           kernel, and the number of per-block index entries.
 */
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) {
  printf("CUDA! Starting VLC Tests!\n");

  // NOTE(review): num_elements and mem_size are uninitialized here and are
  // used right after initParams(), so initParams presumably takes them by
  // reference and fills them in - confirm against load_data.h.
  unsigned int num_elements; // uint num_elements = num_blocks * num_block_threads;
  unsigned int mem_size;     // uint mem_size = num_elements * sizeof(int);
  unsigned int symbol_type_size = sizeof(int);
  double H; // entropy

  //////// LOAD PARAMETERS ///////////////
  initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size,
             symbol_type_size);
  printf("Parameters: num_elements: %u, num_blocks: %u, num_block_threads: "
         "%u\n----------------------------\n",
         num_elements, num_blocks, num_block_threads);

  const size_t lut_size = (size_t)NUM_SYMBOLS * symbol_type_size;
  const size_t cindex_size = (size_t)num_blocks * sizeof(unsigned int);

  //////// HOST BUFFERS (zero-initialized) ///////////////
  uint *sourceData = (uint *)calloc(1, mem_size);
  uint *destData = (uint *)calloc(1, mem_size);
  uint *crefData = (uint *)calloc(1, mem_size);
  uint *codewords = (uint *)calloc(1, lut_size);
  uint *codewordlens = (uint *)calloc(1, lut_size);
  uint *cw32 = (uint *)calloc(1, mem_size);
  uint *cw32len = (uint *)calloc(1, mem_size);
  uint *cw32idx = (uint *)calloc(1, mem_size);
  uint *cindex2 = (uint *)calloc(1, cindex_size);

  if (!sourceData || !destData || !crefData || !codewords || !codewordlens ||
      !cw32 || !cw32len || !cw32idx || !cindex2) {
    fprintf(stderr, "runVLCTest: host memory allocation failed\n");
    // free(NULL) is a no-op, so every pointer can be released unconditionally.
    free(sourceData);
    free(destData);
    free(crefData);
    free(codewords);
    free(codewordlens);
    free(cw32);
    free(cw32len);
    free(cw32idx);
    free(cindex2);
    return;
  }

  //////// LOAD DATA ///////////////
  loadData(file_name, sourceData, codewords, codewordlens, num_elements,
           mem_size, H);

  //////// DEVICE BUFFERS ///////////////
  unsigned int *d_sourceData, *d_destData, *d_destDataPacked;
  unsigned int *d_codewords, *d_codewordlens;
  unsigned int *d_cw32, *d_cw32len, *d_cw32idx, *d_cindex, *d_cindex2;

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size));

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_codewords, lut_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_codewordlens, lut_size));

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size));

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cindex, cindex_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cindex2, cindex_size));

  CUDA_SAFE_CALL(
      cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(
      cudaMemcpy(d_codewords, codewords, lut_size, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens, lut_size,
                            cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(
      cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice));

  dim3 grid_size(num_blocks, 1, 1);
  dim3 block_size(num_block_threads, 1, 1);

  const unsigned int NT = 10; // number of runs for each execution time

  //////////////////* CPU ENCODER *///////////////////////////////////
  unsigned int refbytesize;
  long long timer = get_time();
  cpu_vlc_encode((unsigned int *)sourceData, num_elements,
                 (unsigned int *)crefData, &refbytesize, codewords,
                 codewordlens);
  float msec = (float)((get_time() - timer) / 1000.0);
  printf("CPU Encoding time (CPU): %f (ms)\n", msec);
  printf("CPU Encoded to %u [B]\n", refbytesize);
  //////////////////* END CPU *///////////////////////////////////

  //////////////////* SM64HUFF KERNEL *///////////////////////////////////
  // Dynamic shared memory: one unsigned int per thread, plus both codeword
  // look-up tables when CACHECWLUT is defined.
  unsigned int sm_size = block_size.x * sizeof(unsigned int);
#ifdef CACHECWLUT
  sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int);
#endif

  for (unsigned int run = 0; run < NT; run++) {
    // sm_size is passed as the dynamic shared-memory size; it was computed
    // above but previously never handed to the launch.
    // NOTE(review): presumably the kernel declares extern __shared__ storage
    // of this size - confirm in vlc_kernel_sm64huff.cu.
    vlc_encode_kernel_sm64huff<<<grid_size, block_size, sm_size>>>(
        d_sourceData, d_codewords, d_codewordlens,
#ifdef TESTING
        d_cw32, d_cw32len, d_cw32idx,
#endif
        d_destData, d_cindex); // testedOK2
    // Launch-configuration errors are only visible through cudaGetLastError;
    // execution errors surface at the synchronization point.
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
  }
  //////////////////* END KERNEL *///////////////////////////////////

#ifdef TESTING
  // Size of the CPU reference output in 32-bit words, rounded up.
  unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 0 : 1);

  unsigned int num_scan_elements = grid_size.x;
  preallocBlockSums(num_scan_elements);
  CUDA_SAFE_CALL(cudaMemset(d_destDataPacked, 0, mem_size));
  printf("Num_blocks to be passed to scan is %u.\n", num_scan_elements);
  prescanArray(d_cindex2, d_cindex, num_scan_elements);

  // NOTE(review): the grid size is num_scan_elements / 32 with integer
  // division, so it is 0 for fewer than 32 blocks and drops any remainder -
  // confirm that callers always use a multiple of 32 blocks.
  pack2<<<num_scan_elements / 32, 32>>>(
      (unsigned int *)d_destData, d_cindex, d_cindex2,
      (unsigned int *)d_destDataPacked, num_elements / num_scan_elements);
  cudaDeviceSynchronize(); // result is picked up by CUT_CHECK_ERROR below
  CUT_CHECK_ERROR("Pack2 Kernel execution failed\n");
  deallocBlockSums();

  CUDA_SAFE_CALL(
      cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost));
  compare_vectors((unsigned int *)crefData, (unsigned int *)destData, num_ints);
#endif

  //////// CLEAN-UP ///////////////
  free(sourceData);
  free(destData);
  free(crefData);
  free(codewords);
  free(codewordlens);
  free(cw32);
  free(cw32len);
  free(cw32idx); // was leaked before
  free(cindex2);

  CUDA_SAFE_CALL(cudaFree(d_sourceData));
  CUDA_SAFE_CALL(cudaFree(d_destData));
  CUDA_SAFE_CALL(cudaFree(d_destDataPacked));
  CUDA_SAFE_CALL(cudaFree(d_codewords));
  CUDA_SAFE_CALL(cudaFree(d_codewordlens));
  CUDA_SAFE_CALL(cudaFree(d_cw32));
  CUDA_SAFE_CALL(cudaFree(d_cw32len));
  CUDA_SAFE_CALL(cudaFree(d_cw32idx));
  CUDA_SAFE_CALL(cudaFree(d_cindex));
  CUDA_SAFE_CALL(cudaFree(d_cindex2));
}
|