// CuPBoP/examples/huffman/main_test_cu.cu

/*
 * PAVLE - Parallel Variable-Length Encoder for CUDA. Main file.
 *
 * Copyright (C) 2009 Ana Balevic <ana.balevic@gmail.com>
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the MIT License. Read the full licence:
 * http://www.opensource.org/licenses/mit-license.php
 *
 * If you find this program useful, please contact me and reference PAVLE home
 * page in your work.
 *
 */

#include "comparison_helpers.h"
#include "cuda_helpers.h"
#include "load_data.h"
#include "print_helpers.h"
#include "stats_logger.h"
#include "stdafx.h"
#include <cuda_runtime.h>
#include <sys/time.h>

//#include "vlc_kernel_gm32.cu"
//#include "vlc_kernel_sm32.cu"
#include "vlc_kernel_sm64huff.cu"
//#include "vlc_kernel_dpt.cu"
//#include "vlc_kernel_dptt.cu"
//#include "scan_kernel.cu"
#include "cpuencode.h"
#include "pack_kernels.cu"
#include "scan.cu"

/**
 * Returns the current wall-clock time, as reported by gettimeofday(), in
 * microseconds.
 *
 * Used by runVLCTest to time the CPU encoder; callers subtract two readings
 * and divide by 1000.0 to obtain milliseconds.
 */
long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  // Widen to long long *before* multiplying: tv_sec has type time_t, and on
  // platforms where that is 32 bits wide the product tv_sec * 1000000
  // overflows long before it is converted to the return type.
  return (long long)tv.tv_sec * 1000000LL + (long long)tv.tv_usec;
}
// Forward declaration of the test driver defined further down in this file.
// num_blocks defaults to 1 when the caller omits it (main relies on this when
// a file name is given on the command line).
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1);

// CPU encoder, declared with C linkage; its definition is not in this file
// (presumably it belongs to the cpuencode module included above -- confirm).
// runVLCTest calls it with the source symbols and their count, an output
// buffer, a pointer that receives the output size (the caller prints it as a
// byte count), and the codeword / codeword-length tables.
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens);

/**
 * Program entry point.
 *
 * Calls InitCUDA() first; if it reports failure the program exits (with
 * status 0) without running any test. Otherwise:
 *  - with command-line arguments, runVLCTest is invoked once per argument,
 *    using 256 threads per block and the default num_blocks;
 *  - with no arguments, runVLCTest is invoked once with a NULL file name,
 *    256 threads per block and 1024 blocks.
 */
int main(int argc, char *argv[]) {
  const unsigned int threads_per_block = 256;

  if (!InitCUDA())
    return 0;

  // No input files given: run the single built-in configuration.
  if (argc <= 1) {
    runVLCTest(NULL, threads_per_block, 1024);
    return 0;
  }

  // One test run per file named on the command line.
  for (int arg = 1; arg < argc; ++arg) {
    runVLCTest(argv[arg], threads_per_block);
  }
  return 0;
}

/**
 * Runs one VLC (Huffman) encoding test.
 *
 * Steps, in order:
 *  1. initParams() / loadData() set up the problem size, the source symbols
 *     and the codeword tables.
 *  2. Host and device buffers are allocated and the inputs are copied to the
 *     device.
 *  3. cpu_vlc_encode() produces the reference encoding; its run time and
 *     output size are printed.
 *  4. vlc_encode_kernel_sm64huff is launched NT times on the device.
 *  5. Only when TESTING is defined: the per-block outputs are scanned and
 *     packed (prescanArray / pack2), copied back and compared against the CPU
 *     reference with compare_vectors().
 *  6. All host and device buffers are released.
 *
 * @param file_name          forwarded to initParams() and loadData(); main
 *                           passes NULL when no input file was given.
 * @param num_block_threads  x-dimension of the thread block used for the
 *                           encode kernel.
 * @param num_blocks         x-dimension of the grid used for the encode
 *                           kernel. It is also handed to initParams();
 *                           NOTE(review): whether initParams may modify it
 *                           depends on its signature, which is not visible
 *                           here -- confirm.
 *
 * If a host allocation fails, an error is printed to stderr and the function
 * returns without running the test.
 */
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) {
  printf("CUDA! Starting VLC Tests!\n");
  // Zero-initialised defensively; initParams() is the only call between these
  // declarations and their first use, so it is expected to set them.
  unsigned int num_elements = 0; // uint num_elements = num_blocks * num_block_threads;
  unsigned int mem_size = 0;     // uint mem_size = num_elements * sizeof(int);
  unsigned int symbol_type_size = sizeof(int);
  double H = 0.0; // entropy; handed to loadData()

  //////// INIT PARAMETERS ///////////////
  initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size,
             symbol_type_size);
  printf("Parameters: num_elements: %d, num_blocks: %d, num_block_threads: "
         "%d\n----------------------------\n",
         num_elements, num_blocks, num_block_threads);

  // Byte sizes shared by the host and device allocations below.
  const size_t table_bytes = NUM_SYMBOLS * symbol_type_size;
  const size_t index_bytes = num_blocks * sizeof(unsigned int);

  //////// HOST BUFFERS ///////////////
  uint *sourceData = (uint *)malloc(mem_size);
  uint *destData = (uint *)malloc(mem_size);
  uint *crefData = (uint *)malloc(mem_size);

  uint *codewords = (uint *)malloc(table_bytes);
  uint *codewordlens = (uint *)malloc(table_bytes);

  uint *cw32 = (uint *)malloc(mem_size);
  uint *cw32len = (uint *)malloc(mem_size);
  uint *cw32idx = (uint *)malloc(mem_size);

  uint *cindex2 = (uint *)malloc(index_bytes);

  // Bail out before touching any buffer if an allocation failed; free(NULL)
  // is a no-op, so every pointer can be released unconditionally.
  if (!sourceData || !destData || !crefData || !codewords || !codewordlens ||
      !cw32 || !cw32len || !cw32idx || !cindex2) {
    fprintf(stderr, "runVLCTest: host memory allocation failed\n");
    free(sourceData);
    free(destData);
    free(crefData);
    free(codewords);
    free(codewordlens);
    free(cw32);
    free(cw32len);
    free(cw32idx);
    free(cindex2);
    return;
  }

  memset(sourceData, 0, mem_size);
  memset(destData, 0, mem_size);
  memset(crefData, 0, mem_size);
  memset(cw32, 0, mem_size);
  memset(cw32len, 0, mem_size);
  memset(cw32idx, 0, mem_size);
  memset(codewords, 0, table_bytes);
  memset(codewordlens, 0, table_bytes);
  memset(cindex2, 0, index_bytes);

  //////// LOAD DATA ///////////////
  loadData(file_name, sourceData, codewords, codewordlens, num_elements,
           mem_size, H);

  //////// DEVICE BUFFERS ///////////////
  unsigned int *d_sourceData, *d_destData, *d_destDataPacked;
  unsigned int *d_codewords, *d_codewordlens;
  unsigned int *d_cw32, *d_cw32len, *d_cw32idx, *d_cindex, *d_cindex2;

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size));

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_codewords, table_bytes));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_codewordlens, table_bytes));

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size));

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cindex, index_bytes));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cindex2, index_bytes));
  // printf("source data\n");
  // for (int i = 0; i < 200; i++) {
  //   printf("%d ", sourceData[i]);
  // }
  // printf("\n");
  // printf("codewords\n");
  // for (int i = 0; i < 200; i++) {
  //   printf("%d ", codewords[i]);
  // }
  // printf("\n");
  // printf("codeword lens\n");
  // for (int i = 0; i < 200; i++) {
  //   printf("%d ", codewordlens[i]);
  // }
  // printf("\n");
  // return;

  // Upload the inputs; d_destData is seeded with the zeroed host buffer.
  CUDA_SAFE_CALL(
      cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_codewords, codewords, table_bytes,
                            cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens, table_bytes,
                            cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(
      cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice));

  dim3 grid_size(num_blocks, 1, 1);
  dim3 block_size(num_block_threads, 1, 1);
  unsigned int sm_size;

  unsigned int NT = 10; // number of runs for each execution time

  // ---------------- CPU ENCODER ----------------
  unsigned int refbytesize;
  long long timer = get_time();
  cpu_vlc_encode((unsigned int *)sourceData, num_elements,
                 (unsigned int *)crefData, &refbytesize, codewords,
                 codewordlens);
  // get_time() is in microseconds; divide by 1000.0 for milliseconds.
  float msec = (float)((get_time() - timer) / 1000.0);
  printf("CPU Encoding time (CPU): %f (ms)\n", msec);
  printf("CPU Encoded to %d [B]\n", refbytesize);
  // Reference size in 32-bit words, rounded up; used by the TESTING compare.
  unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 0 : 1);
  // ---------------- END CPU ----------------

  // ---------------- SM64HUFF KERNEL ----------------
  grid_size.x = num_blocks;
  block_size.x = num_block_threads;
  // NOTE(review): sm_size is computed here but is not passed as the launch's
  // dynamic shared-memory argument below -- confirm the kernel does not
  // depend on it.
  sm_size = block_size.x * sizeof(unsigned int);
#ifdef CACHECWLUT
  sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int);
#endif

  for (unsigned int i = 0; i < NT; i++) {
    vlc_encode_kernel_sm64huff<<<grid_size, block_size>>>(
        d_sourceData, d_codewords, d_codewordlens,
#ifdef TESTING
        d_cw32, d_cw32len, d_cw32idx,
#endif
        d_destData, d_cindex); // testedOK2
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
    // replacement, and checking its result surfaces kernel execution errors.
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
  }
  // ---------------- END KERNEL ----------------

#ifdef TESTING
  unsigned int num_scan_elements = grid_size.x;
  preallocBlockSums(num_scan_elements);
  CUDA_SAFE_CALL(cudaMemset(d_destDataPacked, 0, mem_size));
  printf("Num_blocks to be passed to scan is %d.\n", num_scan_elements);
  prescanArray(d_cindex2, d_cindex, num_scan_elements);
  // NOTE(review): the grid size below is num_scan_elements / 32 (integer
  // division), which is 0 when there are fewer than 32 blocks -- confirm the
  // supported range of num_blocks.
  pack2<<<num_scan_elements / 32, 32>>>(
      (unsigned int *)d_destData, d_cindex, d_cindex2,
      (unsigned int *)d_destDataPacked, num_elements / num_scan_elements);
  CUDA_SAFE_CALL(cudaDeviceSynchronize());
  CUT_CHECK_ERROR("Pack2 Kernel execution failed\n");
  deallocBlockSums();
  // return;

  CUDA_SAFE_CALL(
      cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost));
  compare_vectors((unsigned int *)crefData, (unsigned int *)destData, num_ints);
#endif

  // ---------------- CLEANUP ----------------
  free(sourceData);
  free(destData);
  free(codewords);
  free(codewordlens);
  free(cw32);
  free(cw32len);
  free(cw32idx); // previously leaked: allocated above but never released
  free(crefData);
  CUDA_SAFE_CALL(cudaFree(d_sourceData));
  CUDA_SAFE_CALL(cudaFree(d_destData));
  CUDA_SAFE_CALL(cudaFree(d_destDataPacked));
  CUDA_SAFE_CALL(cudaFree(d_codewords));
  CUDA_SAFE_CALL(cudaFree(d_codewordlens));
  CUDA_SAFE_CALL(cudaFree(d_cw32));
  CUDA_SAFE_CALL(cudaFree(d_cw32len));
  CUDA_SAFE_CALL(cudaFree(d_cw32idx));
  CUDA_SAFE_CALL(cudaFree(d_cindex));
  CUDA_SAFE_CALL(cudaFree(d_cindex2));
  free(cindex2);
}
add codebase for TACO submission 2022-05-04 20:59:38 +08:00			`/*`
			`* PAVLE - Parallel Variable-Length Encoder for CUDA. Main file.`
			`*`
			`* Copyright (C) 2009 Ana Balevic <ana.balevic@gmail.com>`
			`* All rights reserved.`
			`*`
			`* This program is free software; you can redistribute it and/or modify it under`
			`* the terms of the MIT License. Read the full licence:`
			`* http://www.opensource.org/licenses/mit-license.php`
			`*`
			`* If you find this program useful, please contact me and reference PAVLE home`
			`* page in your work.`
			`*`
			`*/`

			`#include "comparison_helpers.h"`
			`#include "cuda_helpers.h"`
			`#include "load_data.h"`
			`#include "print_helpers.h"`
			`#include "stats_logger.h"`
			`#include "stdafx.h"`
			`#include <cuda_runtime.h>`
			`#include <sys/time.h>`

			`//#include "vlc_kernel_gm32.cu"`
			`//#include "vlc_kernel_sm32.cu"`
			`#include "vlc_kernel_sm64huff.cu"`
			`//#include "vlc_kernel_dpt.cu"`
			`//#include "vlc_kernel_dptt.cu"`
			`//#include "scan_kernel.cu"`
			`#include "cpuencode.h"`
			`#include "pack_kernels.cu"`
			`#include "scan.cu"`

			`long long get_time() {`
			`struct timeval tv;`
			`gettimeofday(&tv, NULL);`
			`return (tv.tv_sec * 1000000) + tv.tv_usec;`
			`}`
			`void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1);`

			`extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,`
			`unsigned int outdata, unsigned int outsize,`
			`unsigned int *codewords,`
			`unsigned int *codewordlens);`

			`int main(int argc, char *argv[]) {`
			`if (!InitCUDA()) {`
			`return 0;`
			`}`
			`unsigned int num_block_threads = 256;`
			`if (argc > 1)`
			`for (int i = 1; i < argc; i++)`
			`runVLCTest(argv[i], num_block_threads);`
			`else {`
			`runVLCTest(NULL, num_block_threads, 1024);`
			`}`
			`return 0;`
			`}`

			`void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) {`
			`printf("CUDA! Starting VLC Tests!\n");`
			`unsigned int`
			`num_elements; // uint num_elements = num_blocks * num_block_threads;`
			`unsigned int mem_size; // uint mem_size = num_elements * sizeof(int);`
			`unsigned int symbol_type_size = sizeof(int);`
			`//////// LOAD DATA ///////////////`
			`double H; // entropy`
			`initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size,`
			`symbol_type_size);`
			`printf("Parameters: num_elements: %d, num_blocks: %d, num_block_threads: "`
			`"%d\n----------------------------\n",`
			`num_elements, num_blocks, num_block_threads);`
			`////////LOAD DATA ///////////////`
			`uint sourceData = (uint )malloc(mem_size);`
			`uint destData = (uint )malloc(mem_size);`
			`uint crefData = (uint )malloc(mem_size);`

			`uint codewords = (uint )malloc(NUM_SYMBOLS * symbol_type_size);`
			`uint codewordlens = (uint )malloc(NUM_SYMBOLS * symbol_type_size);`

			`uint cw32 = (uint )malloc(mem_size);`
			`uint cw32len = (uint )malloc(mem_size);`
			`uint cw32idx = (uint )malloc(mem_size);`

			`uint cindex2 = (uint )malloc(num_blocks * sizeof(int));`

			`memset(sourceData, 0, mem_size);`
			`memset(destData, 0, mem_size);`
			`memset(crefData, 0, mem_size);`
			`memset(cw32, 0, mem_size);`
			`memset(cw32len, 0, mem_size);`
			`memset(cw32idx, 0, mem_size);`
			`memset(codewords, 0, NUM_SYMBOLS * symbol_type_size);`
			`memset(codewordlens, 0, NUM_SYMBOLS * symbol_type_size);`
			`memset(cindex2, 0, num_blocks * sizeof(int));`
			`//////// LOAD DATA ///////////////`
			`loadData(file_name, sourceData, codewords, codewordlens, num_elements,`
			`mem_size, H);`

			`//////// LOAD DATA ///////////////`

			`unsigned int d_sourceData, d_destData, *d_destDataPacked;`
			`unsigned int d_codewords, d_codewordlens;`
			`unsigned int d_cw32, d_cw32len, d_cw32idx, d_cindex, *d_cindex2;`

			`CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size));`
			`CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size));`
			`CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size));`

			`CUDA_SAFE_CALL(`
			`cudaMalloc((void *)&d_codewords, NUM_SYMBOLS symbol_type_size));`
			`CUDA_SAFE_CALL(`
			`cudaMalloc((void *)&d_codewordlens, NUM_SYMBOLS symbol_type_size));`

			`CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size));`
			`CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size));`
			`CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size));`

			`CUDA_SAFE_CALL(`
			`cudaMalloc((void *)&d_cindex, num_blocks sizeof(unsigned int)));`
			`CUDA_SAFE_CALL(`
			`cudaMalloc((void *)&d_cindex2, num_blocks sizeof(unsigned int)));`
			`// printf("source data\n");`
			`// for (int i = 0; i < 200; i++) {`
			`// printf("%d ", sourceData[i]);`
			`// }`
			`// printf("\n");`
			`// printf("codewords\n");`
			`// for (int i = 0; i < 200; i++) {`
			`// printf("%d ", codewords[i]);`
			`// }`
			`// printf("\n");`
			`// printf("codeword lens\n");`
			`// for (int i = 0; i < 200; i++) {`
			`// printf("%d ", codewordlens[i]);`
			`// }`
			`// printf("\n");`
			`// return;`
			`CUDA_SAFE_CALL(`
			`cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice));`
			`CUDA_SAFE_CALL(cudaMemcpy(d_codewords, codewords,`
			`NUM_SYMBOLS * symbol_type_size,`
			`cudaMemcpyHostToDevice));`
			`CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens,`
			`NUM_SYMBOLS * symbol_type_size,`
			`cudaMemcpyHostToDevice));`
			`CUDA_SAFE_CALL(`
			`cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice));`

			`dim3 grid_size(num_blocks, 1, 1);`
			`dim3 block_size(num_block_threads, 1, 1);`
			`unsigned int sm_size;`

			`unsigned int NT = 10; // number of runs for each execution time`

			`//////////////////* CPU ENCODER *///////////////////////////////////`
			`unsigned int refbytesize;`
			`long long timer = get_time();`
			`cpu_vlc_encode((unsigned int *)sourceData, num_elements,`
			`(unsigned int *)crefData, &refbytesize, codewords,`
			`codewordlens);`
			`float msec = (float)((get_time() - timer) / 1000.0);`
			`printf("CPU Encoding time (CPU): %f (ms)\n", msec);`
			`printf("CPU Encoded to %d [B]\n", refbytesize);`
			`unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 0 : 1);`
			`//////////////////* END CPU *///////////////////////////////////`

			`//////////////////* SM64HUFF KERNEL *///////////////////////////////////`
			`grid_size.x = num_blocks;`
			`block_size.x = num_block_threads;`
			`sm_size = block_size.x * sizeof(unsigned int);`
			`#ifdef CACHECWLUT`
			`sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int);`
			`#endif`

			`for (int i = 0; i < NT; i++) {`
			`vlc_encode_kernel_sm64huff<<<grid_size, block_size>>>(`
			`d_sourceData, d_codewords, d_codewordlens,`
			`#ifdef TESTING`
			`d_cw32, d_cw32len, d_cw32idx,`
			`#endif`
			`d_destData, d_cindex); // testedOK2`
			`cudaThreadSynchronize();`
			`}`
			`// //////////////////* END KERNEL *///////////////////////////////////`

			`#ifdef TESTING`
			`unsigned int num_scan_elements = grid_size.x;`
			`preallocBlockSums(num_scan_elements);`
			`cudaMemset(d_destDataPacked, 0, mem_size);`
			`printf("Num_blocks to be passed to scan is %d.\n", num_scan_elements);`
			`prescanArray(d_cindex2, d_cindex, num_scan_elements);`
			`pack2<<<num_scan_elements / 32, 32>>>(`
			`(unsigned int *)d_destData, d_cindex, d_cindex2,`
			`(unsigned int *)d_destDataPacked, num_elements / num_scan_elements);`
			`cudaThreadSynchronize();`
			`CUT_CHECK_ERROR("Pack2 Kernel execution failed\n");`
			`deallocBlockSums();`
			`// return;`

			`CUDA_SAFE_CALL(`
			`cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost));`
			`compare_vectors((unsigned int )crefData, (unsigned int )destData, num_ints);`
			`#endif`

			`free(sourceData);`
			`free(destData);`
			`free(codewords);`
			`free(codewordlens);`
			`free(cw32);`
			`free(cw32len);`
			`free(crefData);`
			`CUDA_SAFE_CALL(cudaFree(d_sourceData));`
			`CUDA_SAFE_CALL(cudaFree(d_destData));`
			`CUDA_SAFE_CALL(cudaFree(d_destDataPacked));`
			`CUDA_SAFE_CALL(cudaFree(d_codewords));`
			`CUDA_SAFE_CALL(cudaFree(d_codewordlens));`
			`CUDA_SAFE_CALL(cudaFree(d_cw32));`
			`CUDA_SAFE_CALL(cudaFree(d_cw32len));`
			`CUDA_SAFE_CALL(cudaFree(d_cw32idx));`
			`CUDA_SAFE_CALL(cudaFree(d_cindex));`
			`CUDA_SAFE_CALL(cudaFree(d_cindex2));`
			`free(cindex2);`
			`}`